{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 2.537714958190918, "learning_rate": 1.0000000000000002e-06, "loss": 0.5468, "loss/crossentropy": 2.2066214084625244, "loss/hidden": 0.248046875, "loss/logits": 0.03443578630685806, "loss/reg": 0.026429571211338043, "step": 1 }, { "epoch": 0.00025, "grad_norm": 2.4728448390960693, "learning_rate": 2.0000000000000003e-06, "loss": 0.6642, "loss/crossentropy": 2.132329225540161, "loss/hidden": 0.345703125, "loss/logits": 0.05424630641937256, "loss/reg": 0.026429571211338043, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.773984670639038, "learning_rate": 3e-06, "loss": 0.5822, "loss/crossentropy": 2.3457791805267334, "loss/hidden": 0.2734375, "loss/logits": 0.044443465769290924, "loss/reg": 0.02642953023314476, "step": 3 }, { "epoch": 0.0005, "grad_norm": 4.14040470123291, "learning_rate": 4.000000000000001e-06, "loss": 0.7192, "loss/crossentropy": 2.7209200859069824, "loss/hidden": 0.35546875, "loss/logits": 0.09940779209136963, "loss/reg": 0.026429466903209686, "step": 4 }, { "epoch": 0.000625, "grad_norm": 1.9164764881134033, "learning_rate": 5e-06, "loss": 0.5467, "loss/crossentropy": 2.4304752349853516, "loss/hidden": 0.244140625, "loss/logits": 0.03826362267136574, "loss/reg": 0.02642936445772648, "step": 5 }, { "epoch": 0.00075, "grad_norm": 1.9878246784210205, "learning_rate": 6e-06, "loss": 0.517, "loss/crossentropy": 2.472181797027588, "loss/hidden": 0.2255859375, "loss/logits": 0.027161670848727226, "loss/reg": 0.02642924338579178, "step": 6 }, { "epoch": 0.000875, "grad_norm": 2.1939733028411865, "learning_rate": 7.000000000000001e-06, "loss": 0.6043, "loss/crossentropy": 2.241501808166504, "loss/hidden": 0.298828125, "loss/logits": 0.04118040204048157, "loss/reg": 0.02642909064888954, "step": 7 }, { "epoch": 0.001, "grad_norm": 3.516223907470703, "learning_rate": 8.000000000000001e-06, "loss": 0.5199, "loss/crossentropy": 2.409766912460327, "loss/hidden": 0.2236328125, "loss/logits": 0.032000549137592316, "loss/reg": 0.02642889879643917, "step": 8 }, { "epoch": 0.001125, "grad_norm": 1.9335486888885498, "learning_rate": 9e-06, "loss": 0.5575, "loss/crossentropy": 2.6256861686706543, "loss/hidden": 0.255859375, "loss/logits": 0.037392452359199524, "loss/reg": 0.02642873302102089, "step": 9 }, { "epoch": 0.00125, "grad_norm": 1.6782876253128052, "learning_rate": 1e-05, "loss": 0.5162, "loss/crossentropy": 2.1947107315063477, "loss/hidden": 0.2255859375, "loss/logits": 0.026354767382144928, "loss/reg": 0.026428483426570892, "step": 10 }, { "epoch": 0.001375, "grad_norm": 10.848552703857422, "learning_rate": 1.1000000000000001e-05, "loss": 1.0046, "loss/crossentropy": 3.0539069175720215, "loss/hidden": 0.640625, "loss/logits": 0.09970991313457489, "loss/reg": 0.026428230106830597, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.237061023712158, "learning_rate": 1.2e-05, "loss": 0.563, "loss/crossentropy": 2.5601325035095215, "loss/hidden": 0.248046875, "loss/logits": 0.050660137087106705, "loss/reg": 0.02642793208360672, "step": 12 }, { "epoch": 0.001625, "grad_norm": 1.4406346082687378, "learning_rate": 1.3000000000000001e-05, "loss": 0.507, "loss/crossentropy": 1.965380311012268, "loss/hidden": 0.2197265625, "loss/logits": 0.02295786701142788, "loss/reg": 0.02642756886780262, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.0757036209106445, "learning_rate": 1.4000000000000001e-05, "loss": 0.7761, "loss/crossentropy": 2.15138840675354, "loss/hidden": 0.4375, "loss/logits": 0.07431840896606445, "loss/reg": 0.026427194476127625, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.8731143474578857, "learning_rate": 1.5e-05, "loss": 0.4684, "loss/crossentropy": 2.5530812740325928, "loss/hidden": 0.1845703125, "loss/logits": 0.019558344036340714, "loss/reg": 0.026426764205098152, "step": 15 }, { "epoch": 0.002, "grad_norm": 2.5288755893707275, "grad_norm_var": 4.846526347105633, "learning_rate": 1.6000000000000003e-05, "loss": 0.5781, "loss/crossentropy": 2.5096747875213623, "loss/hidden": 0.275390625, "loss/logits": 0.0384209081530571, "loss/reg": 0.026426298543810844, "step": 16 }, { "epoch": 0.002125, "grad_norm": 2.0281474590301514, "grad_norm_var": 4.89482291121508, "learning_rate": 1.7000000000000003e-05, "loss": 0.5318, "loss/crossentropy": 2.396097421646118, "loss/hidden": 0.232421875, "loss/logits": 0.03516196087002754, "loss/reg": 0.026425909250974655, "step": 17 }, { "epoch": 0.00225, "grad_norm": 2.4487411975860596, "grad_norm_var": 4.896482229626747, "learning_rate": 1.8e-05, "loss": 0.5984, "loss/crossentropy": 2.3916616439819336, "loss/hidden": 0.28125, "loss/logits": 0.052923329174518585, "loss/reg": 0.026425503194332123, "step": 18 }, { "epoch": 0.002375, "grad_norm": 1.986022710800171, "grad_norm_var": 4.956548008938709, "learning_rate": 1.9e-05, "loss": 0.5504, "loss/crossentropy": 2.4791200160980225, "loss/hidden": 0.2421875, "loss/logits": 0.04391499236226082, "loss/reg": 0.026425078511238098, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.0934784412384033, "grad_norm_var": 4.887277710992484, "learning_rate": 2e-05, "loss": 0.5369, "loss/crossentropy": 2.1741297245025635, "loss/hidden": 0.2392578125, "loss/logits": 0.03335873782634735, "loss/reg": 0.026424556970596313, "step": 20 }, { "epoch": 0.002625, "grad_norm": 1.9445254802703857, "grad_norm_var": 4.884025740026245, "learning_rate": 2.1e-05, "loss": 0.4865, "loss/crossentropy": 2.45112943649292, "loss/hidden": 0.1962890625, "loss/logits": 0.025960583239793777, "loss/reg": 0.02642405778169632, "step": 21 }, { "epoch": 0.00275, "grad_norm": 3.070704221725464, "grad_norm_var": 4.8399171328504895, "learning_rate": 2.2000000000000003e-05, "loss": 0.5887, "loss/crossentropy": 2.1550512313842773, "loss/hidden": 0.275390625, "loss/logits": 0.049097511917352676, "loss/reg": 0.02642347477376461, "step": 22 }, { "epoch": 0.002875, "grad_norm": 2.2452821731567383, "grad_norm_var": 4.835466428034186, "learning_rate": 2.3000000000000003e-05, "loss": 0.5482, "loss/crossentropy": 2.1640255451202393, "loss/hidden": 0.2470703125, "loss/logits": 0.036868080496788025, "loss/reg": 0.026422815397381783, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.032148838043213, "grad_norm_var": 4.84560617678243, "learning_rate": 2.4e-05, "loss": 0.5756, "loss/crossentropy": 2.323482036590576, "loss/hidden": 0.271484375, "loss/logits": 0.03993295133113861, "loss/reg": 0.02642211876809597, "step": 24 }, { "epoch": 0.003125, "grad_norm": 1.763465404510498, "grad_norm_var": 4.8665883230545335, "learning_rate": 2.5e-05, "loss": 0.5139, "loss/crossentropy": 2.33661150932312, "loss/hidden": 0.22265625, "loss/logits": 0.02701444923877716, "loss/reg": 0.026421383023262024, "step": 25 }, { "epoch": 0.00325, "grad_norm": 1.7001625299453735, "grad_norm_var": 4.863438686484135, "learning_rate": 2.6000000000000002e-05, "loss": 0.5696, "loss/crossentropy": 2.2305383682250977, "loss/hidden": 0.275390625, "loss/logits": 0.02997000887989998, "loss/reg": 0.026420695707201958, "step": 26 }, { "epoch": 0.003375, "grad_norm": 3.474130392074585, "grad_norm_var": 0.318824614624526, "learning_rate": 2.7000000000000002e-05, "loss": 0.5456, "loss/crossentropy": 2.1680750846862793, "loss/hidden": 0.2451171875, "loss/logits": 0.036261945962905884, "loss/reg": 0.0264199897646904, "step": 27 }, { "epoch": 0.0035, "grad_norm": 3.8201987743377686, "grad_norm_var": 0.46030846745106163, "learning_rate": 2.8000000000000003e-05, "loss": 0.5181, "loss/crossentropy": 2.418672800064087, "loss/hidden": 0.224609375, "loss/logits": 0.029263213276863098, "loss/reg": 0.02641921117901802, "step": 28 }, { "epoch": 0.003625, "grad_norm": 1.9781090021133423, "grad_norm_var": 0.40905077024170067, "learning_rate": 2.9e-05, "loss": 0.5379, "loss/crossentropy": 2.390868663787842, "loss/hidden": 0.2392578125, "loss/logits": 0.034465983510017395, "loss/reg": 0.026418352499604225, "step": 29 }, { "epoch": 0.00375, "grad_norm": 1.6551319360733032, "grad_norm_var": 0.41503895204729413, "learning_rate": 3e-05, "loss": 0.496, "loss/crossentropy": 2.5960400104522705, "loss/hidden": 0.205078125, "loss/logits": 0.02678578905761242, "loss/reg": 0.026417305693030357, "step": 30 }, { "epoch": 0.003875, "grad_norm": 1.7136921882629395, "grad_norm_var": 0.4185952392532807, "learning_rate": 3.1e-05, "loss": 0.5235, "loss/crossentropy": 2.349839687347412, "loss/hidden": 0.2275390625, "loss/logits": 0.031792763620615005, "loss/reg": 0.026416433975100517, "step": 31 }, { "epoch": 0.004, "grad_norm": 1.9992157220840454, "grad_norm_var": 0.41856547198587274, "learning_rate": 3.2000000000000005e-05, "loss": 0.4928, "loss/crossentropy": 2.3164803981781006, "loss/hidden": 0.2041015625, "loss/logits": 0.024579893797636032, "loss/reg": 0.02641524001955986, "step": 32 }, { "epoch": 0.004125, "grad_norm": 2.705052614212036, "grad_norm_var": 0.42744416353313663, "learning_rate": 3.3e-05, "loss": 0.5732, "loss/crossentropy": 2.42107892036438, "loss/hidden": 0.275390625, "loss/logits": 0.03370767831802368, "loss/reg": 0.02641397900879383, "step": 33 }, { "epoch": 0.00425, "grad_norm": 1.8898464441299438, "grad_norm_var": 0.4350913020843951, "learning_rate": 3.4000000000000007e-05, "loss": 0.5531, "loss/crossentropy": 2.4147770404815674, "loss/hidden": 0.25390625, "loss/logits": 0.03504405915737152, "loss/reg": 0.026412710547447205, "step": 34 }, { "epoch": 0.004375, "grad_norm": 4.9570159912109375, "grad_norm_var": 0.8804344329355491, "learning_rate": 3.5e-05, "loss": 0.6763, "loss/crossentropy": 1.6753497123718262, "loss/hidden": 0.376953125, "loss/logits": 0.03519564867019653, "loss/reg": 0.0264116358011961, "step": 35 }, { "epoch": 0.0045, "grad_norm": 4.928956508636475, "grad_norm_var": 1.2518721453244992, "learning_rate": 3.6e-05, "loss": 0.7329, "loss/crossentropy": 2.6104867458343506, "loss/hidden": 0.400390625, "loss/logits": 0.06845290958881378, "loss/reg": 0.02641039527952671, "step": 36 }, { "epoch": 0.004625, "grad_norm": 7.503647327423096, "grad_norm_var": 2.684651641752033, "learning_rate": 3.7e-05, "loss": 0.6258, "loss/crossentropy": 2.2158656120300293, "loss/hidden": 0.318359375, "loss/logits": 0.043342188000679016, "loss/reg": 0.026409219950437546, "step": 37 }, { "epoch": 0.00475, "grad_norm": 2.6838622093200684, "grad_norm_var": 2.6885420074665602, "learning_rate": 3.8e-05, "loss": 0.5939, "loss/crossentropy": 2.344879627227783, "loss/hidden": 0.28515625, "loss/logits": 0.04461552947759628, "loss/reg": 0.026408080011606216, "step": 38 }, { "epoch": 0.004875, "grad_norm": 3.357893705368042, "grad_norm_var": 2.662758933855309, "learning_rate": 3.9000000000000006e-05, "loss": 0.5729, "loss/crossentropy": 2.6759543418884277, "loss/hidden": 0.275390625, "loss/logits": 0.033413954079151154, "loss/reg": 0.026406895369291306, "step": 39 }, { "epoch": 0.005, "grad_norm": 3.0177316665649414, "grad_norm_var": 2.5949485604856193, "learning_rate": 4e-05, "loss": 0.7498, "loss/crossentropy": 2.2261273860931396, "loss/hidden": 0.408203125, "loss/logits": 0.07758316397666931, "loss/reg": 0.026405224576592445, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.1196699142456055, "grad_norm_var": 2.54074274703229, "learning_rate": 4.1e-05, "loss": 0.6396, "loss/crossentropy": 2.193378448486328, "loss/hidden": 0.30859375, "loss/logits": 0.06692355871200562, "loss/reg": 0.026403924450278282, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.456051826477051, "grad_norm_var": 2.435973046683167, "learning_rate": 4.2e-05, "loss": 0.5571, "loss/crossentropy": 1.9526888132095337, "loss/hidden": 0.26171875, "loss/logits": 0.03133418411016464, "loss/reg": 0.026402529329061508, "step": 42 }, { "epoch": 0.005375, "grad_norm": 2.257375717163086, "grad_norm_var": 2.474501380785125, "learning_rate": 4.3e-05, "loss": 0.5544, "loss/crossentropy": 2.3284847736358643, "loss/hidden": 0.25390625, "loss/logits": 0.03650724142789841, "loss/reg": 0.026400938630104065, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.9145264625549316, "grad_norm_var": 2.4345975605903694, "learning_rate": 4.4000000000000006e-05, "loss": 0.5175, "loss/crossentropy": 2.295241594314575, "loss/hidden": 0.220703125, "loss/logits": 0.03284794092178345, "loss/reg": 0.026399515569210052, "step": 44 }, { "epoch": 0.005625, "grad_norm": 3.1294264793395996, "grad_norm_var": 2.3592519473156615, "learning_rate": 4.5e-05, "loss": 0.5567, "loss/crossentropy": 2.660597085952759, "loss/hidden": 0.255859375, "loss/logits": 0.03686758130788803, "loss/reg": 0.026398126035928726, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.197265863418579, "grad_norm_var": 2.2745842657817748, "learning_rate": 4.600000000000001e-05, "loss": 0.5512, "loss/crossentropy": 2.3832643032073975, "loss/hidden": 0.2490234375, "loss/logits": 0.038256023079156876, "loss/reg": 0.02639671601355076, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.883378744125366, "grad_norm_var": 2.141634704665381, "learning_rate": 4.7e-05, "loss": 0.5298, "loss/crossentropy": 2.6035244464874268, "loss/hidden": 0.2333984375, "loss/logits": 0.03240815922617912, "loss/reg": 0.026395246386528015, "step": 47 }, { "epoch": 0.006, "grad_norm": 3.1519744396209717, "grad_norm_var": 2.0420385103816727, "learning_rate": 4.8e-05, "loss": 0.5385, "loss/crossentropy": 2.250037908554077, "loss/hidden": 0.244140625, "loss/logits": 0.03043752908706665, "loss/reg": 0.026393571868538857, "step": 48 }, { "epoch": 0.006125, "grad_norm": 3.187680244445801, "grad_norm_var": 2.0209109756516943, "learning_rate": 4.9e-05, "loss": 0.5614, "loss/crossentropy": 2.366483688354492, "loss/hidden": 0.263671875, "loss/logits": 0.033859170973300934, "loss/reg": 0.02639181725680828, "step": 49 }, { "epoch": 0.00625, "grad_norm": 2.3717658519744873, "grad_norm_var": 1.9454730589909708, "learning_rate": 5e-05, "loss": 0.5865, "loss/crossentropy": 2.007732391357422, "loss/hidden": 0.2890625, "loss/logits": 0.0335388109087944, "loss/reg": 0.02638987824320793, "step": 50 }, { "epoch": 0.006375, "grad_norm": 3.658735990524292, "grad_norm_var": 1.767425501826429, "learning_rate": 5.1000000000000006e-05, "loss": 0.5028, "loss/crossentropy": 2.511072874069214, "loss/hidden": 0.2099609375, "loss/logits": 0.02893088385462761, "loss/reg": 0.026387827470898628, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.5912654399871826, "grad_norm_var": 1.582150273328572, "learning_rate": 5.2000000000000004e-05, "loss": 0.5619, "loss/crossentropy": 2.3280093669891357, "loss/hidden": 0.263671875, "loss/logits": 0.03436018154025078, "loss/reg": 0.026385735720396042, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.0419421195983887, "grad_norm_var": 0.23432357463535497, "learning_rate": 5.300000000000001e-05, "loss": 0.5674, "loss/crossentropy": 2.3851194381713867, "loss/hidden": 0.263671875, "loss/logits": 0.039869021624326706, "loss/reg": 0.026383817195892334, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.4164810180664062, "grad_norm_var": 0.24119551692934707, "learning_rate": 5.4000000000000005e-05, "loss": 0.6087, "loss/crossentropy": 2.6006996631622314, "loss/hidden": 0.296875, "loss/logits": 0.04797635227441788, "loss/reg": 0.026381801813840866, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.697831153869629, "grad_norm_var": 0.2135682431387058, "learning_rate": 5.500000000000001e-05, "loss": 0.523, "loss/crossentropy": 2.472208261489868, "loss/hidden": 0.2275390625, "loss/logits": 0.031705208122730255, "loss/reg": 0.026379752904176712, "step": 55 }, { "epoch": 0.007, "grad_norm": 4.182509422302246, "grad_norm_var": 0.34874494246430365, "learning_rate": 5.6000000000000006e-05, "loss": 0.6766, "loss/crossentropy": 2.693652868270874, "loss/hidden": 0.3671875, "loss/logits": 0.04566050320863724, "loss/reg": 0.02637762948870659, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.231238842010498, "grad_norm_var": 0.33990645656106155, "learning_rate": 5.6999999999999996e-05, "loss": 0.5811, "loss/crossentropy": 2.4935543537139893, "loss/hidden": 0.28125, "loss/logits": 0.03613065183162689, "loss/reg": 0.026375366374850273, "step": 57 }, { "epoch": 0.00725, "grad_norm": 2.0192184448242188, "grad_norm_var": 0.37029866859904437, "learning_rate": 5.8e-05, "loss": 0.5285, "loss/crossentropy": 2.192227840423584, "loss/hidden": 0.2294921875, "loss/logits": 0.03531934320926666, "loss/reg": 0.02637314423918724, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.3108532428741455, "grad_norm_var": 0.36699486123436575, "learning_rate": 5.9e-05, "loss": 0.507, "loss/crossentropy": 2.39101243019104, "loss/hidden": 0.2177734375, "loss/logits": 0.025539016351103783, "loss/reg": 0.02637065388262272, "step": 59 }, { "epoch": 0.0075, "grad_norm": 2.049551486968994, "grad_norm_var": 0.3946811437011318, "learning_rate": 6e-05, "loss": 0.5351, "loss/crossentropy": 2.7062017917633057, "loss/hidden": 0.2353515625, "loss/logits": 0.03605186939239502, "loss/reg": 0.026368385180830956, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.6327223777770996, "grad_norm_var": 0.38133460463904284, "learning_rate": 6.1e-05, "loss": 0.5344, "loss/crossentropy": 2.0810956954956055, "loss/hidden": 0.23828125, "loss/logits": 0.03246723860502243, "loss/reg": 0.02636607363820076, "step": 61 }, { "epoch": 0.00775, "grad_norm": 2.06585955619812, "grad_norm_var": 0.39059185941555535, "learning_rate": 6.2e-05, "loss": 0.5202, "loss/crossentropy": 2.6240835189819336, "loss/hidden": 0.224609375, "loss/logits": 0.03200242295861244, "loss/reg": 0.026363445445895195, "step": 62 }, { "epoch": 0.007875, "grad_norm": 2.109790563583374, "grad_norm_var": 0.40452198957435875, "learning_rate": 6.3e-05, "loss": 0.5249, "loss/crossentropy": 2.587536573410034, "loss/hidden": 0.2294921875, "loss/logits": 0.03178905323147774, "loss/reg": 0.02636083774268627, "step": 63 }, { "epoch": 0.008, "grad_norm": 3.818783760070801, "grad_norm_var": 0.48072296241430573, "learning_rate": 6.400000000000001e-05, "loss": 0.6632, "loss/crossentropy": 2.011171817779541, "loss/hidden": 0.353515625, "loss/logits": 0.04613731801509857, "loss/reg": 0.02635800838470459, "step": 64 }, { "epoch": 0.008125, "grad_norm": 2.4136369228363037, "grad_norm_var": 0.46258887231494605, "learning_rate": 6.500000000000001e-05, "loss": 0.522, "loss/crossentropy": 2.600787401199341, "loss/hidden": 0.224609375, "loss/logits": 0.03384025767445564, "loss/reg": 0.026355121284723282, "step": 65 }, { "epoch": 0.00825, "grad_norm": 2.3908252716064453, "grad_norm_var": 0.46202963925557394, "learning_rate": 6.6e-05, "loss": 0.5859, "loss/crossentropy": 2.1056201457977295, "loss/hidden": 0.279296875, "loss/logits": 0.04309317469596863, "loss/reg": 0.02635251171886921, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.5190653800964355, "grad_norm_var": 0.38262308323354144, "learning_rate": 6.7e-05, "loss": 0.5391, "loss/crossentropy": 2.5527184009552, "loss/hidden": 0.2421875, "loss/logits": 0.0334152951836586, "loss/reg": 0.02634957991540432, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.500368595123291, "grad_norm_var": 0.3824057294099087, "learning_rate": 6.800000000000001e-05, "loss": 0.5958, "loss/crossentropy": 2.30499267578125, "loss/hidden": 0.28125, "loss/logits": 0.05108712613582611, "loss/reg": 0.026346800848841667, "step": 68 }, { "epoch": 0.008625, "grad_norm": 2.7905988693237305, "grad_norm_var": 0.3692126592154902, "learning_rate": 6.9e-05, "loss": 0.6049, "loss/crossentropy": 2.3807146549224854, "loss/hidden": 0.294921875, "loss/logits": 0.04648623988032341, "loss/reg": 0.026344334706664085, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.147470235824585, "grad_norm_var": 0.3793077808516782, "learning_rate": 7e-05, "loss": 0.5345, "loss/crossentropy": 2.627505302429199, "loss/hidden": 0.236328125, "loss/logits": 0.034769318997859955, "loss/reg": 0.026341637596488, "step": 70 }, { "epoch": 0.008875, "grad_norm": 2.6987268924713135, "grad_norm_var": 0.3793248871627156, "learning_rate": 7.1e-05, "loss": 0.5722, "loss/crossentropy": 2.382685899734497, "loss/hidden": 0.271484375, "loss/logits": 0.03735022246837616, "loss/reg": 0.02633870206773281, "step": 71 }, { "epoch": 0.009, "grad_norm": 3.085085153579712, "grad_norm_var": 0.2164648496489896, "learning_rate": 7.2e-05, "loss": 0.5882, "loss/crossentropy": 2.371429681777954, "loss/hidden": 0.27734375, "loss/logits": 0.04748620092868805, "loss/reg": 0.02633603662252426, "step": 72 }, { "epoch": 0.009125, "grad_norm": 4.158353328704834, "grad_norm_var": 0.3829897758196862, "learning_rate": 7.3e-05, "loss": 0.8663, "loss/crossentropy": 2.29622745513916, "loss/hidden": 0.5234375, "loss/logits": 0.07955377548933029, "loss/reg": 0.026333071291446686, "step": 73 }, { "epoch": 0.00925, "grad_norm": 2.111111879348755, "grad_norm_var": 0.37631661688178514, "learning_rate": 7.4e-05, "loss": 0.5468, "loss/crossentropy": 2.29744815826416, "loss/hidden": 0.24609375, "loss/logits": 0.037446070462465286, "loss/reg": 0.026330096647143364, "step": 74 }, { "epoch": 0.009375, "grad_norm": 2.545919179916382, "grad_norm_var": 0.37031037444480336, "learning_rate": 7.500000000000001e-05, "loss": 0.625, "loss/crossentropy": 2.4376375675201416, "loss/hidden": 0.306640625, "loss/logits": 0.05505819618701935, "loss/reg": 0.026326792314648628, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.362215042114258, "grad_norm_var": 0.35233204024674003, "learning_rate": 7.6e-05, "loss": 0.5747, "loss/crossentropy": 2.677924156188965, "loss/hidden": 0.267578125, "loss/logits": 0.04392882436513901, "loss/reg": 0.026323769241571426, "step": 76 }, { "epoch": 0.009625, "grad_norm": 3.135709762573242, "grad_norm_var": 0.3671929300455114, "learning_rate": 7.7e-05, "loss": 0.7113, "loss/crossentropy": 1.972798466682434, "loss/hidden": 0.384765625, "loss/logits": 0.06329117715358734, "loss/reg": 0.026320943608880043, "step": 77 }, { "epoch": 0.00975, "grad_norm": 4.418634414672852, "grad_norm_var": 0.5210260544686395, "learning_rate": 7.800000000000001e-05, "loss": 0.6851, "loss/crossentropy": 2.558809518814087, "loss/hidden": 0.357421875, "loss/logits": 0.06447892636060715, "loss/reg": 0.02631756290793419, "step": 78 }, { "epoch": 0.009875, "grad_norm": 3.9261293411254883, "grad_norm_var": 0.55391532710314, "learning_rate": 7.900000000000001e-05, "loss": 0.5968, "loss/crossentropy": 2.6102137565612793, "loss/hidden": 0.28515625, "loss/logits": 0.04846350848674774, "loss/reg": 0.026314500719308853, "step": 79 }, { "epoch": 0.01, "grad_norm": 3.1532020568847656, "grad_norm_var": 0.5035194586586452, "learning_rate": 8e-05, "loss": 0.7066, "loss/crossentropy": 2.36220121383667, "loss/hidden": 0.38671875, "loss/logits": 0.05672474205493927, "loss/reg": 0.026311254128813744, "step": 80 }, { "epoch": 0.010125, "grad_norm": 3.557161808013916, "grad_norm_var": 0.5115010248662256, "learning_rate": 8.1e-05, "loss": 0.6647, "loss/crossentropy": 2.412325859069824, "loss/hidden": 0.345703125, "loss/logits": 0.05596970394253731, "loss/reg": 0.026307715103030205, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.2158408164978027, "grad_norm_var": 0.5268993015208875, "learning_rate": 8.2e-05, "loss": 0.5575, "loss/crossentropy": 2.3789050579071045, "loss/hidden": 0.255859375, "loss/logits": 0.03855578228831291, "loss/reg": 0.0263040903955698, "step": 82 }, { "epoch": 0.010375, "grad_norm": 3.2140979766845703, "grad_norm_var": 0.5164286227091848, "learning_rate": 8.3e-05, "loss": 0.5548, "loss/crossentropy": 2.401925802230835, "loss/hidden": 0.2578125, "loss/logits": 0.03401009738445282, "loss/reg": 0.026300577446818352, "step": 83 }, { "epoch": 0.0105, "grad_norm": 2.4155867099761963, "grad_norm_var": 0.5225404018326155, "learning_rate": 8.4e-05, "loss": 0.5432, "loss/crossentropy": 2.6546974182128906, "loss/hidden": 0.244140625, "loss/logits": 0.03612750768661499, "loss/reg": 0.02629682794213295, "step": 84 }, { "epoch": 0.010625, "grad_norm": 4.232295036315918, "grad_norm_var": 0.6129643025970267, "learning_rate": 8.5e-05, "loss": 0.7199, "loss/crossentropy": 2.2300524711608887, "loss/hidden": 0.412109375, "loss/logits": 0.044823646545410156, "loss/reg": 0.0262930728495121, "step": 85 }, { "epoch": 0.01075, "grad_norm": 2.7160282135009766, "grad_norm_var": 0.5620128324129702, "learning_rate": 8.6e-05, "loss": 0.6973, "loss/crossentropy": 2.2953288555145264, "loss/hidden": 0.37109375, "loss/logits": 0.06333646178245544, "loss/reg": 0.02628917805850506, "step": 86 }, { "epoch": 0.010875, "grad_norm": 3.0872819423675537, "grad_norm_var": 0.5495392294868544, "learning_rate": 8.7e-05, "loss": 0.636, "loss/crossentropy": 2.449223756790161, "loss/hidden": 0.328125, "loss/logits": 0.045011188834905624, "loss/reg": 0.02628495544195175, "step": 87 }, { "epoch": 0.011, "grad_norm": 2.6966607570648193, "grad_norm_var": 0.5621192378124493, "learning_rate": 8.800000000000001e-05, "loss": 0.6272, "loss/crossentropy": 2.5449047088623047, "loss/hidden": 0.31640625, "loss/logits": 0.04802623763680458, "loss/reg": 0.026280568912625313, "step": 88 }, { "epoch": 0.011125, "grad_norm": 2.9160921573638916, "grad_norm_var": 0.48685408890983506, "learning_rate": 8.900000000000001e-05, "loss": 0.6278, "loss/crossentropy": 2.1442108154296875, "loss/hidden": 0.310546875, "loss/logits": 0.054462507367134094, "loss/reg": 0.02627684734761715, "step": 89 }, { "epoch": 0.01125, "grad_norm": 3.3378944396972656, "grad_norm_var": 0.4283231906687052, "learning_rate": 9e-05, "loss": 0.6536, "loss/crossentropy": 2.4361062049865723, "loss/hidden": 0.349609375, "loss/logits": 0.04130454361438751, "loss/reg": 0.026273205876350403, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.597607374191284, "grad_norm_var": 0.42452911296148627, "learning_rate": 9.1e-05, "loss": 0.6391, "loss/crossentropy": 2.0079762935638428, "loss/hidden": 0.3203125, "loss/logits": 0.056107863783836365, "loss/reg": 0.02626909501850605, "step": 91 }, { "epoch": 0.0115, "grad_norm": 4.960971355438232, "grad_norm_var": 0.5826997127177641, "learning_rate": 9.200000000000001e-05, "loss": 0.6735, "loss/crossentropy": 2.708275079727173, "loss/hidden": 0.349609375, "loss/logits": 0.06119866296648979, "loss/reg": 0.026265164837241173, "step": 92 }, { "epoch": 0.011625, "grad_norm": 3.8193323612213135, "grad_norm_var": 0.5981799563928756, "learning_rate": 9.300000000000001e-05, "loss": 0.8429, "loss/crossentropy": 2.4117016792297363, "loss/hidden": 0.498046875, "loss/logits": 0.08221981674432755, "loss/reg": 0.026260720565915108, "step": 93 }, { "epoch": 0.01175, "grad_norm": 3.434213638305664, "grad_norm_var": 0.5157332557296352, "learning_rate": 9.4e-05, "loss": 0.6738, "loss/crossentropy": 2.62178111076355, "loss/hidden": 0.36328125, "loss/logits": 0.04794853553175926, "loss/reg": 0.02625615894794464, "step": 94 }, { "epoch": 0.011875, "grad_norm": 3.0944480895996094, "grad_norm_var": 0.4859309000509171, "learning_rate": 9.5e-05, "loss": 0.6829, "loss/crossentropy": 2.582462787628174, "loss/hidden": 0.353515625, "loss/logits": 0.0669020414352417, "loss/reg": 0.026251958683133125, "step": 95 }, { "epoch": 0.012, "grad_norm": 3.9548256397247314, "grad_norm_var": 0.5194300484800329, "learning_rate": 9.6e-05, "loss": 0.8508, "loss/crossentropy": 2.3243165016174316, "loss/hidden": 0.49609375, "loss/logits": 0.09221720695495605, "loss/reg": 0.026247689500451088, "step": 96 }, { "epoch": 0.012125, "grad_norm": 8.949115753173828, "grad_norm_var": 2.5460815450669125, "learning_rate": 9.7e-05, "loss": 0.9668, "loss/crossentropy": 2.3593697547912598, "loss/hidden": 0.62109375, "loss/logits": 0.08330727368593216, "loss/reg": 0.026243869215250015, "step": 97 }, { "epoch": 0.01225, "grad_norm": 3.874511957168579, "grad_norm_var": 2.411331023611861, "learning_rate": 9.8e-05, "loss": 0.7272, "loss/crossentropy": 1.9933784008026123, "loss/hidden": 0.41015625, "loss/logits": 0.05464401841163635, "loss/reg": 0.026239972561597824, "step": 98 }, { "epoch": 0.012375, "grad_norm": 5.088143825531006, "grad_norm_var": 2.507843574169987, "learning_rate": 9.900000000000001e-05, "loss": 0.6761, "loss/crossentropy": 2.578767776489258, "loss/hidden": 0.3515625, "loss/logits": 0.0621890164911747, "loss/reg": 0.026235179975628853, "step": 99 }, { "epoch": 0.0125, "grad_norm": 3.9010627269744873, "grad_norm_var": 2.366914585761602, "learning_rate": 0.0001, "loss": 0.7051, "loss/crossentropy": 2.4717133045196533, "loss/hidden": 0.375, "loss/logits": 0.0677795261144638, "loss/reg": 0.026230769231915474, "step": 100 }, { "epoch": 0.012625, "grad_norm": 5.50706148147583, "grad_norm_var": 2.522191588171327, "learning_rate": 0.0001, "loss": 0.7765, "loss/crossentropy": 2.3764612674713135, "loss/hidden": 0.44921875, "loss/logits": 0.06503438949584961, "loss/reg": 0.02622627653181553, "step": 101 }, { "epoch": 0.01275, "grad_norm": 5.103200435638428, "grad_norm_var": 2.470966679212188, "learning_rate": 0.0001, "loss": 0.7008, "loss/crossentropy": 2.5796542167663574, "loss/hidden": 0.38671875, "loss/logits": 0.05191829800605774, "loss/reg": 0.026221245527267456, "step": 102 }, { "epoch": 0.012875, "grad_norm": 18.05303192138672, "grad_norm_var": 14.358413039824521, "learning_rate": 0.0001, "loss": 1.0008, "loss/crossentropy": 1.927337646484375, "loss/hidden": 0.6796875, "loss/logits": 0.05896752327680588, "loss/reg": 0.02621658518910408, "step": 103 }, { "epoch": 0.013, "grad_norm": 3.410438299179077, "grad_norm_var": 14.16338361533652, "learning_rate": 0.0001, "loss": 0.735, "loss/crossentropy": 2.290928363800049, "loss/hidden": 0.40625, "loss/logits": 0.06666909158229828, "loss/reg": 0.026211561635136604, "step": 104 }, { "epoch": 0.013125, "grad_norm": 3.117622137069702, "grad_norm_var": 14.10656391346422, "learning_rate": 0.0001, "loss": 0.6665, "loss/crossentropy": 2.6549246311187744, "loss/hidden": 0.353515625, "loss/logits": 0.05095440149307251, "loss/reg": 0.026206739246845245, "step": 105 }, { "epoch": 0.01325, "grad_norm": 3.9999241828918457, "grad_norm_var": 13.97508509706009, "learning_rate": 0.0001, "loss": 0.8082, "loss/crossentropy": 2.460174798965454, "loss/hidden": 0.46484375, "loss/logits": 0.08136071264743805, "loss/reg": 0.02620157040655613, "step": 106 }, { "epoch": 0.013375, "grad_norm": 3.405712842941284, "grad_norm_var": 13.73775124044489, "learning_rate": 0.0001, "loss": 0.6518, "loss/crossentropy": 2.521803855895996, "loss/hidden": 0.3359375, "loss/logits": 0.053909383714199066, "loss/reg": 0.02619684301316738, "step": 107 }, { "epoch": 0.0135, "grad_norm": 3.615098237991333, "grad_norm_var": 13.899167673014775, "learning_rate": 0.0001, "loss": 0.7068, "loss/crossentropy": 2.510159969329834, "loss/hidden": 0.37890625, "loss/logits": 0.06601101160049438, "loss/reg": 0.02619197592139244, "step": 108 }, { "epoch": 0.013625, "grad_norm": 4.2520599365234375, "grad_norm_var": 13.834356012442765, "learning_rate": 0.0001, "loss": 0.735, "loss/crossentropy": 2.508683681488037, "loss/hidden": 0.41015625, "loss/logits": 0.0629870742559433, "loss/reg": 0.0261868704110384, "step": 109 }, { "epoch": 0.01375, "grad_norm": 3.215749979019165, "grad_norm_var": 13.887973421518442, "learning_rate": 0.0001, "loss": 0.8268, "loss/crossentropy": 2.3564252853393555, "loss/hidden": 0.48046875, "loss/logits": 0.08449074625968933, "loss/reg": 0.026181429624557495, "step": 110 }, { "epoch": 0.013875, "grad_norm": 4.598328590393066, "grad_norm_var": 13.615373346458785, "learning_rate": 0.0001, "loss": 0.7398, "loss/crossentropy": 2.366943359375, "loss/hidden": 0.41796875, "loss/logits": 0.060083672404289246, "loss/reg": 0.026176555082201958, "step": 111 }, { "epoch": 0.014, "grad_norm": 2.758070707321167, "grad_norm_var": 13.912012390229316, "learning_rate": 0.0001, "loss": 0.6839, "loss/crossentropy": 2.3351521492004395, "loss/hidden": 0.365234375, "loss/logits": 0.056987129151821136, "loss/reg": 0.0261719711124897, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.9389584064483643, "grad_norm_var": 13.147693721903025, "learning_rate": 0.0001, "loss": 0.8964, "loss/crossentropy": 2.188626766204834, "loss/hidden": 0.54296875, "loss/logits": 0.09171397984027863, "loss/reg": 0.026167072355747223, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.8545026779174805, "grad_norm_var": 13.338918720074266, "learning_rate": 0.0001, "loss": 0.6652, "loss/crossentropy": 2.488462448120117, "loss/hidden": 0.341796875, "loss/logits": 0.06177069991827011, "loss/reg": 0.026162149384617805, "step": 114 }, { "epoch": 0.014375, "grad_norm": 3.343590497970581, "grad_norm_var": 13.447848849906688, "learning_rate": 0.0001, "loss": 0.7317, "loss/crossentropy": 2.4826672077178955, "loss/hidden": 0.396484375, "loss/logits": 0.07369040697813034, "loss/reg": 0.026156950742006302, "step": 115 }, { "epoch": 0.0145, "grad_norm": 5.309541702270508, "grad_norm_var": 13.435010363164546, "learning_rate": 0.0001, "loss": 0.6918, "loss/crossentropy": 2.715517282485962, "loss/hidden": 0.376953125, "loss/logits": 0.05337735265493393, "loss/reg": 0.026151426136493683, "step": 116 }, { "epoch": 0.014625, "grad_norm": 3.413027763366699, "grad_norm_var": 13.488672790501717, "learning_rate": 0.0001, "loss": 0.7942, "loss/crossentropy": 2.3932089805603027, "loss/hidden": 0.451171875, "loss/logits": 0.08154396712779999, "loss/reg": 0.026146216318011284, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.735275983810425, "grad_norm_var": 13.676075950246783, "learning_rate": 0.0001, "loss": 0.7606, "loss/crossentropy": 2.2933082580566406, "loss/hidden": 0.4296875, "loss/logits": 0.06949938833713531, "loss/reg": 0.02614082768559456, "step": 118 }, { "epoch": 0.014875, "grad_norm": 3.1346964836120605, "grad_norm_var": 0.5056645529599865, "learning_rate": 0.0001, "loss": 0.7759, "loss/crossentropy": 2.331713914871216, "loss/hidden": 0.4375, "loss/logits": 0.07704727351665497, "loss/reg": 0.026135168969631195, "step": 119 }, { "epoch": 0.015, "grad_norm": 3.8077635765075684, "grad_norm_var": 0.5104468723684629, "learning_rate": 0.0001, "loss": 0.6882, "loss/crossentropy": 2.331220865249634, "loss/hidden": 0.373046875, "loss/logits": 0.05386776477098465, "loss/reg": 0.02612963318824768, "step": 120 }, { "epoch": 0.015125, "grad_norm": 3.209914445877075, "grad_norm_var": 0.5058893418769751, "learning_rate": 0.0001, "loss": 0.7536, "loss/crossentropy": 2.352771759033203, "loss/hidden": 0.4140625, "loss/logits": 0.07825946807861328, "loss/reg": 0.026124266907572746, "step": 121 }, { "epoch": 0.01525, "grad_norm": 3.3548500537872314, "grad_norm_var": 0.49208198737674874, "learning_rate": 0.0001, "loss": 0.7088, "loss/crossentropy": 2.483644723892212, "loss/hidden": 0.384765625, "loss/logits": 0.06287863850593567, "loss/reg": 0.0261182002723217, "step": 122 }, { "epoch": 0.015375, "grad_norm": 3.9953765869140625, "grad_norm_var": 0.5066601541023895, "learning_rate": 0.0001, "loss": 0.7792, "loss/crossentropy": 2.6117637157440186, "loss/hidden": 0.435546875, "loss/logits": 0.08250629901885986, "loss/reg": 0.026112213730812073, "step": 123 }, { "epoch": 0.0155, "grad_norm": 3.1783852577209473, "grad_norm_var": 0.5138316405800327, "learning_rate": 0.0001, "loss": 0.7559, "loss/crossentropy": 2.401679754257202, "loss/hidden": 0.423828125, "loss/logits": 0.07104581594467163, "loss/reg": 0.02610679157078266, "step": 124 }, { "epoch": 0.015625, "grad_norm": 3.2759885787963867, "grad_norm_var": 0.47631527116517774, "learning_rate": 0.0001, "loss": 0.8262, "loss/crossentropy": 2.3979361057281494, "loss/hidden": 0.486328125, "loss/logits": 0.0788530558347702, "loss/reg": 0.026101654395461082, "step": 125 }, { "epoch": 0.01575, "grad_norm": 4.0768632888793945, "grad_norm_var": 0.4963098069624029, "learning_rate": 0.0001, "loss": 0.6983, "loss/crossentropy": 2.5287113189697266, "loss/hidden": 0.375, "loss/logits": 0.062308911234140396, "loss/reg": 0.026096193119883537, "step": 126 }, { "epoch": 0.015875, "grad_norm": 4.300101280212402, "grad_norm_var": 0.45815803943682926, "learning_rate": 0.0001, "loss": 0.858, "loss/crossentropy": 2.255234956741333, "loss/hidden": 0.50390625, "loss/logits": 0.0932290330529213, "loss/reg": 0.026090849190950394, "step": 127 }, { "epoch": 0.016, "grad_norm": 3.303663492202759, "grad_norm_var": 0.42421384752866137, "learning_rate": 0.0001, "loss": 0.7284, "loss/crossentropy": 2.528862476348877, "loss/hidden": 0.40234375, "loss/logits": 0.06523742526769638, "loss/reg": 0.026085302233695984, "step": 128 }, { "epoch": 0.016125, "grad_norm": 6.868241310119629, "grad_norm_var": 1.0876227157335427, "learning_rate": 0.0001, "loss": 0.8999, "loss/crossentropy": 2.6554996967315674, "loss/hidden": 0.5390625, "loss/logits": 0.1000661626458168, "loss/reg": 0.02607985958456993, "step": 129 }, { "epoch": 0.01625, "grad_norm": 3.3035075664520264, "grad_norm_var": 1.046006684658701, "learning_rate": 0.0001, "loss": 0.7367, "loss/crossentropy": 2.293642282485962, "loss/hidden": 0.40625, "loss/logits": 0.06973426043987274, "loss/reg": 0.026074659079313278, "step": 130 }, { "epoch": 0.016375, "grad_norm": 4.2563276290893555, "grad_norm_var": 1.0439696727840135, "learning_rate": 0.0001, "loss": 0.7754, "loss/crossentropy": 2.2192564010620117, "loss/hidden": 0.447265625, "loss/logits": 0.06743350625038147, "loss/reg": 0.026069074869155884, "step": 131 }, { "epoch": 0.0165, "grad_norm": 4.646778583526611, "grad_norm_var": 0.9420233457711596, "learning_rate": 0.0001, "loss": 0.7496, "loss/crossentropy": 2.430368423461914, "loss/hidden": 0.4140625, "loss/logits": 0.07494455575942993, "loss/reg": 0.026063458994030952, "step": 132 }, { "epoch": 0.016625, "grad_norm": 7.465832233428955, "grad_norm_var": 1.7574380087301391, "learning_rate": 0.0001, "loss": 0.9607, "loss/crossentropy": 2.6182897090911865, "loss/hidden": 0.57421875, "loss/logits": 0.1258610337972641, "loss/reg": 0.026057813316583633, "step": 133 }, { "epoch": 0.01675, "grad_norm": 4.5479936599731445, "grad_norm_var": 1.6433309350178509, "learning_rate": 0.0001, "loss": 0.9529, "loss/crossentropy": 2.238551378250122, "loss/hidden": 0.58984375, "loss/logits": 0.10252824425697327, "loss/reg": 0.02605200558900833, "step": 134 }, { "epoch": 0.016875, "grad_norm": 3.4055774211883545, "grad_norm_var": 1.6105102483452751, "learning_rate": 0.0001, "loss": 0.8407, "loss/crossentropy": 2.4216599464416504, "loss/hidden": 0.498046875, "loss/logits": 0.08216647803783417, "loss/reg": 0.026046328246593475, "step": 135 }, { "epoch": 0.017, "grad_norm": 8.839641571044922, "grad_norm_var": 2.938344740358995, "learning_rate": 0.0001, "loss": 1.1023, "loss/crossentropy": 2.534442901611328, "loss/hidden": 0.7109375, "loss/logits": 0.1310025304555893, "loss/reg": 0.026040658354759216, "step": 136 }, { "epoch": 0.017125, "grad_norm": 4.6421589851379395, "grad_norm_var": 2.819843479450094, "learning_rate": 0.0001, "loss": 0.7418, "loss/crossentropy": 2.605559825897217, "loss/hidden": 0.416015625, "loss/logits": 0.06548085808753967, "loss/reg": 0.02603481523692608, "step": 137 }, { "epoch": 0.01725, "grad_norm": 3.1547701358795166, "grad_norm_var": 2.8553314644504555, "learning_rate": 0.0001, "loss": 0.731, "loss/crossentropy": 2.5905325412750244, "loss/hidden": 0.40234375, "loss/logits": 0.06835847347974777, "loss/reg": 0.02602926455438137, "step": 138 }, { "epoch": 0.017375, "grad_norm": 4.074351787567139, "grad_norm_var": 2.849577549707145, "learning_rate": 0.0001, "loss": 0.9582, "loss/crossentropy": 2.1483733654022217, "loss/hidden": 0.60546875, "loss/logits": 0.0924658477306366, "loss/reg": 0.026023706421256065, "step": 139 }, { "epoch": 0.0175, "grad_norm": 3.758636713027954, "grad_norm_var": 2.7618912420839155, "learning_rate": 0.0001, "loss": 0.9996, "loss/crossentropy": 2.335742473602295, "loss/hidden": 0.6328125, "loss/logits": 0.10663188993930817, "loss/reg": 0.026017924770712852, "step": 140 }, { "epoch": 0.017625, "grad_norm": 4.186927795410156, "grad_norm_var": 2.6505093919275544, "learning_rate": 0.0001, "loss": 0.8414, "loss/crossentropy": 2.281843423843384, "loss/hidden": 0.498046875, "loss/logits": 0.08321215212345123, "loss/reg": 0.026011699810624123, "step": 141 }, { "epoch": 0.01775, "grad_norm": 3.1666276454925537, "grad_norm_var": 2.775123140671519, "learning_rate": 0.0001, "loss": 0.7768, "loss/crossentropy": 2.4267935752868652, "loss/hidden": 0.44140625, "loss/logits": 0.07538889348506927, "loss/reg": 0.02600528486073017, "step": 142 }, { "epoch": 0.017875, "grad_norm": 6.386529445648193, "grad_norm_var": 2.9581845034072245, "learning_rate": 0.0001, "loss": 0.8401, "loss/crossentropy": 2.7848174571990967, "loss/hidden": 0.50390625, "loss/logits": 0.07621172070503235, "loss/reg": 0.025999369099736214, "step": 143 }, { "epoch": 0.018, "grad_norm": 3.8083512783050537, "grad_norm_var": 2.876745593692829, "learning_rate": 0.0001, "loss": 0.9046, "loss/crossentropy": 2.341048002243042, "loss/hidden": 0.56640625, "loss/logits": 0.0782276839017868, "loss/reg": 0.0259928647428751, "step": 144 }, { "epoch": 0.018125, "grad_norm": 4.083465576171875, "grad_norm_var": 2.5868089188751657, "learning_rate": 0.0001, "loss": 0.9449, "loss/crossentropy": 2.3942995071411133, "loss/hidden": 0.59375, "loss/logits": 0.09130540490150452, "loss/reg": 0.025986921042203903, "step": 145 }, { "epoch": 0.01825, "grad_norm": 3.654815673828125, "grad_norm_var": 2.533420197907486, "learning_rate": 0.0001, "loss": 0.9459, "loss/crossentropy": 2.2414467334747314, "loss/hidden": 0.5859375, "loss/logits": 0.10016702860593796, "loss/reg": 0.02598092146217823, "step": 146 }, { "epoch": 0.018375, "grad_norm": 5.4976935386657715, "grad_norm_var": 2.567896035243579, "learning_rate": 0.0001, "loss": 0.9079, "loss/crossentropy": 2.1185100078582764, "loss/hidden": 0.58203125, "loss/logits": 0.06617112457752228, "loss/reg": 0.025974513962864876, "step": 147 }, { "epoch": 0.0185, "grad_norm": 3.4107933044433594, "grad_norm_var": 2.673383097164577, "learning_rate": 0.0001, "loss": 0.7982, "loss/crossentropy": 2.417313575744629, "loss/hidden": 0.462890625, "loss/logits": 0.07567355036735535, "loss/reg": 0.025968506932258606, "step": 148 }, { "epoch": 0.018625, "grad_norm": 3.589749574661255, "grad_norm_var": 2.1469293827286418, "learning_rate": 0.0001, "loss": 0.8088, "loss/crossentropy": 2.2554194927215576, "loss/hidden": 0.478515625, "loss/logits": 0.07071521133184433, "loss/reg": 0.025961775332689285, "step": 149 }, { "epoch": 0.01875, "grad_norm": 4.003805160522461, "grad_norm_var": 2.1538296896943887, "learning_rate": 0.0001, "loss": 0.7856, "loss/crossentropy": 2.7128918170928955, "loss/hidden": 0.453125, "loss/logits": 0.07290500402450562, "loss/reg": 0.025955306366086006, "step": 150 }, { "epoch": 0.018875, "grad_norm": 3.536449432373047, "grad_norm_var": 2.1383506752067736, "learning_rate": 0.0001, "loss": 0.7436, "loss/crossentropy": 2.509995460510254, "loss/hidden": 0.421875, "loss/logits": 0.062191903591156006, "loss/reg": 0.02594931609928608, "step": 151 }, { "epoch": 0.019, "grad_norm": 3.434654951095581, "grad_norm_var": 0.7374638182579057, "learning_rate": 0.0001, "loss": 0.9118, "loss/crossentropy": 2.3447437286376953, "loss/hidden": 0.55859375, "loss/logits": 0.0937500149011612, "loss/reg": 0.025943227112293243, "step": 152 }, { "epoch": 0.019125, "grad_norm": 8.066261291503906, "grad_norm_var": 1.7522972641868202, "learning_rate": 0.0001, "loss": 0.8648, "loss/crossentropy": 2.4481232166290283, "loss/hidden": 0.52734375, "loss/logits": 0.07811163365840912, "loss/reg": 0.025936946272850037, "step": 153 }, { "epoch": 0.01925, "grad_norm": 3.2214348316192627, "grad_norm_var": 1.7429433318934864, "learning_rate": 0.0001, "loss": 0.7673, "loss/crossentropy": 2.3142549991607666, "loss/hidden": 0.43359375, "loss/logits": 0.07435894012451172, "loss/reg": 0.02593095973134041, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.854038715362549, "grad_norm_var": 1.8633807825236384, "learning_rate": 0.0001, "loss": 0.7601, "loss/crossentropy": 2.7022647857666016, "loss/hidden": 0.43359375, "loss/logits": 0.06724615395069122, "loss/reg": 0.025924943387508392, "step": 155 }, { "epoch": 0.0195, "grad_norm": 3.1763484477996826, "grad_norm_var": 1.9162196068123232, "learning_rate": 0.0001, "loss": 0.9, "loss/crossentropy": 2.574676036834717, "loss/hidden": 0.54296875, "loss/logits": 0.09785018861293793, "loss/reg": 0.025919148698449135, "step": 156 }, { "epoch": 0.019625, "grad_norm": 3.999523162841797, "grad_norm_var": 1.9169889601133074, "learning_rate": 0.0001, "loss": 0.858, "loss/crossentropy": 2.480532646179199, "loss/hidden": 0.515625, "loss/logits": 0.08323468267917633, "loss/reg": 0.025913061574101448, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.9662230014801025, "grad_norm_var": 1.944924590139983, "learning_rate": 0.0001, "loss": 0.818, "loss/crossentropy": 2.459967613220215, "loss/hidden": 0.48046875, "loss/logits": 0.07847169041633606, "loss/reg": 0.025906959548592567, "step": 158 }, { "epoch": 0.019875, "grad_norm": 7.074191093444824, "grad_norm_var": 2.1836107796529065, "learning_rate": 0.0001, "loss": 0.9801, "loss/crossentropy": 2.2858352661132812, "loss/hidden": 0.6484375, "loss/logits": 0.07268328964710236, "loss/reg": 0.025901462882757187, "step": 159 }, { "epoch": 0.02, "grad_norm": 3.6333370208740234, "grad_norm_var": 2.193465227977892, "learning_rate": 0.0001, "loss": 0.9048, "loss/crossentropy": 2.4626388549804688, "loss/hidden": 0.5546875, "loss/logits": 0.09115847945213318, "loss/reg": 0.02589540183544159, "step": 160 }, { "epoch": 0.020125, "grad_norm": 4.283749103546143, "grad_norm_var": 2.1945247126451437, "learning_rate": 0.0001, "loss": 0.9251, "loss/crossentropy": 2.5746121406555176, "loss/hidden": 0.58203125, "loss/logits": 0.0841851755976677, "loss/reg": 0.025888830423355103, "step": 161 }, { "epoch": 0.02025, "grad_norm": 3.628138542175293, "grad_norm_var": 2.196331220420876, "learning_rate": 0.0001, "loss": 0.9408, "loss/crossentropy": 2.398010730743408, "loss/hidden": 0.58203125, "loss/logits": 0.09991887211799622, "loss/reg": 0.025882074609398842, "step": 162 }, { "epoch": 0.020375, "grad_norm": 3.5412559509277344, "grad_norm_var": 2.0836172065033507, "learning_rate": 0.0001, "loss": 0.857, "loss/crossentropy": 2.5064220428466797, "loss/hidden": 0.50390625, "loss/logits": 0.0943569540977478, "loss/reg": 0.025876009836792946, "step": 163 }, { "epoch": 0.0205, "grad_norm": 3.498668670654297, "grad_norm_var": 2.0768887394910185, "learning_rate": 0.0001, "loss": 0.8917, "loss/crossentropy": 2.535529375076294, "loss/hidden": 0.52734375, "loss/logits": 0.10565976053476334, "loss/reg": 0.02587023191154003, "step": 164 }, { "epoch": 0.020625, "grad_norm": 3.14025616645813, "grad_norm_var": 2.116006039378421, "learning_rate": 0.0001, "loss": 0.7816, "loss/crossentropy": 2.7250640392303467, "loss/hidden": 0.453125, "loss/logits": 0.06980661302804947, "loss/reg": 0.025863803923130035, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.6821706295013428, "grad_norm_var": 2.2251478520018773, "learning_rate": 0.0001, "loss": 0.7558, "loss/crossentropy": 2.230567693710327, "loss/hidden": 0.43359375, "loss/logits": 0.06358660757541656, "loss/reg": 0.02585742622613907, "step": 166 }, { "epoch": 0.020875, "grad_norm": 3.8048856258392334, "grad_norm_var": 2.2158862694911607, "learning_rate": 0.0001, "loss": 0.7965, "loss/crossentropy": 2.69014310836792, "loss/hidden": 0.453125, "loss/logits": 0.08488957583904266, "loss/reg": 0.025850988924503326, "step": 167 }, { "epoch": 0.021, "grad_norm": 4.305826187133789, "grad_norm_var": 2.2048741298976515, "learning_rate": 0.0001, "loss": 1.1009, "loss/crossentropy": 2.2925851345062256, "loss/hidden": 0.73828125, "loss/logits": 0.10412566363811493, "loss/reg": 0.025845136493444443, "step": 168 }, { "epoch": 0.021125, "grad_norm": 4.051706790924072, "grad_norm_var": 1.0314628897999674, "learning_rate": 0.0001, "loss": 0.7756, "loss/crossentropy": 2.4515578746795654, "loss/hidden": 0.4453125, "loss/logits": 0.07186460494995117, "loss/reg": 0.02583896555006504, "step": 169 }, { "epoch": 0.02125, "grad_norm": 3.3800575733184814, "grad_norm_var": 1.022039210437893, "learning_rate": 0.0001, "loss": 0.798, "loss/crossentropy": 2.3387436866760254, "loss/hidden": 0.455078125, "loss/logits": 0.08458675444126129, "loss/reg": 0.025832952931523323, "step": 170 }, { "epoch": 0.021375, "grad_norm": 3.069735527038574, "grad_norm_var": 0.9991429378891439, "learning_rate": 0.0001, "loss": 0.7416, "loss/crossentropy": 2.660727024078369, "loss/hidden": 0.412109375, "loss/logits": 0.07125158607959747, "loss/reg": 0.025827286764979362, "step": 171 }, { "epoch": 0.0215, "grad_norm": 3.5827748775482178, "grad_norm_var": 0.9775809993657352, "learning_rate": 0.0001, "loss": 0.9301, "loss/crossentropy": 2.160230875015259, "loss/hidden": 0.5703125, "loss/logits": 0.10158500075340271, "loss/reg": 0.025821613147854805, "step": 172 }, { "epoch": 0.021625, "grad_norm": 3.0348143577575684, "grad_norm_var": 1.008817027257092, "learning_rate": 0.0001, "loss": 0.9059, "loss/crossentropy": 2.5519795417785645, "loss/hidden": 0.5390625, "loss/logits": 0.10867513716220856, "loss/reg": 0.025815250352025032, "step": 173 }, { "epoch": 0.02175, "grad_norm": 3.325514316558838, "grad_norm_var": 0.980302655794393, "learning_rate": 0.0001, "loss": 0.7651, "loss/crossentropy": 2.3060855865478516, "loss/hidden": 0.43359375, "loss/logits": 0.0733788013458252, "loss/reg": 0.02580902725458145, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.9402523040771484, "grad_norm_var": 0.21740374576502078, "learning_rate": 0.0001, "loss": 0.859, "loss/crossentropy": 2.4109151363372803, "loss/hidden": 0.515625, "loss/logits": 0.0853327289223671, "loss/reg": 0.025802936404943466, "step": 175 }, { "epoch": 0.022, "grad_norm": 4.970669746398926, "grad_norm_var": 0.354037293260262, "learning_rate": 0.0001, "loss": 0.9835, "loss/crossentropy": 2.9366648197174072, "loss/hidden": 0.6171875, "loss/logits": 0.10840301960706711, "loss/reg": 0.02579565905034542, "step": 176 }, { "epoch": 0.022125, "grad_norm": 4.438536643981934, "grad_norm_var": 0.37010993593279384, "learning_rate": 0.0001, "loss": 0.8574, "loss/crossentropy": 2.4177615642547607, "loss/hidden": 0.51171875, "loss/logits": 0.08775197714567184, "loss/reg": 0.025788920000195503, "step": 177 }, { "epoch": 0.02225, "grad_norm": 4.3905863761901855, "grad_norm_var": 0.41060424896311526, "learning_rate": 0.0001, "loss": 0.8796, "loss/crossentropy": 2.320418119430542, "loss/hidden": 0.53125, "loss/logits": 0.09051363915205002, "loss/reg": 0.02578234300017357, "step": 178 }, { "epoch": 0.022375, "grad_norm": 3.2044453620910645, "grad_norm_var": 0.42189777730298467, "learning_rate": 0.0001, "loss": 0.785, "loss/crossentropy": 2.327108383178711, "loss/hidden": 0.447265625, "loss/logits": 0.08003242313861847, "loss/reg": 0.025775177404284477, "step": 179 }, { "epoch": 0.0225, "grad_norm": 3.2016260623931885, "grad_norm_var": 0.4319725268587102, "learning_rate": 0.0001, "loss": 0.8076, "loss/crossentropy": 2.6913022994995117, "loss/hidden": 0.466796875, "loss/logits": 0.08316424489021301, "loss/reg": 0.02576799876987934, "step": 180 }, { "epoch": 0.022625, "grad_norm": 3.216141939163208, "grad_norm_var": 0.42772885748243633, "learning_rate": 0.0001, "loss": 0.7782, "loss/crossentropy": 2.4444668292999268, "loss/hidden": 0.447265625, "loss/logits": 0.07332297414541245, "loss/reg": 0.025760415941476822, "step": 181 }, { "epoch": 0.02275, "grad_norm": 3.6005637645721436, "grad_norm_var": 0.3680557604441513, "learning_rate": 0.0001, "loss": 0.8578, "loss/crossentropy": 2.2084226608276367, "loss/hidden": 0.51171875, "loss/logits": 0.0885147675871849, "loss/reg": 0.025753989815711975, "step": 182 }, { "epoch": 0.022875, "grad_norm": 4.19577693939209, "grad_norm_var": 0.3852931468556484, "learning_rate": 0.0001, "loss": 0.9394, "loss/crossentropy": 2.557783842086792, "loss/hidden": 0.56640625, "loss/logits": 0.11552520841360092, "loss/reg": 0.025747526437044144, "step": 183 }, { "epoch": 0.023, "grad_norm": 3.024552822113037, "grad_norm_var": 0.38129301153876227, "learning_rate": 0.0001, "loss": 0.8858, "loss/crossentropy": 2.994615316390991, "loss/hidden": 0.52734375, "loss/logits": 0.10108112543821335, "loss/reg": 0.02574075385928154, "step": 184 }, { "epoch": 0.023125, "grad_norm": 3.3255319595336914, "grad_norm_var": 0.3706833429951111, "learning_rate": 0.0001, "loss": 0.8407, "loss/crossentropy": 2.677245855331421, "loss/hidden": 0.4765625, "loss/logits": 0.10678394883871078, "loss/reg": 0.02573317475616932, "step": 185 }, { "epoch": 0.02325, "grad_norm": 3.341599464416504, "grad_norm_var": 0.3716797590150757, "learning_rate": 0.0001, "loss": 0.9127, "loss/crossentropy": 2.156444549560547, "loss/hidden": 0.56640625, "loss/logits": 0.0890708938241005, "loss/reg": 0.02572541870176792, "step": 186 }, { "epoch": 0.023375, "grad_norm": 2.925915241241455, "grad_norm_var": 0.3822577484351124, "learning_rate": 0.0001, "loss": 0.8815, "loss/crossentropy": 2.2716755867004395, "loss/hidden": 0.53125, "loss/logits": 0.09304732084274292, "loss/reg": 0.02571748197078705, "step": 187 }, { "epoch": 0.0235, "grad_norm": 4.192226886749268, "grad_norm_var": 0.40854537365233884, "learning_rate": 0.0001, "loss": 0.8547, "loss/crossentropy": 2.378901720046997, "loss/hidden": 0.5078125, "loss/logits": 0.08977752178907394, "loss/reg": 0.025709524750709534, "step": 188 }, { "epoch": 0.023625, "grad_norm": 5.648179054260254, "grad_norm_var": 0.6443691048121629, "learning_rate": 0.0001, "loss": 1.0032, "loss/crossentropy": 2.4307687282562256, "loss/hidden": 0.63671875, "loss/logits": 0.10948194563388824, "loss/reg": 0.025701580569148064, "step": 189 }, { "epoch": 0.02375, "grad_norm": 6.345841884613037, "grad_norm_var": 1.045029826307897, "learning_rate": 0.0001, "loss": 1.0995, "loss/crossentropy": 2.279203414916992, "loss/hidden": 0.74609375, "loss/logits": 0.09642117470502853, "loss/reg": 0.025694590061903, "step": 190 }, { "epoch": 0.023875, "grad_norm": 4.242865085601807, "grad_norm_var": 0.9782837984014707, "learning_rate": 0.0001, "loss": 1.155, "loss/crossentropy": 2.235325574874878, "loss/hidden": 0.7890625, "loss/logits": 0.10906486213207245, "loss/reg": 0.025686509907245636, "step": 191 }, { "epoch": 0.024, "grad_norm": 6.347895622253418, "grad_norm_var": 1.272032888242258, "learning_rate": 0.0001, "loss": 1.0789, "loss/crossentropy": 2.5386898517608643, "loss/hidden": 0.67578125, "loss/logits": 0.1463102549314499, "loss/reg": 0.025679145008325577, "step": 192 }, { "epoch": 0.024125, "grad_norm": 3.077846050262451, "grad_norm_var": 1.3268106432816444, "learning_rate": 0.0001, "loss": 0.855, "loss/crossentropy": 2.5889694690704346, "loss/hidden": 0.515625, "loss/logits": 0.08266487717628479, "loss/reg": 0.025671878829598427, "step": 193 }, { "epoch": 0.02425, "grad_norm": 3.672849416732788, "grad_norm_var": 1.323313109234428, "learning_rate": 0.0001, "loss": 0.8213, "loss/crossentropy": 2.269009590148926, "loss/hidden": 0.4921875, "loss/logits": 0.07244250178337097, "loss/reg": 0.02566472254693508, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.13712215423584, "grad_norm_var": 1.330492936258482, "learning_rate": 0.0001, "loss": 0.9044, "loss/crossentropy": 2.3420536518096924, "loss/hidden": 0.5390625, "loss/logits": 0.10874692350625992, "loss/reg": 0.02565707452595234, "step": 195 }, { "epoch": 0.0245, "grad_norm": 5.941372871398926, "grad_norm_var": 1.5194802994125645, "learning_rate": 0.0001, "loss": 1.0268, "loss/crossentropy": 2.245668649673462, "loss/hidden": 0.63671875, "loss/logits": 0.13362175226211548, "loss/reg": 0.02564912661910057, "step": 196 }, { "epoch": 0.024625, "grad_norm": 2.8778631687164307, "grad_norm_var": 1.5682913914576866, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.4744086265563965, "loss/hidden": 0.57421875, "loss/logits": 0.10236240178346634, "loss/reg": 0.025642510503530502, "step": 197 }, { "epoch": 0.02475, "grad_norm": 5.235295295715332, "grad_norm_var": 1.6223942527523605, "learning_rate": 0.0001, "loss": 0.993, "loss/crossentropy": 2.3120462894439697, "loss/hidden": 0.61328125, "loss/logits": 0.12334179133176804, "loss/reg": 0.025634463876485825, "step": 198 }, { "epoch": 0.024875, "grad_norm": 3.2772397994995117, "grad_norm_var": 1.6781902664948347, "learning_rate": 0.0001, "loss": 0.9134, "loss/crossentropy": 2.2896904945373535, "loss/hidden": 0.5625, "loss/logits": 0.0946369469165802, "loss/reg": 0.025627706199884415, "step": 199 }, { "epoch": 0.025, "grad_norm": 4.021130084991455, "grad_norm_var": 1.5889382838258257, "learning_rate": 0.0001, "loss": 0.9467, "loss/crossentropy": 2.3281702995300293, "loss/hidden": 0.5859375, "loss/logits": 0.10459813475608826, "loss/reg": 0.025620225816965103, "step": 200 }, { "epoch": 0.025125, "grad_norm": 3.3705508708953857, "grad_norm_var": 1.5836618344967157, "learning_rate": 0.0001, "loss": 1.0247, "loss/crossentropy": 2.3704278469085693, "loss/hidden": 0.640625, "loss/logits": 0.12795141339302063, "loss/reg": 0.025613589212298393, "step": 201 }, { "epoch": 0.02525, "grad_norm": 5.586423397064209, "grad_norm_var": 1.6331597901730046, "learning_rate": 0.0001, "loss": 1.2703, "loss/crossentropy": 2.3346736431121826, "loss/hidden": 0.83984375, "loss/logits": 0.17438159883022308, "loss/reg": 0.025606893002986908, "step": 202 }, { "epoch": 0.025375, "grad_norm": 6.558523654937744, "grad_norm_var": 1.7590475344041898, "learning_rate": 0.0001, "loss": 1.0711, "loss/crossentropy": 2.264883518218994, "loss/hidden": 0.71484375, "loss/logits": 0.10028564184904099, "loss/reg": 0.025599893182516098, "step": 203 }, { "epoch": 0.0255, "grad_norm": 3.024080991744995, "grad_norm_var": 1.9071946132324447, "learning_rate": 0.0001, "loss": 0.9349, "loss/crossentropy": 2.505457878112793, "loss/hidden": 0.58203125, "loss/logits": 0.09690214693546295, "loss/reg": 0.025592036545276642, "step": 204 }, { "epoch": 0.025625, "grad_norm": 3.268216133117676, "grad_norm_var": 1.9040994009139534, "learning_rate": 0.0001, "loss": 0.8433, "loss/crossentropy": 2.629786968231201, "loss/hidden": 0.5, "loss/logits": 0.08743932843208313, "loss/reg": 0.025583887472748756, "step": 205 }, { "epoch": 0.02575, "grad_norm": 5.203437328338623, "grad_norm_var": 1.6853258867355625, "learning_rate": 0.0001, "loss": 0.9278, "loss/crossentropy": 2.368603229522705, "loss/hidden": 0.5625, "loss/logits": 0.10955986380577087, "loss/reg": 0.025574835017323494, "step": 206 }, { "epoch": 0.025875, "grad_norm": 5.106112480163574, "grad_norm_var": 1.725017173963382, "learning_rate": 0.0001, "loss": 0.9881, "loss/crossentropy": 2.554746389389038, "loss/hidden": 0.6171875, "loss/logits": 0.11524944007396698, "loss/reg": 0.02556804195046425, "step": 207 }, { "epoch": 0.026, "grad_norm": 8.258187294006348, "grad_norm_var": 2.4602814049521387, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.4391205310821533, "loss/hidden": 0.94140625, "loss/logits": 0.17047560214996338, "loss/reg": 0.025560656562447548, "step": 208 }, { "epoch": 0.026125, "grad_norm": 3.8223764896392822, "grad_norm_var": 2.3561294395388566, "learning_rate": 0.0001, "loss": 1.02, "loss/crossentropy": 2.5300426483154297, "loss/hidden": 0.62890625, "loss/logits": 0.13556598126888275, "loss/reg": 0.025551345199346542, "step": 209 }, { "epoch": 0.02625, "grad_norm": 3.6237666606903076, "grad_norm_var": 2.36184075461094, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.3368213176727295, "loss/hidden": 0.6640625, "loss/logits": 0.10230866074562073, "loss/reg": 0.025544527918100357, "step": 210 }, { "epoch": 0.026375, "grad_norm": 3.1394050121307373, "grad_norm_var": 2.3614203164344407, "learning_rate": 0.0001, "loss": 0.964, "loss/crossentropy": 2.338050603866577, "loss/hidden": 0.609375, "loss/logits": 0.09931059181690216, "loss/reg": 0.025535617023706436, "step": 211 }, { "epoch": 0.0265, "grad_norm": 3.1498186588287354, "grad_norm_var": 2.319283484830568, "learning_rate": 0.0001, "loss": 0.8039, "loss/crossentropy": 2.5108940601348877, "loss/hidden": 0.46875, "loss/logits": 0.07982419431209564, "loss/reg": 0.025528721511363983, "step": 212 }, { "epoch": 0.026625, "grad_norm": 3.334510326385498, "grad_norm_var": 2.2429786468962374, "learning_rate": 0.0001, "loss": 1.1105, "loss/crossentropy": 2.458519697189331, "loss/hidden": 0.71875, "loss/logits": 0.1365831047296524, "loss/reg": 0.02552017569541931, "step": 213 }, { "epoch": 0.02675, "grad_norm": 3.3243789672851562, "grad_norm_var": 2.2516768547287977, "learning_rate": 0.0001, "loss": 1.0016, "loss/crossentropy": 2.2530109882354736, "loss/hidden": 0.62890625, "loss/logits": 0.11759582161903381, "loss/reg": 0.025511734187602997, "step": 214 }, { "epoch": 0.026875, "grad_norm": 3.3768937587738037, "grad_norm_var": 2.2393156807375245, "learning_rate": 0.0001, "loss": 1.0452, "loss/crossentropy": 2.3267643451690674, "loss/hidden": 0.65625, "loss/logits": 0.13390058279037476, "loss/reg": 0.0255054272711277, "step": 215 }, { "epoch": 0.027, "grad_norm": 7.391561031341553, "grad_norm_var": 2.841738119885866, "learning_rate": 0.0001, "loss": 1.2263, "loss/crossentropy": 2.285508394241333, "loss/hidden": 0.8203125, "loss/logits": 0.1510239541530609, "loss/reg": 0.025499247014522552, "step": 216 }, { "epoch": 0.027125, "grad_norm": 3.143969774246216, "grad_norm_var": 2.8781965049841705, "learning_rate": 0.0001, "loss": 0.9421, "loss/crossentropy": 2.6111316680908203, "loss/hidden": 0.5703125, "loss/logits": 0.11691074818372726, "loss/reg": 0.025491848587989807, "step": 217 }, { "epoch": 0.02725, "grad_norm": 4.989267826080322, "grad_norm_var": 2.8105564664802234, "learning_rate": 0.0001, "loss": 0.9104, "loss/crossentropy": 2.7856109142303467, "loss/hidden": 0.546875, "loss/logits": 0.10870229452848434, "loss/reg": 0.025485411286354065, "step": 218 }, { "epoch": 0.027375, "grad_norm": 8.867380142211914, "grad_norm_var": 3.802177537112387, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.414778470993042, "loss/hidden": 0.84375, "loss/logits": 0.199102982878685, "loss/reg": 0.025478005409240723, "step": 219 }, { "epoch": 0.0275, "grad_norm": 3.949193239212036, "grad_norm_var": 3.665725599495321, "learning_rate": 0.0001, "loss": 1.0781, "loss/crossentropy": 2.3898277282714844, "loss/hidden": 0.6796875, "loss/logits": 0.14368270337581635, "loss/reg": 0.02547168917953968, "step": 220 }, { "epoch": 0.027625, "grad_norm": 7.980153560638428, "grad_norm_var": 4.202985170085262, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.0598959922790527, "loss/hidden": 0.9921875, "loss/logits": 0.15361803770065308, "loss/reg": 0.02546495571732521, "step": 221 }, { "epoch": 0.02775, "grad_norm": 4.118736743927002, "grad_norm_var": 4.234989890674561, "learning_rate": 0.0001, "loss": 0.9084, "loss/crossentropy": 2.2568750381469727, "loss/hidden": 0.55859375, "loss/logits": 0.09525588899850845, "loss/reg": 0.02545757219195366, "step": 222 }, { "epoch": 0.027875, "grad_norm": 3.730299711227417, "grad_norm_var": 4.306033514824207, "learning_rate": 0.0001, "loss": 0.9394, "loss/crossentropy": 2.7180914878845215, "loss/hidden": 0.57421875, "loss/logits": 0.11064038425683975, "loss/reg": 0.025450890883803368, "step": 223 }, { "epoch": 0.028, "grad_norm": 3.138925552368164, "grad_norm_var": 3.557911666554559, "learning_rate": 0.0001, "loss": 0.8406, "loss/crossentropy": 2.3719072341918945, "loss/hidden": 0.5078125, "loss/logits": 0.07834647595882416, "loss/reg": 0.02544352412223816, "step": 224 }, { "epoch": 0.028125, "grad_norm": 3.478994846343994, "grad_norm_var": 3.593674795871404, "learning_rate": 0.0001, "loss": 0.9072, "loss/crossentropy": 2.4101219177246094, "loss/hidden": 0.56640625, "loss/logits": 0.0863800048828125, "loss/reg": 0.025437019765377045, "step": 225 }, { "epoch": 0.02825, "grad_norm": 5.091615676879883, "grad_norm_var": 3.572291640880083, "learning_rate": 0.0001, "loss": 0.9864, "loss/crossentropy": 2.2258856296539307, "loss/hidden": 0.640625, "loss/logits": 0.09143185615539551, "loss/reg": 0.025430168956518173, "step": 226 }, { "epoch": 0.028375, "grad_norm": 3.616190195083618, "grad_norm_var": 3.4991896025782796, "learning_rate": 0.0001, "loss": 0.9246, "loss/crossentropy": 2.765105724334717, "loss/hidden": 0.5703125, "loss/logits": 0.10003923624753952, "loss/reg": 0.025423482060432434, "step": 227 }, { "epoch": 0.0285, "grad_norm": 4.406581878662109, "grad_norm_var": 3.364516245493509, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.41001033782959, "loss/hidden": 0.6171875, "loss/logits": 0.10860306769609451, "loss/reg": 0.025416266173124313, "step": 228 }, { "epoch": 0.028625, "grad_norm": 3.003995418548584, "grad_norm_var": 3.4280449285692023, "learning_rate": 0.0001, "loss": 0.9759, "loss/crossentropy": 2.518749952316284, "loss/hidden": 0.62109375, "loss/logits": 0.10075733810663223, "loss/reg": 0.025409165769815445, "step": 229 }, { "epoch": 0.02875, "grad_norm": 4.073727130889893, "grad_norm_var": 3.3356380380428576, "learning_rate": 0.0001, "loss": 0.8881, "loss/crossentropy": 2.6824846267700195, "loss/hidden": 0.5390625, "loss/logits": 0.09498724341392517, "loss/reg": 0.025402268394827843, "step": 230 }, { "epoch": 0.028875, "grad_norm": 3.8958635330200195, "grad_norm_var": 3.2645611787952435, "learning_rate": 0.0001, "loss": 0.8794, "loss/crossentropy": 2.684971332550049, "loss/hidden": 0.53515625, "loss/logits": 0.09024453163146973, "loss/reg": 0.025395380333065987, "step": 231 }, { "epoch": 0.029, "grad_norm": 3.5619406700134277, "grad_norm_var": 2.796506014438906, "learning_rate": 0.0001, "loss": 0.957, "loss/crossentropy": 2.7883284091949463, "loss/hidden": 0.6015625, "loss/logits": 0.10159540176391602, "loss/reg": 0.02538810484111309, "step": 232 }, { "epoch": 0.029125, "grad_norm": 12.658771514892578, "grad_norm_var": 6.809983669727001, "learning_rate": 0.0001, "loss": 1.1843, "loss/crossentropy": 2.537827253341675, "loss/hidden": 0.7890625, "loss/logits": 0.14144758880138397, "loss/reg": 0.02538110502064228, "step": 233 }, { "epoch": 0.02925, "grad_norm": 8.465475082397461, "grad_norm_var": 7.543990683504188, "learning_rate": 0.0001, "loss": 1.2958, "loss/crossentropy": 2.6715972423553467, "loss/hidden": 0.875, "loss/logits": 0.16701380908489227, "loss/reg": 0.02537420578300953, "step": 234 }, { "epoch": 0.029375, "grad_norm": 6.49767541885376, "grad_norm_var": 6.75275709893707, "learning_rate": 0.0001, "loss": 1.0826, "loss/crossentropy": 2.316210985183716, "loss/hidden": 0.7109375, "loss/logits": 0.11794352531433105, "loss/reg": 0.025367144495248795, "step": 235 }, { "epoch": 0.0295, "grad_norm": 4.668674945831299, "grad_norm_var": 6.674304000957216, "learning_rate": 0.0001, "loss": 1.0642, "loss/crossentropy": 1.956210732460022, "loss/hidden": 0.69921875, "loss/logits": 0.11135473847389221, "loss/reg": 0.02535996399819851, "step": 236 }, { "epoch": 0.029625, "grad_norm": 6.132915019989014, "grad_norm_var": 6.19031909782113, "learning_rate": 0.0001, "loss": 1.0462, "loss/crossentropy": 2.6552460193634033, "loss/hidden": 0.6640625, "loss/logits": 0.12862679362297058, "loss/reg": 0.025352442637085915, "step": 237 }, { "epoch": 0.02975, "grad_norm": 4.94125509262085, "grad_norm_var": 6.132251305092321, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.204523801803589, "loss/hidden": 0.84765625, "loss/logits": 0.17835499346256256, "loss/reg": 0.025344664230942726, "step": 238 }, { "epoch": 0.029875, "grad_norm": 3.7683677673339844, "grad_norm_var": 6.125464850588167, "learning_rate": 0.0001, "loss": 0.985, "loss/crossentropy": 2.652397632598877, "loss/hidden": 0.625, "loss/logits": 0.10665792226791382, "loss/reg": 0.0253366157412529, "step": 239 }, { "epoch": 0.03, "grad_norm": 4.732015609741211, "grad_norm_var": 5.870172361717241, "learning_rate": 0.0001, "loss": 1.1018, "loss/crossentropy": 2.544055938720703, "loss/hidden": 0.7109375, "loss/logits": 0.13754525780677795, "loss/reg": 0.02532930299639702, "step": 240 }, { "epoch": 0.030125, "grad_norm": 4.046056270599365, "grad_norm_var": 5.761120866273571, "learning_rate": 0.0001, "loss": 0.9396, "loss/crossentropy": 2.6015427112579346, "loss/hidden": 0.5859375, "loss/logits": 0.10041546076536179, "loss/reg": 0.025322062894701958, "step": 241 }, { "epoch": 0.03025, "grad_norm": 3.066027879714966, "grad_norm_var": 6.052926687728684, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.4402151107788086, "loss/hidden": 0.6875, "loss/logits": 0.12614867091178894, "loss/reg": 0.0253145694732666, "step": 242 }, { "epoch": 0.030375, "grad_norm": 3.168888807296753, "grad_norm_var": 6.1536859873832555, "learning_rate": 0.0001, "loss": 0.9413, "loss/crossentropy": 2.5689940452575684, "loss/hidden": 0.58984375, "loss/logits": 0.09841729700565338, "loss/reg": 0.02530776336789131, "step": 243 }, { "epoch": 0.0305, "grad_norm": 3.806450366973877, "grad_norm_var": 6.229122059899357, "learning_rate": 0.0001, "loss": 1.021, "loss/crossentropy": 2.2438809871673584, "loss/hidden": 0.6328125, "loss/logits": 0.13515594601631165, "loss/reg": 0.02530042454600334, "step": 244 }, { "epoch": 0.030625, "grad_norm": 4.313736915588379, "grad_norm_var": 5.982441934425083, "learning_rate": 0.0001, "loss": 1.0307, "loss/crossentropy": 2.5067784786224365, "loss/hidden": 0.65234375, "loss/logits": 0.1253766119480133, "loss/reg": 0.025293124839663506, "step": 245 }, { "epoch": 0.03075, "grad_norm": 4.419394016265869, "grad_norm_var": 5.942040082684442, "learning_rate": 0.0001, "loss": 0.8957, "loss/crossentropy": 2.4725239276885986, "loss/hidden": 0.55078125, "loss/logits": 0.09201550483703613, "loss/reg": 0.02528616413474083, "step": 246 }, { "epoch": 0.030875, "grad_norm": 3.709151268005371, "grad_norm_var": 5.975041529003943, "learning_rate": 0.0001, "loss": 0.9152, "loss/crossentropy": 2.413250207901001, "loss/hidden": 0.56640625, "loss/logits": 0.09596529603004456, "loss/reg": 0.025278838351368904, "step": 247 }, { "epoch": 0.031, "grad_norm": 3.4139935970306396, "grad_norm_var": 6.007189625317282, "learning_rate": 0.0001, "loss": 1.0268, "loss/crossentropy": 2.1580560207366943, "loss/hidden": 0.65625, "loss/logits": 0.11786328256130219, "loss/reg": 0.025271562859416008, "step": 248 }, { "epoch": 0.031125, "grad_norm": 6.089378833770752, "grad_norm_var": 2.0950588257899443, "learning_rate": 0.0001, "loss": 1.0999, "loss/crossentropy": 2.5466785430908203, "loss/hidden": 0.703125, "loss/logits": 0.1441519856452942, "loss/reg": 0.025264522060751915, "step": 249 }, { "epoch": 0.03125, "grad_norm": 4.001245498657227, "grad_norm_var": 1.1007847740596406, "learning_rate": 0.0001, "loss": 0.9263, "loss/crossentropy": 2.433518171310425, "loss/hidden": 0.578125, "loss/logits": 0.09559185057878494, "loss/reg": 0.025257611647248268, "step": 250 }, { "epoch": 0.031375, "grad_norm": 4.982790946960449, "grad_norm_var": 0.8252532202355399, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.3542721271514893, "loss/hidden": 0.73046875, "loss/logits": 0.16091328859329224, "loss/reg": 0.025250321254134178, "step": 251 }, { "epoch": 0.0315, "grad_norm": 3.6227378845214844, "grad_norm_var": 0.8462248829388517, "learning_rate": 0.0001, "loss": 1.1789, "loss/crossentropy": 2.1333043575286865, "loss/hidden": 0.7890625, "loss/logits": 0.1374010145664215, "loss/reg": 0.025242896750569344, "step": 252 }, { "epoch": 0.031625, "grad_norm": 3.415194511413574, "grad_norm_var": 0.6304077366129337, "learning_rate": 0.0001, "loss": 1.0916, "loss/crossentropy": 2.23525071144104, "loss/hidden": 0.71484375, "loss/logits": 0.1243971735239029, "loss/reg": 0.025235962122678757, "step": 253 }, { "epoch": 0.03175, "grad_norm": 3.7671289443969727, "grad_norm_var": 0.5838590152321442, "learning_rate": 0.0001, "loss": 0.908, "loss/crossentropy": 2.254054546356201, "loss/hidden": 0.56640625, "loss/logits": 0.0893106684088707, "loss/reg": 0.025228681042790413, "step": 254 }, { "epoch": 0.031875, "grad_norm": 4.164376735687256, "grad_norm_var": 0.5803655311074387, "learning_rate": 0.0001, "loss": 0.9548, "loss/crossentropy": 2.4128520488739014, "loss/hidden": 0.5859375, "loss/logits": 0.11668873578310013, "loss/reg": 0.025221774354577065, "step": 255 }, { "epoch": 0.032, "grad_norm": 3.6412672996520996, "grad_norm_var": 0.5547959425019464, "learning_rate": 0.0001, "loss": 1.0098, "loss/crossentropy": 2.7745320796966553, "loss/hidden": 0.62890625, "loss/logits": 0.12875014543533325, "loss/reg": 0.025214577093720436, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.900871515274048, "grad_norm_var": 0.6261772657264061, "learning_rate": 0.0001, "loss": 0.9348, "loss/crossentropy": 2.286968469619751, "loss/hidden": 0.578125, "loss/logits": 0.10455667227506638, "loss/reg": 0.02520710788667202, "step": 257 }, { "epoch": 0.03225, "grad_norm": 3.6853647232055664, "grad_norm_var": 0.5808564529014478, "learning_rate": 0.0001, "loss": 0.9013, "loss/crossentropy": 2.4515562057495117, "loss/hidden": 0.55078125, "loss/logits": 0.09847953915596008, "loss/reg": 0.025199349969625473, "step": 258 }, { "epoch": 0.032375, "grad_norm": 2.9091956615448, "grad_norm_var": 0.6119059054418109, "learning_rate": 0.0001, "loss": 0.8784, "loss/crossentropy": 2.163628339767456, "loss/hidden": 0.5390625, "loss/logits": 0.08743810653686523, "loss/reg": 0.025191258639097214, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.9488673210144043, "grad_norm_var": 0.6717290813098125, "learning_rate": 0.0001, "loss": 0.9195, "loss/crossentropy": 2.3084585666656494, "loss/hidden": 0.5546875, "loss/logits": 0.11298206448554993, "loss/reg": 0.025183765217661858, "step": 260 }, { "epoch": 0.032625, "grad_norm": 3.4345953464508057, "grad_norm_var": 0.6684943296663647, "learning_rate": 0.0001, "loss": 0.8851, "loss/crossentropy": 2.19558048248291, "loss/hidden": 0.546875, "loss/logits": 0.08649411797523499, "loss/reg": 0.02517561800777912, "step": 261 }, { "epoch": 0.03275, "grad_norm": 28.159074783325195, "grad_norm_var": 37.79188620028727, "learning_rate": 0.0001, "loss": 1.1781, "loss/crossentropy": 2.839057207107544, "loss/hidden": 0.7890625, "loss/logits": 0.13732999563217163, "loss/reg": 0.025168145075440407, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.734104871749878, "grad_norm_var": 38.05849364469687, "learning_rate": 0.0001, "loss": 0.8892, "loss/crossentropy": 2.4754340648651123, "loss/hidden": 0.54296875, "loss/logits": 0.09462890028953552, "loss/reg": 0.025160137563943863, "step": 263 }, { "epoch": 0.033, "grad_norm": 4.089654922485352, "grad_norm_var": 37.922354469790704, "learning_rate": 0.0001, "loss": 0.9326, "loss/crossentropy": 2.476450204849243, "loss/hidden": 0.57421875, "loss/logits": 0.1068512499332428, "loss/reg": 0.025151856243610382, "step": 264 }, { "epoch": 0.033125, "grad_norm": 3.838066577911377, "grad_norm_var": 37.99741003814723, "learning_rate": 0.0001, "loss": 1.0909, "loss/crossentropy": 2.114243268966675, "loss/hidden": 0.734375, "loss/logits": 0.10510236769914627, "loss/reg": 0.025143325328826904, "step": 265 }, { "epoch": 0.03325, "grad_norm": 3.0674874782562256, "grad_norm_var": 38.19410456778619, "learning_rate": 0.0001, "loss": 0.8898, "loss/crossentropy": 2.6426563262939453, "loss/hidden": 0.54296875, "loss/logits": 0.09550425410270691, "loss/reg": 0.025135278701782227, "step": 266 }, { "epoch": 0.033375, "grad_norm": 7.777696132659912, "grad_norm_var": 38.64421623432597, "learning_rate": 0.0001, "loss": 1.0482, "loss/crossentropy": 2.640554666519165, "loss/hidden": 0.6796875, "loss/logits": 0.11728060245513916, "loss/reg": 0.025126684457063675, "step": 267 }, { "epoch": 0.0335, "grad_norm": 3.6375653743743896, "grad_norm_var": 38.64099364344996, "learning_rate": 0.0001, "loss": 0.9888, "loss/crossentropy": 2.227538824081421, "loss/hidden": 0.63671875, "loss/logits": 0.1008879542350769, "loss/reg": 0.0251180287450552, "step": 268 }, { "epoch": 0.033625, "grad_norm": 4.305724620819092, "grad_norm_var": 38.4714335626231, "learning_rate": 0.0001, "loss": 1.1268, "loss/crossentropy": 2.556248903274536, "loss/hidden": 0.73828125, "loss/logits": 0.1374487727880478, "loss/reg": 0.02511041797697544, "step": 269 }, { "epoch": 0.03375, "grad_norm": 3.658052921295166, "grad_norm_var": 38.4947077039297, "learning_rate": 0.0001, "loss": 0.9616, "loss/crossentropy": 2.437307596206665, "loss/hidden": 0.60546875, "loss/logits": 0.1050904244184494, "loss/reg": 0.025102900341153145, "step": 270 }, { "epoch": 0.033875, "grad_norm": 6.534849643707275, "grad_norm_var": 38.483973576312195, "learning_rate": 0.0001, "loss": 1.2119, "loss/crossentropy": 2.36796236038208, "loss/hidden": 0.81640625, "loss/logits": 0.1445278525352478, "loss/reg": 0.025095317512750626, "step": 271 }, { "epoch": 0.034, "grad_norm": 5.486751079559326, "grad_norm_var": 38.24988881420663, "learning_rate": 0.0001, "loss": 1.0337, "loss/crossentropy": 2.517333984375, "loss/hidden": 0.66015625, "loss/logits": 0.12263435125350952, "loss/reg": 0.025087665766477585, "step": 272 }, { "epoch": 0.034125, "grad_norm": 3.231940746307373, "grad_norm_var": 38.13878485092763, "learning_rate": 0.0001, "loss": 1.0586, "loss/crossentropy": 2.6812305450439453, "loss/hidden": 0.67578125, "loss/logits": 0.13204006850719452, "loss/reg": 0.025080092251300812, "step": 273 }, { "epoch": 0.03425, "grad_norm": 4.71685791015625, "grad_norm_var": 37.942827296069424, "learning_rate": 0.0001, "loss": 1.0409, "loss/crossentropy": 2.093151807785034, "loss/hidden": 0.6796875, "loss/logits": 0.11049959808588028, "loss/reg": 0.02507280558347702, "step": 274 }, { "epoch": 0.034375, "grad_norm": 3.7086901664733887, "grad_norm_var": 37.68973967522894, "learning_rate": 0.0001, "loss": 0.9144, "loss/crossentropy": 2.2392518520355225, "loss/hidden": 0.57421875, "loss/logits": 0.08954055607318878, "loss/reg": 0.02506582997739315, "step": 275 }, { "epoch": 0.0345, "grad_norm": 5.537543773651123, "grad_norm_var": 37.1561912525544, "learning_rate": 0.0001, "loss": 1.3586, "loss/crossentropy": 1.8622019290924072, "loss/hidden": 0.96484375, "loss/logits": 0.14318415522575378, "loss/reg": 0.025058824568986893, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.898383617401123, "grad_norm_var": 37.34827444255352, "learning_rate": 0.0001, "loss": 0.9413, "loss/crossentropy": 2.4155869483947754, "loss/hidden": 0.578125, "loss/logits": 0.11263684928417206, "loss/reg": 0.025052132084965706, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.803711175918579, "grad_norm_var": 2.0625830733920933, "learning_rate": 0.0001, "loss": 0.9404, "loss/crossentropy": 2.2058048248291016, "loss/hidden": 0.59765625, "loss/logits": 0.09232598543167114, "loss/reg": 0.02504453808069229, "step": 278 }, { "epoch": 0.034875, "grad_norm": 3.180114507675171, "grad_norm_var": 1.9847680294286107, "learning_rate": 0.0001, "loss": 0.9311, "loss/crossentropy": 2.6069722175598145, "loss/hidden": 0.57421875, "loss/logits": 0.10655493289232254, "loss/reg": 0.02503693662583828, "step": 279 }, { "epoch": 0.035, "grad_norm": 5.609209060668945, "grad_norm_var": 2.0906055341906256, "learning_rate": 0.0001, "loss": 1.0675, "loss/crossentropy": 2.5023670196533203, "loss/hidden": 0.68359375, "loss/logits": 0.13361594080924988, "loss/reg": 0.025029515847563744, "step": 280 }, { "epoch": 0.035125, "grad_norm": 5.165370464324951, "grad_norm_var": 2.105772188928517, "learning_rate": 0.0001, "loss": 1.1103, "loss/crossentropy": 2.5107007026672363, "loss/hidden": 0.73828125, "loss/logits": 0.12183347344398499, "loss/reg": 0.025022249668836594, "step": 281 }, { "epoch": 0.03525, "grad_norm": 4.322115421295166, "grad_norm_var": 1.9716269568170388, "learning_rate": 0.0001, "loss": 0.95, "loss/crossentropy": 2.483142137527466, "loss/hidden": 0.5859375, "loss/logits": 0.11390361189842224, "loss/reg": 0.025014575570821762, "step": 282 }, { "epoch": 0.035375, "grad_norm": 4.432065486907959, "grad_norm_var": 1.2250959918754403, "learning_rate": 0.0001, "loss": 0.9886, "loss/crossentropy": 2.1206424236297607, "loss/hidden": 0.6328125, "loss/logits": 0.10568062961101532, "loss/reg": 0.025007015094161034, "step": 283 }, { "epoch": 0.0355, "grad_norm": 4.0429558753967285, "grad_norm_var": 1.198112283867575, "learning_rate": 0.0001, "loss": 1.0004, "loss/crossentropy": 2.6465060710906982, "loss/hidden": 0.6328125, "loss/logits": 0.11765305697917938, "loss/reg": 0.024997249245643616, "step": 284 }, { "epoch": 0.035625, "grad_norm": 3.551006555557251, "grad_norm_var": 1.23838358717468, "learning_rate": 0.0001, "loss": 0.9764, "loss/crossentropy": 2.4274182319641113, "loss/hidden": 0.61328125, "loss/logits": 0.11320526152849197, "loss/reg": 0.02498740889132023, "step": 285 }, { "epoch": 0.03575, "grad_norm": 2.874697685241699, "grad_norm_var": 1.344305852802292, "learning_rate": 0.0001, "loss": 1.0133, "loss/crossentropy": 2.512948751449585, "loss/hidden": 0.6484375, "loss/logits": 0.11510799080133438, "loss/reg": 0.02497740648686886, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.4613194465637207, "grad_norm_var": 1.0008425760722102, "learning_rate": 0.0001, "loss": 0.9274, "loss/crossentropy": 2.4450602531433105, "loss/hidden": 0.5703125, "loss/logits": 0.10743874311447144, "loss/reg": 0.024968616664409637, "step": 287 }, { "epoch": 0.036, "grad_norm": 3.452150344848633, "grad_norm_var": 0.8735820507410946, "learning_rate": 0.0001, "loss": 0.952, "loss/crossentropy": 2.304729700088501, "loss/hidden": 0.578125, "loss/logits": 0.1242954432964325, "loss/reg": 0.024959923699498177, "step": 288 }, { "epoch": 0.036125, "grad_norm": 3.3603436946868896, "grad_norm_var": 0.8625457550688983, "learning_rate": 0.0001, "loss": 0.9447, "loss/crossentropy": 2.425712823867798, "loss/hidden": 0.58984375, "loss/logits": 0.1053181067109108, "loss/reg": 0.024950530380010605, "step": 289 }, { "epoch": 0.03625, "grad_norm": 3.3667662143707275, "grad_norm_var": 0.8374846368179986, "learning_rate": 0.0001, "loss": 0.9067, "loss/crossentropy": 2.2623162269592285, "loss/hidden": 0.55859375, "loss/logits": 0.098650723695755, "loss/reg": 0.02494119666516781, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.7996857166290283, "grad_norm_var": 0.9075153562133816, "learning_rate": 0.0001, "loss": 0.9594, "loss/crossentropy": 2.370417356491089, "loss/hidden": 0.59765625, "loss/logits": 0.11243726313114166, "loss/reg": 0.024933209642767906, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.9999380111694336, "grad_norm_var": 0.7233017120786666, "learning_rate": 0.0001, "loss": 1.0396, "loss/crossentropy": 2.286776542663574, "loss/hidden": 0.6640625, "loss/logits": 0.12626898288726807, "loss/reg": 0.02492516115307808, "step": 292 }, { "epoch": 0.036625, "grad_norm": 3.880777597427368, "grad_norm_var": 0.685825505757979, "learning_rate": 0.0001, "loss": 0.9764, "loss/crossentropy": 2.613823652267456, "loss/hidden": 0.61328125, "loss/logits": 0.11395551264286041, "loss/reg": 0.02491726726293564, "step": 293 }, { "epoch": 0.03675, "grad_norm": 4.2950568199157715, "grad_norm_var": 0.6453385025094112, "learning_rate": 0.0001, "loss": 1.0535, "loss/crossentropy": 2.6882171630859375, "loss/hidden": 0.66796875, "loss/logits": 0.13644324243068695, "loss/reg": 0.02490835078060627, "step": 294 }, { "epoch": 0.036875, "grad_norm": 3.61423659324646, "grad_norm_var": 0.6212598300915251, "learning_rate": 0.0001, "loss": 1.155, "loss/crossentropy": 2.0867068767547607, "loss/hidden": 0.76953125, "loss/logits": 0.13649392127990723, "loss/reg": 0.024899456650018692, "step": 295 }, { "epoch": 0.037, "grad_norm": 3.9245386123657227, "grad_norm_var": 0.3982568915415859, "learning_rate": 0.0001, "loss": 0.951, "loss/crossentropy": 2.703791618347168, "loss/hidden": 0.5859375, "loss/logits": 0.11614765971899033, "loss/reg": 0.02489159069955349, "step": 296 }, { "epoch": 0.037125, "grad_norm": 4.020047187805176, "grad_norm_var": 0.2597397925732442, "learning_rate": 0.0001, "loss": 0.9707, "loss/crossentropy": 2.429636001586914, "loss/hidden": 0.61328125, "loss/logits": 0.1086028665304184, "loss/reg": 0.024883201345801353, "step": 297 }, { "epoch": 0.03725, "grad_norm": 4.086516857147217, "grad_norm_var": 0.24209119003572066, "learning_rate": 0.0001, "loss": 1.0769, "loss/crossentropy": 2.8581056594848633, "loss/hidden": 0.6796875, "loss/logits": 0.14844708144664764, "loss/reg": 0.024874594062566757, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.956085443496704, "grad_norm_var": 0.22141400399237127, "learning_rate": 0.0001, "loss": 0.9903, "loss/crossentropy": 2.4802041053771973, "loss/hidden": 0.609375, "loss/logits": 0.13230839371681213, "loss/reg": 0.024866636842489243, "step": 299 }, { "epoch": 0.0375, "grad_norm": 3.165804386138916, "grad_norm_var": 0.2110158468875736, "learning_rate": 0.0001, "loss": 1.0347, "loss/crossentropy": 2.4370639324188232, "loss/hidden": 0.66015625, "loss/logits": 0.1259341686964035, "loss/reg": 0.024857714772224426, "step": 300 }, { "epoch": 0.037625, "grad_norm": 3.0942695140838623, "grad_norm_var": 0.22022059823099174, "learning_rate": 0.0001, "loss": 0.9747, "loss/crossentropy": 2.674190044403076, "loss/hidden": 0.6015625, "loss/logits": 0.12467385828495026, "loss/reg": 0.024848785251379013, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.949205160140991, "grad_norm_var": 0.21475779393049, "learning_rate": 0.0001, "loss": 0.8898, "loss/crossentropy": 2.491746664047241, "loss/hidden": 0.55078125, "loss/logits": 0.09065801650285721, "loss/reg": 0.02483983524143696, "step": 302 }, { "epoch": 0.037875, "grad_norm": 3.549431324005127, "grad_norm_var": 0.21520952048913009, "learning_rate": 0.0001, "loss": 0.9197, "loss/crossentropy": 2.7578208446502686, "loss/hidden": 0.57421875, "loss/logits": 0.09712585806846619, "loss/reg": 0.024831857532262802, "step": 303 }, { "epoch": 0.038, "grad_norm": 3.0621931552886963, "grad_norm_var": 0.22562503941355938, "learning_rate": 0.0001, "loss": 0.894, "loss/crossentropy": 2.440351724624634, "loss/hidden": 0.546875, "loss/logits": 0.09888219833374023, "loss/reg": 0.02482294850051403, "step": 304 }, { "epoch": 0.038125, "grad_norm": 3.334942579269409, "grad_norm_var": 0.22595311715915212, "learning_rate": 0.0001, "loss": 1.1509, "loss/crossentropy": 2.378168821334839, "loss/hidden": 0.74609375, "loss/logits": 0.15670377016067505, "loss/reg": 0.024814244359731674, "step": 305 }, { "epoch": 0.03825, "grad_norm": 3.3253893852233887, "grad_norm_var": 0.22648465837488155, "learning_rate": 0.0001, "loss": 0.9744, "loss/crossentropy": 2.4409470558166504, "loss/hidden": 0.6015625, "loss/logits": 0.12483348697423935, "loss/reg": 0.024805361405014992, "step": 306 }, { "epoch": 0.038375, "grad_norm": 3.1687541007995605, "grad_norm_var": 0.20343285009947346, "learning_rate": 0.0001, "loss": 0.8998, "loss/crossentropy": 2.3339829444885254, "loss/hidden": 0.55078125, "loss/logits": 0.10110392421483994, "loss/reg": 0.02479635737836361, "step": 307 }, { "epoch": 0.0385, "grad_norm": 4.096661567687988, "grad_norm_var": 0.21071919009248533, "learning_rate": 0.0001, "loss": 1.0154, "loss/crossentropy": 2.468813180923462, "loss/hidden": 0.64453125, "loss/logits": 0.12295258045196533, "loss/reg": 0.024787236005067825, "step": 308 }, { "epoch": 0.038625, "grad_norm": 3.097642660140991, "grad_norm_var": 0.2127095324618587, "learning_rate": 0.0001, "loss": 0.8772, "loss/crossentropy": 2.5380513668060303, "loss/hidden": 0.53125, "loss/logits": 0.09821398556232452, "loss/reg": 0.024777989834547043, "step": 309 }, { "epoch": 0.03875, "grad_norm": 4.647727012634277, "grad_norm_var": 0.25863060133758775, "learning_rate": 0.0001, "loss": 0.9162, "loss/crossentropy": 2.0996336936950684, "loss/hidden": 0.5625, "loss/logits": 0.10599061101675034, "loss/reg": 0.024768849834799767, "step": 310 }, { "epoch": 0.038875, "grad_norm": 3.627246141433716, "grad_norm_var": 0.25882920418562944, "learning_rate": 0.0001, "loss": 1.0161, "loss/crossentropy": 2.2958993911743164, "loss/hidden": 0.65234375, "loss/logits": 0.11611323803663254, "loss/reg": 0.024759870022535324, "step": 311 }, { "epoch": 0.039, "grad_norm": 3.7236764430999756, "grad_norm_var": 0.2501591619921593, "learning_rate": 0.0001, "loss": 1.1308, "loss/crossentropy": 2.4322752952575684, "loss/hidden": 0.7421875, "loss/logits": 0.1410805881023407, "loss/reg": 0.024751078337430954, "step": 312 }, { "epoch": 0.039125, "grad_norm": 3.998807430267334, "grad_norm_var": 0.24869789076210413, "learning_rate": 0.0001, "loss": 1.0749, "loss/crossentropy": 2.374758720397949, "loss/hidden": 0.703125, "loss/logits": 0.1243831142783165, "loss/reg": 0.024742012843489647, "step": 313 }, { "epoch": 0.03925, "grad_norm": 4.315618515014648, "grad_norm_var": 0.2701154191311919, "learning_rate": 0.0001, "loss": 0.9542, "loss/crossentropy": 2.290766477584839, "loss/hidden": 0.60546875, "loss/logits": 0.10138154029846191, "loss/reg": 0.02473386563360691, "step": 314 }, { "epoch": 0.039375, "grad_norm": 11.789870262145996, "grad_norm_var": 4.498354875640615, "learning_rate": 0.0001, "loss": 1.6276, "loss/crossentropy": 2.391585350036621, "loss/hidden": 1.25, "loss/logits": 0.13034726679325104, "loss/reg": 0.02472575195133686, "step": 315 }, { "epoch": 0.0395, "grad_norm": 3.499288320541382, "grad_norm_var": 4.465581075155145, "learning_rate": 0.0001, "loss": 0.9478, "loss/crossentropy": 2.4527926445007324, "loss/hidden": 0.5703125, "loss/logits": 0.13031822443008423, "loss/reg": 0.024717645719647408, "step": 316 }, { "epoch": 0.039625, "grad_norm": 3.0139851570129395, "grad_norm_var": 4.476536239649594, "learning_rate": 0.0001, "loss": 0.9555, "loss/crossentropy": 2.506457805633545, "loss/hidden": 0.58203125, "loss/logits": 0.12640972435474396, "loss/reg": 0.02470862865447998, "step": 317 }, { "epoch": 0.03975, "grad_norm": 3.671523094177246, "grad_norm_var": 4.4007183053584145, "learning_rate": 0.0001, "loss": 1.0902, "loss/crossentropy": 2.1730313301086426, "loss/hidden": 0.7109375, "loss/logits": 0.13228294253349304, "loss/reg": 0.024700626730918884, "step": 318 }, { "epoch": 0.039875, "grad_norm": 3.453159809112549, "grad_norm_var": 4.408623714873802, "learning_rate": 0.0001, "loss": 0.9518, "loss/crossentropy": 2.7452075481414795, "loss/hidden": 0.58984375, "loss/logits": 0.11500866711139679, "loss/reg": 0.02469259686768055, "step": 319 }, { "epoch": 0.04, "grad_norm": 8.18558406829834, "grad_norm_var": 5.330579476502794, "learning_rate": 0.0001, "loss": 1.2593, "loss/crossentropy": 2.295231819152832, "loss/hidden": 0.78125, "loss/logits": 0.2312176525592804, "loss/reg": 0.024684064090251923, "step": 320 }, { "epoch": 0.040125, "grad_norm": 4.58513879776001, "grad_norm_var": 5.245000173569268, "learning_rate": 0.0001, "loss": 1.0428, "loss/crossentropy": 2.3456668853759766, "loss/hidden": 0.67578125, "loss/logits": 0.12027865648269653, "loss/reg": 0.024676108732819557, "step": 321 }, { "epoch": 0.04025, "grad_norm": 3.6148953437805176, "grad_norm_var": 5.20441494141252, "learning_rate": 0.0001, "loss": 0.885, "loss/crossentropy": 2.3615198135375977, "loss/hidden": 0.53515625, "loss/logits": 0.10311760008335114, "loss/reg": 0.02466769702732563, "step": 322 }, { "epoch": 0.040375, "grad_norm": 4.596187114715576, "grad_norm_var": 5.072570501388933, "learning_rate": 0.0001, "loss": 1.3642, "loss/crossentropy": 1.9407044649124146, "loss/hidden": 0.9140625, "loss/logits": 0.20351722836494446, "loss/reg": 0.02465960383415222, "step": 323 }, { "epoch": 0.0405, "grad_norm": 4.305622577667236, "grad_norm_var": 5.060723771971758, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.1887283325195312, "loss/hidden": 0.83203125, "loss/logits": 0.20682212710380554, "loss/reg": 0.024651024490594864, "step": 324 }, { "epoch": 0.040625, "grad_norm": 3.1052956581115723, "grad_norm_var": 5.059160883569213, "learning_rate": 0.0001, "loss": 0.9277, "loss/crossentropy": 2.7552340030670166, "loss/hidden": 0.56640625, "loss/logits": 0.11491444706916809, "loss/reg": 0.024642454460263252, "step": 325 }, { "epoch": 0.04075, "grad_norm": 3.902174234390259, "grad_norm_var": 5.092472426369553, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.328878402709961, "loss/hidden": 0.70703125, "loss/logits": 0.13887017965316772, "loss/reg": 0.024634363129734993, "step": 326 }, { "epoch": 0.040875, "grad_norm": 3.144326686859131, "grad_norm_var": 5.168830163995767, "learning_rate": 0.0001, "loss": 0.9028, "loss/crossentropy": 2.765711545944214, "loss/hidden": 0.55078125, "loss/logits": 0.10574661940336227, "loss/reg": 0.02462565153837204, "step": 327 }, { "epoch": 0.041, "grad_norm": 4.085259914398193, "grad_norm_var": 5.136846736797656, "learning_rate": 0.0001, "loss": 1.1366, "loss/crossentropy": 2.2278430461883545, "loss/hidden": 0.75390625, "loss/logits": 0.13655412197113037, "loss/reg": 0.0246175117790699, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.578554391860962, "grad_norm_var": 5.180404969237465, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.7795305252075195, "loss/hidden": 0.640625, "loss/logits": 0.13600832223892212, "loss/reg": 0.024609515443444252, "step": 329 }, { "epoch": 0.04125, "grad_norm": 3.7418434619903564, "grad_norm_var": 5.219134310055354, "learning_rate": 0.0001, "loss": 1.0244, "loss/crossentropy": 2.636258840560913, "loss/hidden": 0.63671875, "loss/logits": 0.1417178213596344, "loss/reg": 0.024600951001048088, "step": 330 }, { "epoch": 0.041375, "grad_norm": 3.369840383529663, "grad_norm_var": 1.4852025101017772, "learning_rate": 0.0001, "loss": 1.0208, "loss/crossentropy": 2.448961019515991, "loss/hidden": 0.6640625, "loss/logits": 0.1107659712433815, "loss/reg": 0.024592852219939232, "step": 331 }, { "epoch": 0.0415, "grad_norm": 2.90629506111145, "grad_norm_var": 1.5460412234751968, "learning_rate": 0.0001, "loss": 0.9411, "loss/crossentropy": 2.595834732055664, "loss/hidden": 0.59375, "loss/logits": 0.10147438943386078, "loss/reg": 0.024584442377090454, "step": 332 }, { "epoch": 0.041625, "grad_norm": 3.402386426925659, "grad_norm_var": 1.5068032644485272, "learning_rate": 0.0001, "loss": 1.0144, "loss/crossentropy": 2.363499402999878, "loss/hidden": 0.62890625, "loss/logits": 0.1397327035665512, "loss/reg": 0.02457563206553459, "step": 333 }, { "epoch": 0.04175, "grad_norm": 3.597465753555298, "grad_norm_var": 1.5101723473758888, "learning_rate": 0.0001, "loss": 1.1917, "loss/crossentropy": 2.3256943225860596, "loss/hidden": 0.7890625, "loss/logits": 0.15691371262073517, "loss/reg": 0.02456764318048954, "step": 334 }, { "epoch": 0.041875, "grad_norm": 3.512730121612549, "grad_norm_var": 1.506262203991567, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.383385181427002, "loss/hidden": 0.69140625, "loss/logits": 0.12608817219734192, "loss/reg": 0.024559510871767998, "step": 335 }, { "epoch": 0.042, "grad_norm": 5.968827247619629, "grad_norm_var": 0.5694964439723395, "learning_rate": 0.0001, "loss": 1.258, "loss/crossentropy": 2.212991952896118, "loss/hidden": 0.828125, "loss/logits": 0.18438610434532166, "loss/reg": 0.02455153875052929, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.644115924835205, "grad_norm_var": 0.5311677507970897, "learning_rate": 0.0001, "loss": 1.1203, "loss/crossentropy": 2.3620948791503906, "loss/hidden": 0.72265625, "loss/logits": 0.15219135582447052, "loss/reg": 0.02454366721212864, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.716367244720459, "grad_norm_var": 0.6013761572733584, "learning_rate": 0.0001, "loss": 0.919, "loss/crossentropy": 2.3863117694854736, "loss/hidden": 0.578125, "loss/logits": 0.0955524817109108, "loss/reg": 0.024536145851016045, "step": 338 }, { "epoch": 0.042375, "grad_norm": 3.8117287158966064, "grad_norm_var": 0.5485673092684531, "learning_rate": 0.0001, "loss": 1.0529, "loss/crossentropy": 2.4951469898223877, "loss/hidden": 0.67578125, "loss/logits": 0.1318708062171936, "loss/reg": 0.024528514593839645, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.5218894481658936, "grad_norm_var": 0.5973356289049185, "learning_rate": 0.0001, "loss": 0.9673, "loss/crossentropy": 2.370839834213257, "loss/hidden": 0.6015625, "loss/logits": 0.12049120664596558, "loss/reg": 0.02452007494866848, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.7343108654022217, "grad_norm_var": 0.6285810690168129, "learning_rate": 0.0001, "loss": 1.1201, "loss/crossentropy": 2.2990617752075195, "loss/hidden": 0.7109375, "loss/logits": 0.16403642296791077, "loss/reg": 0.024511409923434258, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.5267186164855957, "grad_norm_var": 0.6803812464423664, "learning_rate": 0.0001, "loss": 0.9056, "loss/crossentropy": 2.5629754066467285, "loss/hidden": 0.5546875, "loss/logits": 0.10588675737380981, "loss/reg": 0.024503152817487717, "step": 342 }, { "epoch": 0.042875, "grad_norm": 2.2731988430023193, "grad_norm_var": 0.7637691760365584, "learning_rate": 0.0001, "loss": 0.8154, "loss/crossentropy": 2.4123644828796387, "loss/hidden": 0.490234375, "loss/logits": 0.08022630214691162, "loss/reg": 0.024494923651218414, "step": 343 }, { "epoch": 0.043, "grad_norm": 3.2791945934295654, "grad_norm_var": 0.7306725618305314, "learning_rate": 0.0001, "loss": 0.9879, "loss/crossentropy": 2.7643816471099854, "loss/hidden": 0.6171875, "loss/logits": 0.12586694955825806, "loss/reg": 0.024486759677529335, "step": 344 }, { "epoch": 0.043125, "grad_norm": 3.6740994453430176, "grad_norm_var": 0.7341663188433093, "learning_rate": 0.0001, "loss": 0.8947, "loss/crossentropy": 2.6802780628204346, "loss/hidden": 0.5546875, "loss/logits": 0.0951782613992691, "loss/reg": 0.024478696286678314, "step": 345 }, { "epoch": 0.04325, "grad_norm": 3.234722137451172, "grad_norm_var": 0.7240869727338347, "learning_rate": 0.0001, "loss": 0.8517, "loss/crossentropy": 2.577921152114868, "loss/hidden": 0.515625, "loss/logits": 0.09137749671936035, "loss/reg": 0.024470962584018707, "step": 346 }, { "epoch": 0.043375, "grad_norm": 4.024074554443359, "grad_norm_var": 0.7548921970504276, "learning_rate": 0.0001, "loss": 1.0017, "loss/crossentropy": 2.8129634857177734, "loss/hidden": 0.63671875, "loss/logits": 0.120377317070961, "loss/reg": 0.02446298860013485, "step": 347 }, { "epoch": 0.0435, "grad_norm": 3.4327027797698975, "grad_norm_var": 0.7400679146500114, "learning_rate": 0.0001, "loss": 0.9864, "loss/crossentropy": 2.4310224056243896, "loss/hidden": 0.640625, "loss/logits": 0.10121208429336548, "loss/reg": 0.0244552381336689, "step": 348 }, { "epoch": 0.043625, "grad_norm": 3.2115890979766846, "grad_norm_var": 0.7422101391295163, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.3658318519592285, "loss/hidden": 0.6796875, "loss/logits": 0.13528358936309814, "loss/reg": 0.02444704994559288, "step": 349 }, { "epoch": 0.04375, "grad_norm": 3.4005136489868164, "grad_norm_var": 0.739061242813568, "learning_rate": 0.0001, "loss": 1.0154, "loss/crossentropy": 2.9171154499053955, "loss/hidden": 0.6171875, "loss/logits": 0.15386469662189484, "loss/reg": 0.024438532069325447, "step": 350 }, { "epoch": 0.043875, "grad_norm": 4.1175127029418945, "grad_norm_var": 0.7731950105324129, "learning_rate": 0.0001, "loss": 0.949, "loss/crossentropy": 2.3405494689941406, "loss/hidden": 0.59765625, "loss/logits": 0.10704682767391205, "loss/reg": 0.024429937824606895, "step": 351 }, { "epoch": 0.044, "grad_norm": 3.9496090412139893, "grad_norm_var": 0.33930652052577653, "learning_rate": 0.0001, "loss": 0.889, "loss/crossentropy": 2.9135613441467285, "loss/hidden": 0.54296875, "loss/logits": 0.1018136739730835, "loss/reg": 0.024421829730272293, "step": 352 }, { "epoch": 0.044125, "grad_norm": 3.794520139694214, "grad_norm_var": 0.347931624130162, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.5529844760894775, "loss/hidden": 0.75390625, "loss/logits": 0.17909468710422516, "loss/reg": 0.02441396936774254, "step": 353 }, { "epoch": 0.04425, "grad_norm": 3.164349317550659, "grad_norm_var": 0.3259767305032634, "learning_rate": 0.0001, "loss": 1.0804, "loss/crossentropy": 2.3843231201171875, "loss/hidden": 0.71484375, "loss/logits": 0.12148329615592957, "loss/reg": 0.02440580353140831, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.7847914695739746, "grad_norm_var": 0.32482231475126633, "learning_rate": 0.0001, "loss": 1.0463, "loss/crossentropy": 2.521649122238159, "loss/hidden": 0.68359375, "loss/logits": 0.1187494546175003, "loss/reg": 0.024397339671850204, "step": 355 }, { "epoch": 0.0445, "grad_norm": 3.2391860485076904, "grad_norm_var": 0.28660331114573767, "learning_rate": 0.0001, "loss": 1.0225, "loss/crossentropy": 2.4073832035064697, "loss/hidden": 0.640625, "loss/logits": 0.13795481622219086, "loss/reg": 0.024389205500483513, "step": 356 }, { "epoch": 0.044625, "grad_norm": 5.839352130889893, "grad_norm_var": 0.6539216724231817, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.6860220432281494, "loss/hidden": 0.7265625, "loss/logits": 0.13543812930583954, "loss/reg": 0.02438061311841011, "step": 357 }, { "epoch": 0.04475, "grad_norm": 4.340758800506592, "grad_norm_var": 0.6249977794062299, "learning_rate": 0.0001, "loss": 1.1, "loss/crossentropy": 2.770012378692627, "loss/hidden": 0.703125, "loss/logits": 0.1531440019607544, "loss/reg": 0.024372335523366928, "step": 358 }, { "epoch": 0.044875, "grad_norm": 8.795727729797363, "grad_norm_var": 2.1213731683569863, "learning_rate": 0.0001, "loss": 1.3149, "loss/crossentropy": 2.2893428802490234, "loss/hidden": 0.91796875, "loss/logits": 0.15326841175556183, "loss/reg": 0.02436378225684166, "step": 359 }, { "epoch": 0.045, "grad_norm": 4.104447841644287, "grad_norm_var": 2.0826812332104394, "learning_rate": 0.0001, "loss": 1.15, "loss/crossentropy": 2.625378370285034, "loss/hidden": 0.7578125, "loss/logits": 0.14858195185661316, "loss/reg": 0.024355949833989143, "step": 360 }, { "epoch": 0.045125, "grad_norm": 3.354828357696533, "grad_norm_var": 2.105873348197963, "learning_rate": 0.0001, "loss": 0.9494, "loss/crossentropy": 2.4916443824768066, "loss/hidden": 0.5859375, "loss/logits": 0.11993111670017242, "loss/reg": 0.024348480626940727, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.3247056007385254, "grad_norm_var": 2.096606359520402, "learning_rate": 0.0001, "loss": 0.9604, "loss/crossentropy": 2.282543420791626, "loss/hidden": 0.609375, "loss/logits": 0.10765975713729858, "loss/reg": 0.02434113249182701, "step": 362 }, { "epoch": 0.045375, "grad_norm": 4.314214706420898, "grad_norm_var": 2.1006745469652883, "learning_rate": 0.0001, "loss": 1.0684, "loss/crossentropy": 2.447218179702759, "loss/hidden": 0.6953125, "loss/logits": 0.12974585592746735, "loss/reg": 0.024332784116268158, "step": 363 }, { "epoch": 0.0455, "grad_norm": 3.6783666610717773, "grad_norm_var": 2.083471757970481, "learning_rate": 0.0001, "loss": 1.1509, "loss/crossentropy": 2.6020665168762207, "loss/hidden": 0.75, "loss/logits": 0.1576002687215805, "loss/reg": 0.024325383827090263, "step": 364 }, { "epoch": 0.045625, "grad_norm": 3.535550832748413, "grad_norm_var": 2.0521572529950523, "learning_rate": 0.0001, "loss": 0.9672, "loss/crossentropy": 2.2514841556549072, "loss/hidden": 0.61328125, "loss/logits": 0.110772505402565, "loss/reg": 0.024317855015397072, "step": 365 }, { "epoch": 0.04575, "grad_norm": 6.190865993499756, "grad_norm_var": 2.275325586048537, "learning_rate": 0.0001, "loss": 1.2696, "loss/crossentropy": 2.5250887870788574, "loss/hidden": 0.87890625, "loss/logits": 0.1475657969713211, "loss/reg": 0.02430957928299904, "step": 366 }, { "epoch": 0.045875, "grad_norm": 6.9109907150268555, "grad_norm_var": 2.701389202772653, "learning_rate": 0.0001, "loss": 1.42, "loss/crossentropy": 2.0714285373687744, "loss/hidden": 0.98828125, "loss/logits": 0.18868008255958557, "loss/reg": 0.024301210418343544, "step": 367 }, { "epoch": 0.046, "grad_norm": 3.939924955368042, "grad_norm_var": 2.702051041555256, "learning_rate": 0.0001, "loss": 1.0163, "loss/crossentropy": 2.677281379699707, "loss/hidden": 0.640625, "loss/logits": 0.1327013224363327, "loss/reg": 0.024292904883623123, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.942032814025879, "grad_norm_var": 2.822776844100568, "learning_rate": 0.0001, "loss": 0.9582, "loss/crossentropy": 2.3630495071411133, "loss/hidden": 0.6015625, "loss/logits": 0.11380600929260254, "loss/reg": 0.02428455464541912, "step": 369 }, { "epoch": 0.04625, "grad_norm": 3.125349521636963, "grad_norm_var": 2.8293167859701627, "learning_rate": 0.0001, "loss": 1.0463, "loss/crossentropy": 2.5113635063171387, "loss/hidden": 0.67578125, "loss/logits": 0.12774598598480225, "loss/reg": 0.02427608147263527, "step": 370 }, { "epoch": 0.046375, "grad_norm": 3.15159273147583, "grad_norm_var": 2.758666518773039, "learning_rate": 0.0001, "loss": 0.9198, "loss/crossentropy": 2.31132435798645, "loss/hidden": 0.5703125, "loss/logits": 0.10678299516439438, "loss/reg": 0.024267377331852913, "step": 371 }, { "epoch": 0.0465, "grad_norm": 4.394677639007568, "grad_norm_var": 2.6595375525429406, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.4987642765045166, "loss/hidden": 0.6328125, "loss/logits": 0.11996030062437057, "loss/reg": 0.0242580845952034, "step": 372 }, { "epoch": 0.046625, "grad_norm": 3.7915477752685547, "grad_norm_var": 2.5549678839666425, "learning_rate": 0.0001, "loss": 0.9193, "loss/crossentropy": 2.3996403217315674, "loss/hidden": 0.578125, "loss/logits": 0.09870465099811554, "loss/reg": 0.02424856275320053, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.833364725112915, "grad_norm_var": 2.7025530371611066, "learning_rate": 0.0001, "loss": 0.9417, "loss/crossentropy": 2.336583137512207, "loss/hidden": 0.59765625, "loss/logits": 0.10166990756988525, "loss/reg": 0.024240419268608093, "step": 374 }, { "epoch": 0.046875, "grad_norm": 5.302695274353027, "grad_norm_var": 1.359315799583595, "learning_rate": 0.0001, "loss": 0.9495, "loss/crossentropy": 2.5733697414398193, "loss/hidden": 0.578125, "loss/logits": 0.1290503740310669, "loss/reg": 0.0242319293320179, "step": 375 }, { "epoch": 0.047, "grad_norm": 5.087683200836182, "grad_norm_var": 1.426096117003745, "learning_rate": 0.0001, "loss": 0.9067, "loss/crossentropy": 2.548323631286621, "loss/hidden": 0.56640625, "loss/logits": 0.09806410223245621, "loss/reg": 0.02422359585762024, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.840883255004883, "grad_norm_var": 1.4948607984557458, "learning_rate": 0.0001, "loss": 1.0234, "loss/crossentropy": 2.484494209289551, "loss/hidden": 0.65234375, "loss/logits": 0.12894591689109802, "loss/reg": 0.024214565753936768, "step": 377 }, { "epoch": 0.04725, "grad_norm": 6.647733688354492, "grad_norm_var": 1.848030946106532, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.832048177719116, "loss/hidden": 0.7421875, "loss/logits": 0.14676988124847412, "loss/reg": 0.024205682799220085, "step": 378 }, { "epoch": 0.047375, "grad_norm": 3.465564727783203, "grad_norm_var": 1.8906396391038638, "learning_rate": 0.0001, "loss": 1.0051, "loss/crossentropy": 2.5129964351654053, "loss/hidden": 0.62890625, "loss/logits": 0.134174644947052, "loss/reg": 0.024197354912757874, "step": 379 }, { "epoch": 0.0475, "grad_norm": 3.6985855102539062, "grad_norm_var": 1.8891513099755568, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.2384328842163086, "loss/hidden": 1.0, "loss/logits": 0.20027095079421997, "loss/reg": 0.02418883889913559, "step": 380 }, { "epoch": 0.047625, "grad_norm": 3.4343388080596924, "grad_norm_var": 1.8993141107729303, "learning_rate": 0.0001, "loss": 0.8648, "loss/crossentropy": 2.753706932067871, "loss/hidden": 0.5234375, "loss/logits": 0.09959565848112106, "loss/reg": 0.024180689826607704, "step": 381 }, { "epoch": 0.04775, "grad_norm": 3.418168067932129, "grad_norm_var": 1.6566847859375398, "learning_rate": 0.0001, "loss": 1.0532, "loss/crossentropy": 2.1205835342407227, "loss/hidden": 0.69921875, "loss/logits": 0.11221310496330261, "loss/reg": 0.02417258359491825, "step": 382 }, { "epoch": 0.047875, "grad_norm": 3.3881125450134277, "grad_norm_var": 1.093930487598201, "learning_rate": 0.0001, "loss": 0.9968, "loss/crossentropy": 2.540379285812378, "loss/hidden": 0.62890625, "loss/logits": 0.1262589991092682, "loss/reg": 0.024164721369743347, "step": 383 }, { "epoch": 0.048, "grad_norm": 3.142486095428467, "grad_norm_var": 1.1231981378319982, "learning_rate": 0.0001, "loss": 0.9186, "loss/crossentropy": 2.5265657901763916, "loss/hidden": 0.58203125, "loss/logits": 0.0949624702334404, "loss/reg": 0.02415630966424942, "step": 384 }, { "epoch": 0.048125, "grad_norm": 7.363763332366943, "grad_norm_var": 1.8443340238906671, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.442409038543701, "loss/hidden": 0.8046875, "loss/logits": 0.12408800423145294, "loss/reg": 0.024148130789399147, "step": 385 }, { "epoch": 0.04825, "grad_norm": 3.801536798477173, "grad_norm_var": 1.7879312710551798, "learning_rate": 0.0001, "loss": 1.0129, "loss/crossentropy": 2.461942434310913, "loss/hidden": 0.640625, "loss/logits": 0.13092291355133057, "loss/reg": 0.024139659479260445, "step": 386 }, { "epoch": 0.048375, "grad_norm": 4.24082612991333, "grad_norm_var": 1.7228677295443293, "learning_rate": 0.0001, "loss": 1.1161, "loss/crossentropy": 2.2889018058776855, "loss/hidden": 0.74609375, "loss/logits": 0.12870003283023834, "loss/reg": 0.024130841717123985, "step": 387 }, { "epoch": 0.0485, "grad_norm": 4.190494060516357, "grad_norm_var": 1.7195812284180172, "learning_rate": 0.0001, "loss": 1.1649, "loss/crossentropy": 2.4475483894348145, "loss/hidden": 0.76171875, "loss/logits": 0.16198021173477173, "loss/reg": 0.024122456088662148, "step": 388 }, { "epoch": 0.048625, "grad_norm": 15.206843376159668, "grad_norm_var": 9.294742605150057, "learning_rate": 0.0001, "loss": 1.0606, "loss/crossentropy": 2.3164589405059814, "loss/hidden": 0.703125, "loss/logits": 0.116313636302948, "loss/reg": 0.024113710969686508, "step": 389 }, { "epoch": 0.04875, "grad_norm": 10.421854972839355, "grad_norm_var": 10.824103712955843, "learning_rate": 0.0001, "loss": 1.0638, "loss/crossentropy": 2.1764075756073, "loss/hidden": 0.70703125, "loss/logits": 0.11567908525466919, "loss/reg": 0.024104835465550423, "step": 390 }, { "epoch": 0.048875, "grad_norm": 4.923640727996826, "grad_norm_var": 10.83563756748113, "learning_rate": 0.0001, "loss": 0.9465, "loss/crossentropy": 2.6762068271636963, "loss/hidden": 0.6015625, "loss/logits": 0.10399520397186279, "loss/reg": 0.02409605123102665, "step": 391 }, { "epoch": 0.049, "grad_norm": 4.840577125549316, "grad_norm_var": 10.847422220224528, "learning_rate": 0.0001, "loss": 1.2037, "loss/crossentropy": 2.482144832611084, "loss/hidden": 0.8125, "loss/logits": 0.15031081438064575, "loss/reg": 0.02408732660114765, "step": 392 }, { "epoch": 0.049125, "grad_norm": 2.9535470008850098, "grad_norm_var": 10.811063470934846, "learning_rate": 0.0001, "loss": 1.0811, "loss/crossentropy": 2.5998470783233643, "loss/hidden": 0.71484375, "loss/logits": 0.12545132637023926, "loss/reg": 0.024078134447336197, "step": 393 }, { "epoch": 0.04925, "grad_norm": 4.08555793762207, "grad_norm_var": 10.768160950066, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.611746072769165, "loss/hidden": 0.90625, "loss/logits": 0.17412351071834564, "loss/reg": 0.02406897209584713, "step": 394 }, { "epoch": 0.049375, "grad_norm": 4.730119705200195, "grad_norm_var": 10.582242923890295, "learning_rate": 0.0001, "loss": 0.9463, "loss/crossentropy": 2.5379066467285156, "loss/hidden": 0.59765625, "loss/logits": 0.10806328058242798, "loss/reg": 0.024059420451521873, "step": 395 }, { "epoch": 0.0495, "grad_norm": 4.012064456939697, "grad_norm_var": 10.523956759484621, "learning_rate": 0.0001, "loss": 0.9181, "loss/crossentropy": 2.567375898361206, "loss/hidden": 0.57421875, "loss/logits": 0.10337453335523605, "loss/reg": 0.024049852043390274, "step": 396 }, { "epoch": 0.049625, "grad_norm": 4.57706880569458, "grad_norm_var": 10.32746400090784, "learning_rate": 0.0001, "loss": 1.051, "loss/crossentropy": 2.6955151557922363, "loss/hidden": 0.6875, "loss/logits": 0.12309969961643219, "loss/reg": 0.02403969317674637, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.7931346893310547, "grad_norm_var": 10.511295288820635, "learning_rate": 0.0001, "loss": 1.0373, "loss/crossentropy": 2.34134578704834, "loss/hidden": 0.66796875, "loss/logits": 0.12901151180267334, "loss/reg": 0.02403116784989834, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.4789700508117676, "grad_norm_var": 10.793738555266915, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.1852893829345703, "loss/hidden": 0.75390625, "loss/logits": 0.15216518938541412, "loss/reg": 0.024022690951824188, "step": 399 }, { "epoch": 0.05, "grad_norm": 5.340605735778809, "grad_norm_var": 10.482396698239997, "learning_rate": 0.0001, "loss": 1.2874, "loss/crossentropy": 2.4223577976226807, "loss/hidden": 0.91015625, "loss/logits": 0.13714072108268738, "loss/reg": 0.024014031514525414, "step": 400 }, { "epoch": 0.050125, "grad_norm": 5.600348949432373, "grad_norm_var": 10.208567826877847, "learning_rate": 0.0001, "loss": 0.9905, "loss/crossentropy": 2.494180917739868, "loss/hidden": 0.64453125, "loss/logits": 0.10594967007637024, "loss/reg": 0.024006787687540054, "step": 401 }, { "epoch": 0.05025, "grad_norm": 5.094600677490234, "grad_norm_var": 10.061216488426146, "learning_rate": 0.0001, "loss": 1.2828, "loss/crossentropy": 2.1899051666259766, "loss/hidden": 0.89453125, "loss/logits": 0.1482730507850647, "loss/reg": 0.02399739809334278, "step": 402 }, { "epoch": 0.050375, "grad_norm": 3.915607452392578, "grad_norm_var": 10.115626051260246, "learning_rate": 0.0001, "loss": 1.0375, "loss/crossentropy": 2.790240526199341, "loss/hidden": 0.65625, "loss/logits": 0.14136558771133423, "loss/reg": 0.02398892305791378, "step": 403 }, { "epoch": 0.0505, "grad_norm": 70.49039459228516, "grad_norm_var": 274.8357269833382, "learning_rate": 0.0001, "loss": 1.1534, "loss/crossentropy": 2.2490358352661133, "loss/hidden": 0.79296875, "loss/logits": 0.12061962485313416, "loss/reg": 0.02398114837706089, "step": 404 }, { "epoch": 0.050625, "grad_norm": 6.5985307693481445, "grad_norm_var": 272.87861181728414, "learning_rate": 0.0001, "loss": 1.1479, "loss/crossentropy": 2.856632709503174, "loss/hidden": 0.78515625, "loss/logits": 0.12297768890857697, "loss/reg": 0.0239717997610569, "step": 405 }, { "epoch": 0.05075, "grad_norm": 4.041878700256348, "grad_norm_var": 274.1523084795167, "learning_rate": 0.0001, "loss": 1.1408, "loss/crossentropy": 2.5744457244873047, "loss/hidden": 0.76171875, "loss/logits": 0.13939061760902405, "loss/reg": 0.02396412193775177, "step": 406 }, { "epoch": 0.050875, "grad_norm": 3.5284969806671143, "grad_norm_var": 274.94477307618513, "learning_rate": 0.0001, "loss": 1.0838, "loss/crossentropy": 2.7705729007720947, "loss/hidden": 0.72265625, "loss/logits": 0.12160193920135498, "loss/reg": 0.023956267163157463, "step": 407 }, { "epoch": 0.051, "grad_norm": 3.867558240890503, "grad_norm_var": 275.4712566581114, "learning_rate": 0.0001, "loss": 1.1238, "loss/crossentropy": 2.2807302474975586, "loss/hidden": 0.75390625, "loss/logits": 0.1304117739200592, "loss/reg": 0.023948216810822487, "step": 408 }, { "epoch": 0.051125, "grad_norm": 4.9265217781066895, "grad_norm_var": 274.2865770164501, "learning_rate": 0.0001, "loss": 0.955, "loss/crossentropy": 2.6176843643188477, "loss/hidden": 0.62890625, "loss/logits": 0.08674542605876923, "loss/reg": 0.02393944188952446, "step": 409 }, { "epoch": 0.05125, "grad_norm": 3.1925463676452637, "grad_norm_var": 274.86264478448203, "learning_rate": 0.0001, "loss": 1.0329, "loss/crossentropy": 2.790286064147949, "loss/hidden": 0.6796875, "loss/logits": 0.11385629326105118, "loss/reg": 0.02393159456551075, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.6993377208709717, "grad_norm_var": 276.12743945534294, "learning_rate": 0.0001, "loss": 0.9268, "loss/crossentropy": 2.5643763542175293, "loss/hidden": 0.58203125, "loss/logits": 0.10557639598846436, "loss/reg": 0.0239238403737545, "step": 411 }, { "epoch": 0.0515, "grad_norm": 4.7841315269470215, "grad_norm_var": 275.72098389943244, "learning_rate": 0.0001, "loss": 1.3399, "loss/crossentropy": 2.3737518787384033, "loss/hidden": 0.89453125, "loss/logits": 0.2062419056892395, "loss/reg": 0.023915138095617294, "step": 412 }, { "epoch": 0.051625, "grad_norm": 2.8743667602539062, "grad_norm_var": 276.7634192046356, "learning_rate": 0.0001, "loss": 1.105, "loss/crossentropy": 2.225154399871826, "loss/hidden": 0.734375, "loss/logits": 0.13158033788204193, "loss/reg": 0.023907041177153587, "step": 413 }, { "epoch": 0.05175, "grad_norm": 4.873403072357178, "grad_norm_var": 275.51638736026473, "learning_rate": 0.0001, "loss": 1.1328, "loss/crossentropy": 2.444675922393799, "loss/hidden": 0.734375, "loss/logits": 0.15947584807872772, "loss/reg": 0.023899447172880173, "step": 414 }, { "epoch": 0.051875, "grad_norm": 3.6676955223083496, "grad_norm_var": 274.6671585398764, "learning_rate": 0.0001, "loss": 0.9504, "loss/crossentropy": 2.6828341484069824, "loss/hidden": 0.6015625, "loss/logits": 0.10996139049530029, "loss/reg": 0.023892199620604515, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.703623056411743, "grad_norm_var": 276.2015243387772, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.4529809951782227, "loss/hidden": 0.6796875, "loss/logits": 0.1305149495601654, "loss/reg": 0.023883724585175514, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.9288852214813232, "grad_norm_var": 277.61048629826035, "learning_rate": 0.0001, "loss": 1.0151, "loss/crossentropy": 2.6102705001831055, "loss/hidden": 0.640625, "loss/logits": 0.13567429780960083, "loss/reg": 0.023875238373875618, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.5006144046783447, "grad_norm_var": 279.0831974622093, "learning_rate": 0.0001, "loss": 1.0571, "loss/crossentropy": 2.457202672958374, "loss/hidden": 0.69140625, "loss/logits": 0.12700514495372772, "loss/reg": 0.023867420852184296, "step": 418 }, { "epoch": 0.052375, "grad_norm": 5.180849552154541, "grad_norm_var": 278.49850212580674, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.7064497470855713, "loss/hidden": 0.74609375, "loss/logits": 0.13676053285598755, "loss/reg": 0.02385888434946537, "step": 419 }, { "epoch": 0.0525, "grad_norm": 3.2954249382019043, "grad_norm_var": 1.3051375489769337, "learning_rate": 0.0001, "loss": 1.0717, "loss/crossentropy": 2.31532621383667, "loss/hidden": 0.703125, "loss/logits": 0.13004590570926666, "loss/reg": 0.023851698264479637, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.8389148712158203, "grad_norm_var": 0.8127685868335741, "learning_rate": 0.0001, "loss": 1.0781, "loss/crossentropy": 2.3562088012695312, "loss/hidden": 0.70703125, "loss/logits": 0.13266143202781677, "loss/reg": 0.023843195289373398, "step": 421 }, { "epoch": 0.05275, "grad_norm": 4.581103324890137, "grad_norm_var": 0.8613437167520175, "learning_rate": 0.0001, "loss": 1.4299, "loss/crossentropy": 2.1004514694213867, "loss/hidden": 1.0546875, "loss/logits": 0.13683074712753296, "loss/reg": 0.023835282772779465, "step": 422 }, { "epoch": 0.052875, "grad_norm": 4.119724750518799, "grad_norm_var": 0.8733982923945914, "learning_rate": 0.0001, "loss": 1.2152, "loss/crossentropy": 2.640662670135498, "loss/hidden": 0.8515625, "loss/logits": 0.12537327408790588, "loss/reg": 0.02382684126496315, "step": 423 }, { "epoch": 0.053, "grad_norm": 4.544858455657959, "grad_norm_var": 0.918133871994677, "learning_rate": 0.0001, "loss": 1.0957, "loss/crossentropy": 2.8237485885620117, "loss/hidden": 0.7265625, "loss/logits": 0.1309557855129242, "loss/reg": 0.023818302899599075, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.757870674133301, "grad_norm_var": 0.8666742418813403, "learning_rate": 0.0001, "loss": 0.962, "loss/crossentropy": 2.5533342361450195, "loss/hidden": 0.62109375, "loss/logits": 0.10283903032541275, "loss/reg": 0.023810207843780518, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.8035690784454346, "grad_norm_var": 0.8970790990362765, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.610870361328125, "loss/hidden": 0.6953125, "loss/logits": 0.1544169783592224, "loss/reg": 0.02380150742828846, "step": 426 }, { "epoch": 0.053375, "grad_norm": 3.5559544563293457, "grad_norm_var": 0.8432525593756196, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.537252187728882, "loss/hidden": 0.74609375, "loss/logits": 0.1328693926334381, "loss/reg": 0.02379263937473297, "step": 427 }, { "epoch": 0.0535, "grad_norm": 3.58505916595459, "grad_norm_var": 0.7479056021171611, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.240241289138794, "loss/hidden": 0.68359375, "loss/logits": 0.1212427169084549, "loss/reg": 0.02378367818892002, "step": 428 }, { "epoch": 0.053625, "grad_norm": 3.6646056175231934, "grad_norm_var": 0.7156687449512967, "learning_rate": 0.0001, "loss": 1.0166, "loss/crossentropy": 2.765550374984741, "loss/hidden": 0.6328125, "loss/logits": 0.14600837230682373, "loss/reg": 0.023774517700076103, "step": 429 }, { "epoch": 0.05375, "grad_norm": 3.2596421241760254, "grad_norm_var": 0.6044660126437359, "learning_rate": 0.0001, "loss": 1.13, "loss/crossentropy": 2.1970372200012207, "loss/hidden": 0.7578125, "loss/logits": 0.13451042771339417, "loss/reg": 0.023765094578266144, "step": 430 }, { "epoch": 0.053875, "grad_norm": 4.039770603179932, "grad_norm_var": 0.6214738630237046, "learning_rate": 0.0001, "loss": 1.2708, "loss/crossentropy": 2.5254645347595215, "loss/hidden": 0.83984375, "loss/logits": 0.19336232542991638, "loss/reg": 0.023756500333547592, "step": 431 }, { "epoch": 0.054, "grad_norm": 2.9176530838012695, "grad_norm_var": 0.6009675102137348, "learning_rate": 0.0001, "loss": 0.9399, "loss/crossentropy": 2.611178398132324, "loss/hidden": 0.59375, "loss/logits": 0.10868553817272186, "loss/reg": 0.023747922852635384, "step": 432 }, { "epoch": 0.054125, "grad_norm": 3.540189027786255, "grad_norm_var": 0.5748467113480954, "learning_rate": 0.0001, "loss": 1.164, "loss/crossentropy": 2.4766628742218018, "loss/hidden": 0.80078125, "loss/logits": 0.1258353739976883, "loss/reg": 0.023739352822303772, "step": 433 }, { "epoch": 0.05425, "grad_norm": 3.7053966522216797, "grad_norm_var": 0.4931212433281331, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.491478681564331, "loss/hidden": 0.67578125, "loss/logits": 0.13909485936164856, "loss/reg": 0.023731039837002754, "step": 434 }, { "epoch": 0.054375, "grad_norm": 3.4101648330688477, "grad_norm_var": 0.32751985750053336, "learning_rate": 0.0001, "loss": 0.9527, "loss/crossentropy": 2.5495922565460205, "loss/hidden": 0.59765625, "loss/logits": 0.1178436130285263, "loss/reg": 0.023722674697637558, "step": 435 }, { "epoch": 0.0545, "grad_norm": 3.923790693283081, "grad_norm_var": 0.33181180777142816, "learning_rate": 0.0001, "loss": 1.1217, "loss/crossentropy": 2.350161552429199, "loss/hidden": 0.75, "loss/logits": 0.13453412055969238, "loss/reg": 0.02371453307569027, "step": 436 }, { "epoch": 0.054625, "grad_norm": 3.2052547931671143, "grad_norm_var": 0.3040979482718351, "learning_rate": 0.0001, "loss": 1.0641, "loss/crossentropy": 2.5055789947509766, "loss/hidden": 0.71484375, "loss/logits": 0.11222882568836212, "loss/reg": 0.023706616833806038, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.820063591003418, "grad_norm_var": 0.26777286633351405, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.535069704055786, "loss/hidden": 0.78125, "loss/logits": 0.1628357172012329, "loss/reg": 0.023698095232248306, "step": 438 }, { "epoch": 0.054875, "grad_norm": 3.095841884613037, "grad_norm_var": 0.24744105333314317, "learning_rate": 0.0001, "loss": 1.1936, "loss/crossentropy": 2.313276767730713, "loss/hidden": 0.78125, "loss/logits": 0.17543524503707886, "loss/reg": 0.02368931844830513, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.955540895462036, "grad_norm_var": 0.1683967569747227, "learning_rate": 0.0001, "loss": 0.991, "loss/crossentropy": 2.747128486633301, "loss/hidden": 0.6484375, "loss/logits": 0.10580303519964218, "loss/reg": 0.023680580779910088, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.866759777069092, "grad_norm_var": 0.16086728592038804, "learning_rate": 0.0001, "loss": 0.9014, "loss/crossentropy": 2.4465224742889404, "loss/hidden": 0.5703125, "loss/logits": 0.09433356672525406, "loss/reg": 0.023672088980674744, "step": 441 }, { "epoch": 0.05525, "grad_norm": 3.1793012619018555, "grad_norm_var": 0.14310091597801508, "learning_rate": 0.0001, "loss": 1.0742, "loss/crossentropy": 2.364579677581787, "loss/hidden": 0.6875, "loss/logits": 0.15003816783428192, "loss/reg": 0.023663459345698357, "step": 442 }, { "epoch": 0.055375, "grad_norm": 4.3040900230407715, "grad_norm_var": 0.19784760386154687, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.955233573913574, "loss/hidden": 0.74609375, "loss/logits": 0.1579587459564209, "loss/reg": 0.023654496297240257, "step": 443 }, { "epoch": 0.0555, "grad_norm": 3.6495676040649414, "grad_norm_var": 0.19966009525053988, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.184626340866089, "loss/hidden": 0.71484375, "loss/logits": 0.11654888093471527, "loss/reg": 0.023645464330911636, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.7992637157440186, "grad_norm_var": 0.21692371557562992, "learning_rate": 0.0001, "loss": 1.0746, "loss/crossentropy": 1.9573417901992798, "loss/hidden": 0.71484375, "loss/logits": 0.1234317272901535, "loss/reg": 0.023636594414711, "step": 445 }, { "epoch": 0.05575, "grad_norm": 3.4131267070770264, "grad_norm_var": 0.21645445922388262, "learning_rate": 0.0001, "loss": 1.0938, "loss/crossentropy": 2.3615517616271973, "loss/hidden": 0.71875, "loss/logits": 0.1387380063533783, "loss/reg": 0.023627731949090958, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.890436887741089, "grad_norm_var": 0.19547383544341201, "learning_rate": 0.0001, "loss": 0.8903, "loss/crossentropy": 2.2940785884857178, "loss/hidden": 0.5546875, "loss/logits": 0.09945578873157501, "loss/reg": 0.023619333282113075, "step": 447 }, { "epoch": 0.056, "grad_norm": 3.9202187061309814, "grad_norm_var": 0.20821686288428035, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.561680316925049, "loss/hidden": 0.73828125, "loss/logits": 0.13801740109920502, "loss/reg": 0.023610329255461693, "step": 448 }, { "epoch": 0.056125, "grad_norm": 3.0585479736328125, "grad_norm_var": 0.21081889060941586, "learning_rate": 0.0001, "loss": 0.9545, "loss/crossentropy": 2.4095382690429688, "loss/hidden": 0.609375, "loss/logits": 0.10911431908607483, "loss/reg": 0.02360081672668457, "step": 449 }, { "epoch": 0.05625, "grad_norm": 3.3532161712646484, "grad_norm_var": 0.2007006666523369, "learning_rate": 0.0001, "loss": 1.1045, "loss/crossentropy": 1.8727831840515137, "loss/hidden": 0.7421875, "loss/logits": 0.1264159381389618, "loss/reg": 0.023592744022607803, "step": 450 }, { "epoch": 0.056375, "grad_norm": 3.1456074714660645, "grad_norm_var": 0.20128870800301848, "learning_rate": 0.0001, "loss": 0.9574, "loss/crossentropy": 2.4792492389678955, "loss/hidden": 0.609375, "loss/logits": 0.11215440928936005, "loss/reg": 0.02358343079686165, "step": 451 }, { "epoch": 0.0565, "grad_norm": 3.402637004852295, "grad_norm_var": 0.1739656178124496, "learning_rate": 0.0001, "loss": 1.0743, "loss/crossentropy": 2.3384110927581787, "loss/hidden": 0.6875, "loss/logits": 0.15106429159641266, "loss/reg": 0.02357417158782482, "step": 452 }, { "epoch": 0.056625, "grad_norm": 3.259817361831665, "grad_norm_var": 0.17379912081048493, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.456613779067993, "loss/hidden": 0.734375, "loss/logits": 0.15212517976760864, "loss/reg": 0.023564757779240608, "step": 453 }, { "epoch": 0.05675, "grad_norm": 2.531987190246582, "grad_norm_var": 0.195773570863138, "learning_rate": 0.0001, "loss": 0.9605, "loss/crossentropy": 2.394343376159668, "loss/hidden": 0.609375, "loss/logits": 0.11555634438991547, "loss/reg": 0.023556271567940712, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.594336986541748, "grad_norm_var": 0.22107356191806862, "learning_rate": 0.0001, "loss": 0.9207, "loss/crossentropy": 2.781731367111206, "loss/hidden": 0.58203125, "loss/logits": 0.10317155718803406, "loss/reg": 0.023547139018774033, "step": 455 }, { "epoch": 0.057, "grad_norm": 3.1260924339294434, "grad_norm_var": 0.21715561662650557, "learning_rate": 0.0001, "loss": 1.1637, "loss/crossentropy": 2.5018725395202637, "loss/hidden": 0.78125, "loss/logits": 0.14706720411777496, "loss/reg": 0.02353852428495884, "step": 456 }, { "epoch": 0.057125, "grad_norm": 3.159911632537842, "grad_norm_var": 0.20878072756432645, "learning_rate": 0.0001, "loss": 1.0676, "loss/crossentropy": 2.668788433074951, "loss/hidden": 0.68359375, "loss/logits": 0.1487644910812378, "loss/reg": 0.02352879010140896, "step": 457 }, { "epoch": 0.05725, "grad_norm": 3.0937418937683105, "grad_norm_var": 0.20989373673105333, "learning_rate": 0.0001, "loss": 1.0188, "loss/crossentropy": 2.498286008834839, "loss/hidden": 0.6640625, "loss/logits": 0.11953231692314148, "loss/reg": 0.023519227281212807, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.4465949535369873, "grad_norm_var": 0.15987096754079838, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.401630401611328, "loss/hidden": 0.671875, "loss/logits": 0.12631601095199585, "loss/reg": 0.023509083315730095, "step": 459 }, { "epoch": 0.0575, "grad_norm": 3.8185830116271973, "grad_norm_var": 0.17369585396981316, "learning_rate": 0.0001, "loss": 1.2185, "loss/crossentropy": 2.3343100547790527, "loss/hidden": 0.84375, "loss/logits": 0.13973468542099, "loss/reg": 0.023500461131334305, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.769894599914551, "grad_norm_var": 0.1750287637092998, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.594421148300171, "loss/hidden": 0.66796875, "loss/logits": 0.14495806396007538, "loss/reg": 0.02349086105823517, "step": 461 }, { "epoch": 0.05775, "grad_norm": 3.924386501312256, "grad_norm_var": 0.21107140664515026, "learning_rate": 0.0001, "loss": 1.1201, "loss/crossentropy": 1.8543033599853516, "loss/hidden": 0.71875, "loss/logits": 0.16654378175735474, "loss/reg": 0.023482073098421097, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.719325304031372, "grad_norm_var": 0.21896016035892957, "learning_rate": 0.0001, "loss": 0.9973, "loss/crossentropy": 2.7110376358032227, "loss/hidden": 0.6328125, "loss/logits": 0.1297917366027832, "loss/reg": 0.02347267046570778, "step": 463 }, { "epoch": 0.058, "grad_norm": 3.110532522201538, "grad_norm_var": 0.17627651595159174, "learning_rate": 0.0001, "loss": 0.9503, "loss/crossentropy": 2.424137830734253, "loss/hidden": 0.61328125, "loss/logits": 0.10241679847240448, "loss/reg": 0.023463333025574684, "step": 464 }, { "epoch": 0.058125, "grad_norm": 3.2945988178253174, "grad_norm_var": 0.17862116157392785, "learning_rate": 0.0001, "loss": 1.1634, "loss/crossentropy": 2.8107378482818604, "loss/hidden": 0.75, "loss/logits": 0.17888996005058289, "loss/reg": 0.0234534852206707, "step": 465 }, { "epoch": 0.05825, "grad_norm": 4.864523887634277, "grad_norm_var": 0.37049430510921844, "learning_rate": 0.0001, "loss": 1.0812, "loss/crossentropy": 2.5031394958496094, "loss/hidden": 0.71484375, "loss/logits": 0.13192051649093628, "loss/reg": 0.023444540798664093, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.8722984790802, "grad_norm_var": 0.3978501673810001, "learning_rate": 0.0001, "loss": 1.2316, "loss/crossentropy": 1.9772007465362549, "loss/hidden": 0.8515625, "loss/logits": 0.14563970267772675, "loss/reg": 0.023435747250914574, "step": 467 }, { "epoch": 0.0585, "grad_norm": 4.621346473693848, "grad_norm_var": 0.5155902021721573, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.4977753162384033, "loss/hidden": 0.7890625, "loss/logits": 0.149122953414917, "loss/reg": 0.023427119478583336, "step": 468 }, { "epoch": 0.058625, "grad_norm": 3.36370849609375, "grad_norm_var": 0.5153549660191058, "learning_rate": 0.0001, "loss": 1.2411, "loss/crossentropy": 2.4750683307647705, "loss/hidden": 0.8203125, "loss/logits": 0.18663738667964935, "loss/reg": 0.023417862132191658, "step": 469 }, { "epoch": 0.05875, "grad_norm": 3.391871690750122, "grad_norm_var": 0.46984604899865395, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.5189239978790283, "loss/hidden": 0.71875, "loss/logits": 0.1537722945213318, "loss/reg": 0.023408619686961174, "step": 470 }, { "epoch": 0.058875, "grad_norm": 3.903122901916504, "grad_norm_var": 0.43880097595690587, "learning_rate": 0.0001, "loss": 0.9258, "loss/crossentropy": 2.548199415206909, "loss/hidden": 0.59375, "loss/logits": 0.09802491217851639, "loss/reg": 0.02339930646121502, "step": 471 }, { "epoch": 0.059, "grad_norm": 5.576291084289551, "grad_norm_var": 0.7024716555345464, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.682227849960327, "loss/hidden": 0.859375, "loss/logits": 0.19106051325798035, "loss/reg": 0.023390140384435654, "step": 472 }, { "epoch": 0.059125, "grad_norm": 2.4201464653015137, "grad_norm_var": 0.7821220779043936, "learning_rate": 0.0001, "loss": 0.9341, "loss/crossentropy": 2.5251269340515137, "loss/hidden": 0.59375, "loss/logits": 0.10658347606658936, "loss/reg": 0.023381320759654045, "step": 473 }, { "epoch": 0.05925, "grad_norm": 4.325901985168457, "grad_norm_var": 0.7980385459591806, "learning_rate": 0.0001, "loss": 0.9798, "loss/crossentropy": 2.3592865467071533, "loss/hidden": 0.640625, "loss/logits": 0.10548710823059082, "loss/reg": 0.023372096940875053, "step": 474 }, { "epoch": 0.059375, "grad_norm": 3.1382455825805664, "grad_norm_var": 0.7168259193102716, "learning_rate": 0.0001, "loss": 0.9839, "loss/crossentropy": 2.6056907176971436, "loss/hidden": 0.62890625, "loss/logits": 0.121395543217659, "loss/reg": 0.023363398388028145, "step": 475 }, { "epoch": 0.0595, "grad_norm": 3.0779755115509033, "grad_norm_var": 0.7388713721113239, "learning_rate": 0.0001, "loss": 1.0798, "loss/crossentropy": 2.320854663848877, "loss/hidden": 0.72265625, "loss/logits": 0.12356055527925491, "loss/reg": 0.023354284465312958, "step": 476 }, { "epoch": 0.059625, "grad_norm": 3.293567657470703, "grad_norm_var": 0.6946720185858983, "learning_rate": 0.0001, "loss": 0.9685, "loss/crossentropy": 2.7064430713653564, "loss/hidden": 0.625, "loss/logits": 0.11008161306381226, "loss/reg": 0.023345019668340683, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.664088249206543, "grad_norm_var": 0.7530647477645431, "learning_rate": 0.0001, "loss": 0.9101, "loss/crossentropy": 2.6147005558013916, "loss/hidden": 0.5703125, "loss/logits": 0.10643748193979263, "loss/reg": 0.02333623729646206, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.9174795150756836, "grad_norm_var": 0.7321888983535926, "learning_rate": 0.0001, "loss": 0.9342, "loss/crossentropy": 2.566849946975708, "loss/hidden": 0.58984375, "loss/logits": 0.11108069121837616, "loss/reg": 0.023327510803937912, "step": 479 }, { "epoch": 0.06, "grad_norm": 3.3596463203430176, "grad_norm_var": 0.71932045702877, "learning_rate": 0.0001, "loss": 1.0961, "loss/crossentropy": 2.645395040512085, "loss/hidden": 0.703125, "loss/logits": 0.15979796648025513, "loss/reg": 0.023318573832511902, "step": 480 }, { "epoch": 0.060125, "grad_norm": 4.0903096199035645, "grad_norm_var": 0.7232764591548666, "learning_rate": 0.0001, "loss": 1.1686, "loss/crossentropy": 2.077467918395996, "loss/hidden": 0.79296875, "loss/logits": 0.14252969622612, "loss/reg": 0.02330981194972992, "step": 481 }, { "epoch": 0.06025, "grad_norm": 3.277656078338623, "grad_norm_var": 0.6300433507978689, "learning_rate": 0.0001, "loss": 1.1136, "loss/crossentropy": 2.588385581970215, "loss/hidden": 0.7421875, "loss/logits": 0.13843819499015808, "loss/reg": 0.02330118976533413, "step": 482 }, { "epoch": 0.060375, "grad_norm": 4.117528915405273, "grad_norm_var": 0.6433314640873874, "learning_rate": 0.0001, "loss": 1.0239, "loss/crossentropy": 2.766176700592041, "loss/hidden": 0.6640625, "loss/logits": 0.12686920166015625, "loss/reg": 0.023292165249586105, "step": 483 }, { "epoch": 0.0605, "grad_norm": 11.29445743560791, "grad_norm_var": 4.338621670503842, "learning_rate": 0.0001, "loss": 1.9614, "loss/crossentropy": 2.5279059410095215, "loss/hidden": 1.2109375, "loss/logits": 0.5176718235015869, "loss/reg": 0.02328311838209629, "step": 484 }, { "epoch": 0.060625, "grad_norm": 3.5054728984832764, "grad_norm_var": 4.327600163307723, "learning_rate": 0.0001, "loss": 1.0719, "loss/crossentropy": 2.726095199584961, "loss/hidden": 0.7109375, "loss/logits": 0.12821289896965027, "loss/reg": 0.023273879662156105, "step": 485 }, { "epoch": 0.06075, "grad_norm": 3.157118558883667, "grad_norm_var": 4.350771203860321, "learning_rate": 0.0001, "loss": 0.9326, "loss/crossentropy": 2.207131862640381, "loss/hidden": 0.60546875, "loss/logits": 0.09451892971992493, "loss/reg": 0.02326469123363495, "step": 486 }, { "epoch": 0.060875, "grad_norm": 5.006563186645508, "grad_norm_var": 4.411522578027558, "learning_rate": 0.0001, "loss": 1.1884, "loss/crossentropy": 3.005479335784912, "loss/hidden": 0.78515625, "loss/logits": 0.17073442041873932, "loss/reg": 0.02325539104640484, "step": 487 }, { "epoch": 0.061, "grad_norm": 8.253157615661621, "grad_norm_var": 5.3947068177792, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.329857349395752, "loss/hidden": 0.984375, "loss/logits": 0.19045758247375488, "loss/reg": 0.02324584126472473, "step": 488 }, { "epoch": 0.061125, "grad_norm": 4.795626640319824, "grad_norm_var": 5.169810789054156, "learning_rate": 0.0001, "loss": 1.2274, "loss/crossentropy": 2.354893207550049, "loss/hidden": 0.8515625, "loss/logits": 0.14352190494537354, "loss/reg": 0.023236218839883804, "step": 489 }, { "epoch": 0.06125, "grad_norm": 3.1360483169555664, "grad_norm_var": 5.268809256909969, "learning_rate": 0.0001, "loss": 1.0242, "loss/crossentropy": 2.2810122966766357, "loss/hidden": 0.671875, "loss/logits": 0.12006018310785294, "loss/reg": 0.02322734333574772, "step": 490 }, { "epoch": 0.061375, "grad_norm": 3.931467056274414, "grad_norm_var": 5.1833802842946906, "learning_rate": 0.0001, "loss": 1.1688, "loss/crossentropy": 2.3372247219085693, "loss/hidden": 0.78515625, "loss/logits": 0.15147234499454498, "loss/reg": 0.023217879235744476, "step": 491 }, { "epoch": 0.0615, "grad_norm": 3.8858096599578857, "grad_norm_var": 5.085283642122107, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.597487688064575, "loss/hidden": 0.6796875, "loss/logits": 0.11914543062448502, "loss/reg": 0.02320869080722332, "step": 492 }, { "epoch": 0.061625, "grad_norm": 6.116000175476074, "grad_norm_var": 5.160062314221837, "learning_rate": 0.0001, "loss": 1.2295, "loss/crossentropy": 2.4041733741760254, "loss/hidden": 0.8515625, "loss/logits": 0.14599129557609558, "loss/reg": 0.023199014365673065, "step": 493 }, { "epoch": 0.06175, "grad_norm": 3.4635026454925537, "grad_norm_var": 4.994267697000357, "learning_rate": 0.0001, "loss": 1.0957, "loss/crossentropy": 2.692230224609375, "loss/hidden": 0.73046875, "loss/logits": 0.13329669833183289, "loss/reg": 0.023189352825284004, "step": 494 }, { "epoch": 0.061875, "grad_norm": 6.833379745483398, "grad_norm_var": 5.051083471594066, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.470660448074341, "loss/hidden": 0.7890625, "loss/logits": 0.12278926372528076, "loss/reg": 0.023179946467280388, "step": 495 }, { "epoch": 0.062, "grad_norm": 3.2948107719421387, "grad_norm_var": 5.064566926371495, "learning_rate": 0.0001, "loss": 1.109, "loss/crossentropy": 2.3682029247283936, "loss/hidden": 0.7421875, "loss/logits": 0.13507232069969177, "loss/reg": 0.02317013218998909, "step": 496 }, { "epoch": 0.062125, "grad_norm": 4.057919025421143, "grad_norm_var": 5.068064269732227, "learning_rate": 0.0001, "loss": 1.1648, "loss/crossentropy": 2.4153223037719727, "loss/hidden": 0.765625, "loss/logits": 0.16754823923110962, "loss/reg": 0.02316114492714405, "step": 497 }, { "epoch": 0.06225, "grad_norm": 3.3985679149627686, "grad_norm_var": 5.043098814178749, "learning_rate": 0.0001, "loss": 1.1033, "loss/crossentropy": 2.6683197021484375, "loss/hidden": 0.73828125, "loss/logits": 0.13353273272514343, "loss/reg": 0.023151271045207977, "step": 498 }, { "epoch": 0.062375, "grad_norm": 2.6574859619140625, "grad_norm_var": 5.326800856327217, "learning_rate": 0.0001, "loss": 0.9413, "loss/crossentropy": 2.490849733352661, "loss/hidden": 0.60546875, "loss/logits": 0.1043705940246582, "loss/reg": 0.023141290992498398, "step": 499 }, { "epoch": 0.0625, "grad_norm": 3.2627739906311035, "grad_norm_var": 2.402846049322009, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.3429057598114014, "loss/hidden": 0.63671875, "loss/logits": 0.1552838534116745, "loss/reg": 0.02313125506043434, "step": 500 }, { "epoch": 0.062625, "grad_norm": 3.2453906536102295, "grad_norm_var": 2.4345300369903553, "learning_rate": 0.0001, "loss": 1.1719, "loss/crossentropy": 2.497878313064575, "loss/hidden": 0.76953125, "loss/logits": 0.17116406559944153, "loss/reg": 0.02312229759991169, "step": 501 }, { "epoch": 0.06275, "grad_norm": 4.79340934753418, "grad_norm_var": 2.3566760840149366, "learning_rate": 0.0001, "loss": 1.0001, "loss/crossentropy": 2.360431671142578, "loss/hidden": 0.6484375, "loss/logits": 0.12051868438720703, "loss/reg": 0.02311263047158718, "step": 502 }, { "epoch": 0.062875, "grad_norm": 3.1595826148986816, "grad_norm_var": 2.4163836713766425, "learning_rate": 0.0001, "loss": 0.9158, "loss/crossentropy": 2.266618490219116, "loss/hidden": 0.58203125, "loss/logits": 0.1027822494506836, "loss/reg": 0.023102767765522003, "step": 503 }, { "epoch": 0.063, "grad_norm": 3.59019136428833, "grad_norm_var": 1.2975304557542653, "learning_rate": 0.0001, "loss": 0.9685, "loss/crossentropy": 2.6386334896087646, "loss/hidden": 0.625, "loss/logits": 0.11253425478935242, "loss/reg": 0.023093828931450844, "step": 504 }, { "epoch": 0.063125, "grad_norm": 3.1218326091766357, "grad_norm_var": 1.2897946661694502, "learning_rate": 0.0001, "loss": 0.9493, "loss/crossentropy": 2.2931203842163086, "loss/hidden": 0.6171875, "loss/logits": 0.10128200799226761, "loss/reg": 0.0230838842689991, "step": 505 }, { "epoch": 0.06325, "grad_norm": 7.243019104003906, "grad_norm_var": 1.941121973828988, "learning_rate": 0.0001, "loss": 1.1363, "loss/crossentropy": 2.510519504547119, "loss/hidden": 0.7890625, "loss/logits": 0.11650878190994263, "loss/reg": 0.023074399679899216, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.7458910942077637, "grad_norm_var": 2.0601092371510408, "learning_rate": 0.0001, "loss": 0.8688, "loss/crossentropy": 2.504798173904419, "loss/hidden": 0.54296875, "loss/logits": 0.09517204016447067, "loss/reg": 0.023064618930220604, "step": 507 }, { "epoch": 0.0635, "grad_norm": 3.834894895553589, "grad_norm_var": 2.061415401484546, "learning_rate": 0.0001, "loss": 1.0614, "loss/crossentropy": 2.504178285598755, "loss/hidden": 0.69140625, "loss/logits": 0.13941214978694916, "loss/reg": 0.023055192083120346, "step": 508 }, { "epoch": 0.063625, "grad_norm": 3.0524418354034424, "grad_norm_var": 1.8045701590720038, "learning_rate": 0.0001, "loss": 0.9395, "loss/crossentropy": 2.6670830249786377, "loss/hidden": 0.6015625, "loss/logits": 0.10751838982105255, "loss/reg": 0.023046277463436127, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.638979196548462, "grad_norm_var": 1.8906158947457992, "learning_rate": 0.0001, "loss": 0.9649, "loss/crossentropy": 2.5606770515441895, "loss/hidden": 0.62109375, "loss/logits": 0.11341118812561035, "loss/reg": 0.02303677424788475, "step": 510 }, { "epoch": 0.063875, "grad_norm": 4.029105186462402, "grad_norm_var": 1.2509737999906814, "learning_rate": 0.0001, "loss": 1.0378, "loss/crossentropy": 2.446560859680176, "loss/hidden": 0.7109375, "loss/logits": 0.09661944955587387, "loss/reg": 0.023027852177619934, "step": 511 }, { "epoch": 0.064, "grad_norm": 3.203378438949585, "grad_norm_var": 1.255617850639648, "learning_rate": 0.0001, "loss": 0.9705, "loss/crossentropy": 2.08353328704834, "loss/hidden": 0.63671875, "loss/logits": 0.10362571477890015, "loss/reg": 0.02301831543445587, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.5737931728363037, "grad_norm_var": 1.3080458668089554, "learning_rate": 0.0001, "loss": 1.0597, "loss/crossentropy": 2.3520448207855225, "loss/hidden": 0.7109375, "loss/logits": 0.11864635348320007, "loss/reg": 0.023009376600384712, "step": 513 }, { "epoch": 0.06425, "grad_norm": 3.6731107234954834, "grad_norm_var": 1.307783724921588, "learning_rate": 0.0001, "loss": 1.1013, "loss/crossentropy": 2.77113938331604, "loss/hidden": 0.73046875, "loss/logits": 0.1408485472202301, "loss/reg": 0.022999830543994904, "step": 514 }, { "epoch": 0.064375, "grad_norm": 3.06605863571167, "grad_norm_var": 1.269509965568249, "learning_rate": 0.0001, "loss": 0.9224, "loss/crossentropy": 2.3147072792053223, "loss/hidden": 0.59375, "loss/logits": 0.0987289547920227, "loss/reg": 0.022990131750702858, "step": 515 }, { "epoch": 0.0645, "grad_norm": 3.462446689605713, "grad_norm_var": 1.2636330593023397, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.4042327404022217, "loss/hidden": 0.66796875, "loss/logits": 0.12231434136629105, "loss/reg": 0.02298046089708805, "step": 516 }, { "epoch": 0.064625, "grad_norm": 4.2029500007629395, "grad_norm_var": 1.2769943636458339, "learning_rate": 0.0001, "loss": 1.222, "loss/crossentropy": 2.514112710952759, "loss/hidden": 0.82421875, "loss/logits": 0.16808617115020752, "loss/reg": 0.022970519959926605, "step": 517 }, { "epoch": 0.06475, "grad_norm": 3.433554172515869, "grad_norm_var": 1.1851525686550586, "learning_rate": 0.0001, "loss": 1.2444, "loss/crossentropy": 2.493224859237671, "loss/hidden": 0.84375, "loss/logits": 0.17107471823692322, "loss/reg": 0.022961357608437538, "step": 518 }, { "epoch": 0.064875, "grad_norm": 4.310100078582764, "grad_norm_var": 1.2057753361074104, "learning_rate": 0.0001, "loss": 1.0108, "loss/crossentropy": 2.5606930255889893, "loss/hidden": 0.67578125, "loss/logits": 0.10546360909938812, "loss/reg": 0.022952331230044365, "step": 519 }, { "epoch": 0.065, "grad_norm": 3.674527883529663, "grad_norm_var": 1.2057007253632908, "learning_rate": 0.0001, "loss": 0.9803, "loss/crossentropy": 2.4196624755859375, "loss/hidden": 0.64453125, "loss/logits": 0.10631287097930908, "loss/reg": 0.02294265851378441, "step": 520 }, { "epoch": 0.065125, "grad_norm": 3.101484775543213, "grad_norm_var": 1.20713683658368, "learning_rate": 0.0001, "loss": 1.2375, "loss/crossentropy": 2.1448891162872314, "loss/hidden": 0.88671875, "loss/logits": 0.12145140767097473, "loss/reg": 0.02293260022997856, "step": 521 }, { "epoch": 0.06525, "grad_norm": 3.2564265727996826, "grad_norm_var": 0.2854656858181736, "learning_rate": 0.0001, "loss": 0.9783, "loss/crossentropy": 2.3986809253692627, "loss/hidden": 0.609375, "loss/logits": 0.1396813988685608, "loss/reg": 0.022922798991203308, "step": 522 }, { "epoch": 0.065375, "grad_norm": 3.3007333278656006, "grad_norm_var": 0.2569672821287893, "learning_rate": 0.0001, "loss": 0.9586, "loss/crossentropy": 2.7955212593078613, "loss/hidden": 0.62109375, "loss/logits": 0.10833179950714111, "loss/reg": 0.02291307970881462, "step": 523 }, { "epoch": 0.0655, "grad_norm": 2.9546499252319336, "grad_norm_var": 0.25738909944094907, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.409029006958008, "loss/hidden": 0.71875, "loss/logits": 0.14591118693351746, "loss/reg": 0.022904111072421074, "step": 524 }, { "epoch": 0.065625, "grad_norm": 3.2193830013275146, "grad_norm_var": 0.25204334767618, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.491401195526123, "loss/hidden": 0.75390625, "loss/logits": 0.13920898735523224, "loss/reg": 0.02289445698261261, "step": 525 }, { "epoch": 0.06575, "grad_norm": 40.478694915771484, "grad_norm_var": 85.9971082258447, "learning_rate": 0.0001, "loss": 1.1342, "loss/crossentropy": 2.4311652183532715, "loss/hidden": 0.796875, "loss/logits": 0.10845671594142914, "loss/reg": 0.022885650396347046, "step": 526 }, { "epoch": 0.065875, "grad_norm": 3.2905385494232178, "grad_norm_var": 86.20029999738617, "learning_rate": 0.0001, "loss": 1.0101, "loss/crossentropy": 2.1737422943115234, "loss/hidden": 0.671875, "loss/logits": 0.10944204032421112, "loss/reg": 0.022876843810081482, "step": 527 }, { "epoch": 0.066, "grad_norm": 4.666721343994141, "grad_norm_var": 85.84699165642114, "learning_rate": 0.0001, "loss": 1.2142, "loss/crossentropy": 2.462200880050659, "loss/hidden": 0.86328125, "loss/logits": 0.12223749607801437, "loss/reg": 0.02286742813885212, "step": 528 }, { "epoch": 0.066125, "grad_norm": 3.0347273349761963, "grad_norm_var": 85.66251244998139, "learning_rate": 0.0001, "loss": 1.0285, "loss/crossentropy": 2.38840651512146, "loss/hidden": 0.6640625, "loss/logits": 0.13581448793411255, "loss/reg": 0.022858494892716408, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.5890092849731445, "grad_norm_var": 86.04634847608627, "learning_rate": 0.0001, "loss": 0.967, "loss/crossentropy": 2.5042731761932373, "loss/hidden": 0.62890625, "loss/logits": 0.10961504280567169, "loss/reg": 0.02284966967999935, "step": 530 }, { "epoch": 0.066375, "grad_norm": 3.3963401317596436, "grad_norm_var": 85.93485657047692, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.7259116172790527, "loss/hidden": 0.7109375, "loss/logits": 0.13268503546714783, "loss/reg": 0.02284088172018528, "step": 531 }, { "epoch": 0.0665, "grad_norm": 3.731293201446533, "grad_norm_var": 85.85653980693046, "learning_rate": 0.0001, "loss": 1.0647, "loss/crossentropy": 2.277968168258667, "loss/hidden": 0.71484375, "loss/logits": 0.12150134146213531, "loss/reg": 0.02283208817243576, "step": 532 }, { "epoch": 0.066625, "grad_norm": 4.581428050994873, "grad_norm_var": 85.78540060231343, "learning_rate": 0.0001, "loss": 1.2486, "loss/crossentropy": 2.067720890045166, "loss/hidden": 0.84375, "loss/logits": 0.17660680413246155, "loss/reg": 0.02282322198152542, "step": 533 }, { "epoch": 0.06675, "grad_norm": 3.0526421070098877, "grad_norm_var": 85.91535378874303, "learning_rate": 0.0001, "loss": 1.1318, "loss/crossentropy": 2.4441521167755127, "loss/hidden": 0.7734375, "loss/logits": 0.13023720681667328, "loss/reg": 0.022814445197582245, "step": 534 }, { "epoch": 0.066875, "grad_norm": 3.4852664470672607, "grad_norm_var": 86.12062292738887, "learning_rate": 0.0001, "loss": 0.9936, "loss/crossentropy": 2.418733596801758, "loss/hidden": 0.66015625, "loss/logits": 0.10543158650398254, "loss/reg": 0.022805610671639442, "step": 535 }, { "epoch": 0.067, "grad_norm": 2.7321274280548096, "grad_norm_var": 86.43545869041343, "learning_rate": 0.0001, "loss": 1.0096, "loss/crossentropy": 2.3275394439697266, "loss/hidden": 0.65625, "loss/logits": 0.12536926567554474, "loss/reg": 0.022797243669629097, "step": 536 }, { "epoch": 0.067125, "grad_norm": 3.029811382293701, "grad_norm_var": 86.46041611877568, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.2477617263793945, "loss/hidden": 0.74609375, "loss/logits": 0.1309519112110138, "loss/reg": 0.022788099944591522, "step": 537 }, { "epoch": 0.06725, "grad_norm": 2.7345895767211914, "grad_norm_var": 86.64571497988939, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.4289236068725586, "loss/hidden": 0.7578125, "loss/logits": 0.141631618142128, "loss/reg": 0.022778736427426338, "step": 538 }, { "epoch": 0.067375, "grad_norm": 3.2934482097625732, "grad_norm_var": 86.6479928457627, "learning_rate": 0.0001, "loss": 1.0286, "loss/crossentropy": 2.538973093032837, "loss/hidden": 0.67578125, "loss/logits": 0.12509004771709442, "loss/reg": 0.02277030609548092, "step": 539 }, { "epoch": 0.0675, "grad_norm": 3.833656072616577, "grad_norm_var": 86.38133368804911, "learning_rate": 0.0001, "loss": 0.999, "loss/crossentropy": 2.429593563079834, "loss/hidden": 0.64453125, "loss/logits": 0.12682604789733887, "loss/reg": 0.022761952131986618, "step": 540 }, { "epoch": 0.067625, "grad_norm": 3.544104814529419, "grad_norm_var": 86.28065873545309, "learning_rate": 0.0001, "loss": 1.0002, "loss/crossentropy": 2.166292905807495, "loss/hidden": 0.66015625, "loss/logits": 0.11254848539829254, "loss/reg": 0.02275264821946621, "step": 541 }, { "epoch": 0.06775, "grad_norm": 4.461411952972412, "grad_norm_var": 0.4229304647166086, "learning_rate": 0.0001, "loss": 1.6203, "loss/crossentropy": 2.6347737312316895, "loss/hidden": 1.1328125, "loss/logits": 0.2600440979003906, "loss/reg": 0.022744029760360718, "step": 542 }, { "epoch": 0.067875, "grad_norm": 2.6814959049224854, "grad_norm_var": 0.46036790462302574, "learning_rate": 0.0001, "loss": 0.9923, "loss/crossentropy": 2.580264091491699, "loss/hidden": 0.65234375, "loss/logits": 0.11260214447975159, "loss/reg": 0.022734828293323517, "step": 543 }, { "epoch": 0.068, "grad_norm": 3.685408353805542, "grad_norm_var": 0.35847800648438505, "learning_rate": 0.0001, "loss": 1.1452, "loss/crossentropy": 2.7129862308502197, "loss/hidden": 0.78515625, "loss/logits": 0.13279825448989868, "loss/reg": 0.022726204246282578, "step": 544 }, { "epoch": 0.068125, "grad_norm": 6.349724292755127, "grad_norm_var": 0.8985836730566762, "learning_rate": 0.0001, "loss": 1.1353, "loss/crossentropy": 2.7293214797973633, "loss/hidden": 0.65625, "loss/logits": 0.2518823742866516, "loss/reg": 0.022716930136084557, "step": 545 }, { "epoch": 0.06825, "grad_norm": 3.681774616241455, "grad_norm_var": 0.829722440393675, "learning_rate": 0.0001, "loss": 1.1759, "loss/crossentropy": 2.278223991394043, "loss/hidden": 0.8046875, "loss/logits": 0.1441642791032791, "loss/reg": 0.022707859054207802, "step": 546 }, { "epoch": 0.068375, "grad_norm": 3.84778094291687, "grad_norm_var": 0.8276635905853821, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.1253304481506348, "loss/hidden": 0.94921875, "loss/logits": 0.16354900598526, "loss/reg": 0.022699227556586266, "step": 547 }, { "epoch": 0.0685, "grad_norm": 5.178676605224609, "grad_norm_var": 0.9703527182714744, "learning_rate": 0.0001, "loss": 1.022, "loss/crossentropy": 2.7832319736480713, "loss/hidden": 0.67578125, "loss/logits": 0.11932960152626038, "loss/reg": 0.02269013226032257, "step": 548 }, { "epoch": 0.068625, "grad_norm": 3.0284383296966553, "grad_norm_var": 0.9511722709094016, "learning_rate": 0.0001, "loss": 1.0412, "loss/crossentropy": 1.9599052667617798, "loss/hidden": 0.703125, "loss/logits": 0.11125050485134125, "loss/reg": 0.022681355476379395, "step": 549 }, { "epoch": 0.06875, "grad_norm": 3.167809247970581, "grad_norm_var": 0.942616955302132, "learning_rate": 0.0001, "loss": 1.0545, "loss/crossentropy": 2.5844783782958984, "loss/hidden": 0.69140625, "loss/logits": 0.13634443283081055, "loss/reg": 0.02267223782837391, "step": 550 }, { "epoch": 0.068875, "grad_norm": 3.2949278354644775, "grad_norm_var": 0.9495941353113788, "learning_rate": 0.0001, "loss": 1.0619, "loss/crossentropy": 2.314173460006714, "loss/hidden": 0.71484375, "loss/logits": 0.12046810984611511, "loss/reg": 0.022662866860628128, "step": 551 }, { "epoch": 0.069, "grad_norm": 2.8322856426239014, "grad_norm_var": 0.9378422714313653, "learning_rate": 0.0001, "loss": 1.0272, "loss/crossentropy": 2.768298387527466, "loss/hidden": 0.66796875, "loss/logits": 0.1326732635498047, "loss/reg": 0.022653890773653984, "step": 552 }, { "epoch": 0.069125, "grad_norm": 6.815005779266357, "grad_norm_var": 1.5125797637252996, "learning_rate": 0.0001, "loss": 1.1457, "loss/crossentropy": 2.8011491298675537, "loss/hidden": 0.78515625, "loss/logits": 0.13409599661827087, "loss/reg": 0.022644398733973503, "step": 553 }, { "epoch": 0.06925, "grad_norm": 4.219581127166748, "grad_norm_var": 1.4192768991356985, "learning_rate": 0.0001, "loss": 1.2504, "loss/crossentropy": 2.5874292850494385, "loss/hidden": 0.8515625, "loss/logits": 0.17252284288406372, "loss/reg": 0.02263464592397213, "step": 554 }, { "epoch": 0.069375, "grad_norm": 3.8279531002044678, "grad_norm_var": 1.3871550629864857, "learning_rate": 0.0001, "loss": 1.1814, "loss/crossentropy": 2.5384669303894043, "loss/hidden": 0.80078125, "loss/logits": 0.1543978452682495, "loss/reg": 0.022625621408224106, "step": 555 }, { "epoch": 0.0695, "grad_norm": 3.563680648803711, "grad_norm_var": 1.3987108056073487, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.111318349838257, "loss/hidden": 0.73828125, "loss/logits": 0.10826431214809418, "loss/reg": 0.022616824135184288, "step": 556 }, { "epoch": 0.069625, "grad_norm": 3.9599223136901855, "grad_norm_var": 1.3836174934919199, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.429032325744629, "loss/hidden": 0.67578125, "loss/logits": 0.12908612191677094, "loss/reg": 0.022607678547501564, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.8072519302368164, "grad_norm_var": 1.4610802306207225, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.5817906856536865, "loss/hidden": 0.6875, "loss/logits": 0.15589380264282227, "loss/reg": 0.022598396986722946, "step": 558 }, { "epoch": 0.069875, "grad_norm": 2.764833927154541, "grad_norm_var": 1.4475983977607596, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.202237844467163, "loss/hidden": 0.66015625, "loss/logits": 0.13721255958080292, "loss/reg": 0.02258932963013649, "step": 559 }, { "epoch": 0.07, "grad_norm": 5.113494396209717, "grad_norm_var": 1.5267634464669517, "learning_rate": 0.0001, "loss": 1.1672, "loss/crossentropy": 2.006186008453369, "loss/hidden": 0.80078125, "loss/logits": 0.14061376452445984, "loss/reg": 0.022580046206712723, "step": 560 }, { "epoch": 0.070125, "grad_norm": 3.8133227825164795, "grad_norm_var": 1.1437787263680603, "learning_rate": 0.0001, "loss": 1.0148, "loss/crossentropy": 2.37170672416687, "loss/hidden": 0.67578125, "loss/logits": 0.11329137533903122, "loss/reg": 0.02257111482322216, "step": 561 }, { "epoch": 0.07025, "grad_norm": 2.7208938598632812, "grad_norm_var": 1.2255733087022764, "learning_rate": 0.0001, "loss": 1.0621, "loss/crossentropy": 2.271667242050171, "loss/hidden": 0.703125, "loss/logits": 0.1333208978176117, "loss/reg": 0.02256210334599018, "step": 562 }, { "epoch": 0.070375, "grad_norm": 53.38179016113281, "grad_norm_var": 154.8279377341773, "learning_rate": 0.0001, "loss": 0.9618, "loss/crossentropy": 2.2667322158813477, "loss/hidden": 0.640625, "loss/logits": 0.09567096829414368, "loss/reg": 0.02255306765437126, "step": 563 }, { "epoch": 0.0705, "grad_norm": 5.538954257965088, "grad_norm_var": 154.75309317540348, "learning_rate": 0.0001, "loss": 1.782, "loss/crossentropy": 2.5168824195861816, "loss/hidden": 1.3671875, "loss/logits": 0.1893981695175171, "loss/reg": 0.02254408597946167, "step": 564 }, { "epoch": 0.070625, "grad_norm": 2.8311338424682617, "grad_norm_var": 154.85811657117597, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.1938281059265137, "loss/hidden": 0.75, "loss/logits": 0.13508498668670654, "loss/reg": 0.02253509685397148, "step": 565 }, { "epoch": 0.07075, "grad_norm": 4.604408264160156, "grad_norm_var": 154.26918998432618, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 2.103062391281128, "loss/hidden": 0.91015625, "loss/logits": 0.16405051946640015, "loss/reg": 0.02252543345093727, "step": 566 }, { "epoch": 0.070875, "grad_norm": 4.917901992797852, "grad_norm_var": 153.63084329919155, "learning_rate": 0.0001, "loss": 1.2945, "loss/crossentropy": 2.6932315826416016, "loss/hidden": 0.87890625, "loss/logits": 0.1904207468032837, "loss/reg": 0.022515632212162018, "step": 567 }, { "epoch": 0.071, "grad_norm": 4.055351257324219, "grad_norm_var": 153.02723135387427, "learning_rate": 0.0001, "loss": 1.2253, "loss/crossentropy": 2.3849422931671143, "loss/hidden": 0.8359375, "loss/logits": 0.1642536073923111, "loss/reg": 0.02250652387738228, "step": 568 }, { "epoch": 0.071125, "grad_norm": 2.851072072982788, "grad_norm_var": 154.20402053832456, "learning_rate": 0.0001, "loss": 1.0724, "loss/crossentropy": 2.580428123474121, "loss/hidden": 0.703125, "loss/logits": 0.144349604845047, "loss/reg": 0.022497190162539482, "step": 569 }, { "epoch": 0.07125, "grad_norm": 3.257493495941162, "grad_norm_var": 154.6102933496205, "learning_rate": 0.0001, "loss": 0.9328, "loss/crossentropy": 2.4263837337493896, "loss/hidden": 0.59375, "loss/logits": 0.11414404958486557, "loss/reg": 0.02248740941286087, "step": 570 }, { "epoch": 0.071375, "grad_norm": 3.6194608211517334, "grad_norm_var": 154.69773136421824, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.7035937309265137, "loss/hidden": 0.69140625, "loss/logits": 0.12549756467342377, "loss/reg": 0.022477447986602783, "step": 571 }, { "epoch": 0.0715, "grad_norm": 4.193033695220947, "grad_norm_var": 154.44566535859553, "learning_rate": 0.0001, "loss": 1.6505, "loss/crossentropy": 2.274057149887085, "loss/hidden": 1.1640625, "loss/logits": 0.26177138090133667, "loss/reg": 0.022467276081442833, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.7127623558044434, "grad_norm_var": 155.03209308401435, "learning_rate": 0.0001, "loss": 0.9118, "loss/crossentropy": 2.4076664447784424, "loss/hidden": 0.58203125, "loss/logits": 0.10521911084651947, "loss/reg": 0.022456735372543335, "step": 573 }, { "epoch": 0.07175, "grad_norm": 3.5871214866638184, "grad_norm_var": 154.65243889362196, "learning_rate": 0.0001, "loss": 0.9755, "loss/crossentropy": 2.8088905811309814, "loss/hidden": 0.62890625, "loss/logits": 0.12216061353683472, "loss/reg": 0.022446416318416595, "step": 574 }, { "epoch": 0.071875, "grad_norm": 20.843276977539062, "grad_norm_var": 165.17750310305152, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.569218635559082, "loss/hidden": 0.82421875, "loss/logits": 0.1221412867307663, "loss/reg": 0.02243630215525627, "step": 575 }, { "epoch": 0.072, "grad_norm": 3.375251293182373, "grad_norm_var": 166.03594003131977, "learning_rate": 0.0001, "loss": 0.9923, "loss/crossentropy": 2.7540183067321777, "loss/hidden": 0.640625, "loss/logits": 0.12738245725631714, "loss/reg": 0.022425668314099312, "step": 576 }, { "epoch": 0.072125, "grad_norm": 3.457340717315674, "grad_norm_var": 166.23754433202586, "learning_rate": 0.0001, "loss": 1.0371, "loss/crossentropy": 2.2943716049194336, "loss/hidden": 0.66796875, "loss/logits": 0.14497271180152893, "loss/reg": 0.022415172308683395, "step": 577 }, { "epoch": 0.07225, "grad_norm": 2.966371774673462, "grad_norm_var": 166.07272256293106, "learning_rate": 0.0001, "loss": 1.2044, "loss/crossentropy": 2.760359048843384, "loss/hidden": 0.8125, "loss/logits": 0.16780292987823486, "loss/reg": 0.022406071424484253, "step": 578 }, { "epoch": 0.072375, "grad_norm": 2.5532329082489014, "grad_norm_var": 19.219812763276813, "learning_rate": 0.0001, "loss": 0.9996, "loss/crossentropy": 2.427797794342041, "loss/hidden": 0.66796875, "loss/logits": 0.10767525434494019, "loss/reg": 0.02239692024886608, "step": 579 }, { "epoch": 0.0725, "grad_norm": 3.250906229019165, "grad_norm_var": 19.294198335433887, "learning_rate": 0.0001, "loss": 1.1148, "loss/crossentropy": 2.29978084564209, "loss/hidden": 0.75, "loss/logits": 0.1408957540988922, "loss/reg": 0.0223868228495121, "step": 580 }, { "epoch": 0.072625, "grad_norm": 3.8051562309265137, "grad_norm_var": 19.12802354300362, "learning_rate": 0.0001, "loss": 1.2329, "loss/crossentropy": 2.402350664138794, "loss/hidden": 0.83203125, "loss/logits": 0.17713911831378937, "loss/reg": 0.022376833483576775, "step": 581 }, { "epoch": 0.07275, "grad_norm": 4.466115951538086, "grad_norm_var": 19.129656316190147, "learning_rate": 0.0001, "loss": 1.1846, "loss/crossentropy": 2.5129504203796387, "loss/hidden": 0.8125, "loss/logits": 0.14841441810131073, "loss/reg": 0.02236761339008808, "step": 582 }, { "epoch": 0.072875, "grad_norm": 3.1465752124786377, "grad_norm_var": 19.255278342461487, "learning_rate": 0.0001, "loss": 1.1857, "loss/crossentropy": 2.5782837867736816, "loss/hidden": 0.8203125, "loss/logits": 0.14178822934627533, "loss/reg": 0.022358402609825134, "step": 583 }, { "epoch": 0.073, "grad_norm": 2.7474803924560547, "grad_norm_var": 19.441256858474677, "learning_rate": 0.0001, "loss": 1.0532, "loss/crossentropy": 2.4901747703552246, "loss/hidden": 0.703125, "loss/logits": 0.12661507725715637, "loss/reg": 0.02234930731356144, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.7027359008789062, "grad_norm_var": 19.47380183903334, "learning_rate": 0.0001, "loss": 0.8469, "loss/crossentropy": 2.694582223892212, "loss/hidden": 0.5390625, "loss/logits": 0.08438676595687866, "loss/reg": 0.022340187802910805, "step": 585 }, { "epoch": 0.07325, "grad_norm": 2.847770929336548, "grad_norm_var": 19.547679388785195, "learning_rate": 0.0001, "loss": 0.9885, "loss/crossentropy": 2.5558888912200928, "loss/hidden": 0.6484375, "loss/logits": 0.11670757830142975, "loss/reg": 0.022330984473228455, "step": 586 }, { "epoch": 0.073375, "grad_norm": 3.20444917678833, "grad_norm_var": 19.601201389954156, "learning_rate": 0.0001, "loss": 1.2216, "loss/crossentropy": 2.0031001567840576, "loss/hidden": 0.84375, "loss/logits": 0.1546502560377121, "loss/reg": 0.022321749478578568, "step": 587 }, { "epoch": 0.0735, "grad_norm": 4.657837390899658, "grad_norm_var": 19.6039707895662, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.598100185394287, "loss/hidden": 0.78125, "loss/logits": 0.13329939544200897, "loss/reg": 0.02231265790760517, "step": 588 }, { "epoch": 0.073625, "grad_norm": 3.1166787147521973, "grad_norm_var": 19.52355503271277, "learning_rate": 0.0001, "loss": 1.304, "loss/crossentropy": 2.2209084033966064, "loss/hidden": 0.89453125, "loss/logits": 0.18648099899291992, "loss/reg": 0.022303014993667603, "step": 589 }, { "epoch": 0.07375, "grad_norm": 2.988344669342041, "grad_norm_var": 19.61249925539728, "learning_rate": 0.0001, "loss": 0.9412, "loss/crossentropy": 2.4533708095550537, "loss/hidden": 0.60546875, "loss/logits": 0.11275988817214966, "loss/reg": 0.022293319925665855, "step": 590 }, { "epoch": 0.073875, "grad_norm": 3.0553901195526123, "grad_norm_var": 0.3491433903254536, "learning_rate": 0.0001, "loss": 1.0883, "loss/crossentropy": 2.3986990451812744, "loss/hidden": 0.73046875, "loss/logits": 0.13498544692993164, "loss/reg": 0.022283662110567093, "step": 591 }, { "epoch": 0.074, "grad_norm": 4.445681095123291, "grad_norm_var": 0.43558600780207446, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.335937976837158, "loss/hidden": 0.90625, "loss/logits": 0.19050292670726776, "loss/reg": 0.022273709997534752, "step": 592 }, { "epoch": 0.074125, "grad_norm": 41.39726638793945, "grad_norm_var": 91.00287624901027, "learning_rate": 0.0001, "loss": 1.2527, "loss/crossentropy": 2.3441669940948486, "loss/hidden": 0.86328125, "loss/logits": 0.1667976826429367, "loss/reg": 0.02226419560611248, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.8557939529418945, "grad_norm_var": 91.04408434440504, "learning_rate": 0.0001, "loss": 1.0612, "loss/crossentropy": 2.380885362625122, "loss/hidden": 0.703125, "loss/logits": 0.1354852318763733, "loss/reg": 0.022254258394241333, "step": 594 }, { "epoch": 0.074375, "grad_norm": 6.565737724304199, "grad_norm_var": 90.36543928633743, "learning_rate": 0.0001, "loss": 1.2519, "loss/crossentropy": 2.3877017498016357, "loss/hidden": 0.90625, "loss/logits": 0.1231798455119133, "loss/reg": 0.022244345396757126, "step": 595 }, { "epoch": 0.0745, "grad_norm": 3.6746349334716797, "grad_norm_var": 90.22397938232942, "learning_rate": 0.0001, "loss": 1.1718, "loss/crossentropy": 2.397080898284912, "loss/hidden": 0.80078125, "loss/logits": 0.1486300826072693, "loss/reg": 0.022234413772821426, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.981614589691162, "grad_norm_var": 90.50516196939815, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.37412428855896, "loss/hidden": 0.69140625, "loss/logits": 0.12931303679943085, "loss/reg": 0.022224588319659233, "step": 597 }, { "epoch": 0.07475, "grad_norm": 3.6531307697296143, "grad_norm_var": 90.70497774366554, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.6906161308288574, "loss/hidden": 0.76953125, "loss/logits": 0.13550975918769836, "loss/reg": 0.02221417799592018, "step": 598 }, { "epoch": 0.074875, "grad_norm": 3.751652240753174, "grad_norm_var": 90.50753182721607, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.6506831645965576, "loss/hidden": 0.79296875, "loss/logits": 0.1468081772327423, "loss/reg": 0.022204989567399025, "step": 599 }, { "epoch": 0.075, "grad_norm": 3.228563070297241, "grad_norm_var": 90.31879350061254, "learning_rate": 0.0001, "loss": 1.0791, "loss/crossentropy": 2.381368398666382, "loss/hidden": 0.71875, "loss/logits": 0.13835087418556213, "loss/reg": 0.022195899859070778, "step": 600 }, { "epoch": 0.075125, "grad_norm": 5.392886638641357, "grad_norm_var": 89.60797997668053, "learning_rate": 0.0001, "loss": 1.2309, "loss/crossentropy": 2.52596378326416, "loss/hidden": 0.8359375, "loss/logits": 0.17307403683662415, "loss/reg": 0.022185994312167168, "step": 601 }, { "epoch": 0.07525, "grad_norm": 3.2832775115966797, "grad_norm_var": 89.43019603463321, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.4442057609558105, "loss/hidden": 0.76953125, "loss/logits": 0.15487736463546753, "loss/reg": 0.02217610739171505, "step": 602 }, { "epoch": 0.075375, "grad_norm": 4.1014084815979, "grad_norm_var": 89.1293068696746, "learning_rate": 0.0001, "loss": 1.0847, "loss/crossentropy": 2.531064987182617, "loss/hidden": 0.7265625, "loss/logits": 0.13643184304237366, "loss/reg": 0.02216634899377823, "step": 603 }, { "epoch": 0.0755, "grad_norm": 3.656172037124634, "grad_norm_var": 89.39756111673695, "learning_rate": 0.0001, "loss": 1.0161, "loss/crossentropy": 2.470468044281006, "loss/hidden": 0.68359375, "loss/logits": 0.11097002029418945, "loss/reg": 0.02215682342648506, "step": 604 }, { "epoch": 0.075625, "grad_norm": 3.8061492443084717, "grad_norm_var": 89.1498668494714, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.469788074493408, "loss/hidden": 0.7734375, "loss/logits": 0.12457874417304993, "loss/reg": 0.022147687152028084, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.658135414123535, "grad_norm_var": 89.29708722871554, "learning_rate": 0.0001, "loss": 1.0657, "loss/crossentropy": 2.2552011013031006, "loss/hidden": 0.71484375, "loss/logits": 0.12950366735458374, "loss/reg": 0.022138802334666252, "step": 606 }, { "epoch": 0.075875, "grad_norm": 3.291750907897949, "grad_norm_var": 89.20284122750789, "learning_rate": 0.0001, "loss": 1.133, "loss/crossentropy": 2.225055456161499, "loss/hidden": 0.73828125, "loss/logits": 0.1734064519405365, "loss/reg": 0.022129878401756287, "step": 607 }, { "epoch": 0.076, "grad_norm": 4.001339912414551, "grad_norm_var": 89.31742762195415, "learning_rate": 0.0001, "loss": 0.9658, "loss/crossentropy": 2.3826169967651367, "loss/hidden": 0.64453125, "loss/logits": 0.10007497668266296, "loss/reg": 0.022121025249361992, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.9729602336883545, "grad_norm_var": 0.9817241823775095, "learning_rate": 0.0001, "loss": 1.1962, "loss/crossentropy": 2.4926881790161133, "loss/hidden": 0.82421875, "loss/logits": 0.1508128046989441, "loss/reg": 0.022111859172582626, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.5144121646881104, "grad_norm_var": 1.0293551003726749, "learning_rate": 0.0001, "loss": 0.9883, "loss/crossentropy": 2.478576898574829, "loss/hidden": 0.65234375, "loss/logits": 0.11488830298185349, "loss/reg": 0.02210419811308384, "step": 610 }, { "epoch": 0.076375, "grad_norm": 3.7777743339538574, "grad_norm_var": 0.4576308797359, "learning_rate": 0.0001, "loss": 1.0059, "loss/crossentropy": 2.4356257915496826, "loss/hidden": 0.6484375, "loss/logits": 0.13651525974273682, "loss/reg": 0.022095149382948875, "step": 611 }, { "epoch": 0.0765, "grad_norm": 3.232398748397827, "grad_norm_var": 0.4623055923756754, "learning_rate": 0.0001, "loss": 1.0108, "loss/crossentropy": 2.4058218002319336, "loss/hidden": 0.67578125, "loss/logits": 0.11410736292600632, "loss/reg": 0.022087210789322853, "step": 612 }, { "epoch": 0.076625, "grad_norm": 2.61934757232666, "grad_norm_var": 0.49646373584008807, "learning_rate": 0.0001, "loss": 0.9466, "loss/crossentropy": 2.3981077671051025, "loss/hidden": 0.59765625, "loss/logits": 0.1281721591949463, "loss/reg": 0.022078126668930054, "step": 613 }, { "epoch": 0.07675, "grad_norm": 4.228268146514893, "grad_norm_var": 0.5291615579452734, "learning_rate": 0.0001, "loss": 1.0729, "loss/crossentropy": 2.664149761199951, "loss/hidden": 0.73046875, "loss/logits": 0.12176868319511414, "loss/reg": 0.02207016758620739, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.9137685298919678, "grad_norm_var": 0.5485319535320492, "learning_rate": 0.0001, "loss": 0.8708, "loss/crossentropy": 2.5457890033721924, "loss/hidden": 0.55078125, "loss/logits": 0.09941907972097397, "loss/reg": 0.022060981020331383, "step": 615 }, { "epoch": 0.077, "grad_norm": 3.4499685764312744, "grad_norm_var": 0.5441756848342263, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.3822786808013916, "loss/hidden": 0.78125, "loss/logits": 0.14211627840995789, "loss/reg": 0.02205180749297142, "step": 616 }, { "epoch": 0.077125, "grad_norm": 2.271921157836914, "grad_norm_var": 0.36266744154546565, "learning_rate": 0.0001, "loss": 0.9061, "loss/crossentropy": 2.707848072052002, "loss/hidden": 0.58203125, "loss/logits": 0.10360105335712433, "loss/reg": 0.022042402997612953, "step": 617 }, { "epoch": 0.07725, "grad_norm": 3.16980242729187, "grad_norm_var": 0.36370543210803513, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.1980466842651367, "loss/hidden": 0.86328125, "loss/logits": 0.18972548842430115, "loss/reg": 0.02203306369483471, "step": 618 }, { "epoch": 0.077375, "grad_norm": 2.4408295154571533, "grad_norm_var": 0.3567501583972517, "learning_rate": 0.0001, "loss": 1.1352, "loss/crossentropy": 2.3986692428588867, "loss/hidden": 0.76953125, "loss/logits": 0.1454274207353592, "loss/reg": 0.022023871541023254, "step": 619 }, { "epoch": 0.0775, "grad_norm": 2.558687925338745, "grad_norm_var": 0.36349398943808237, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.4290616512298584, "loss/hidden": 0.65625, "loss/logits": 0.1012360006570816, "loss/reg": 0.0220141913741827, "step": 620 }, { "epoch": 0.077625, "grad_norm": 3.7468619346618652, "grad_norm_var": 0.3582835152003213, "learning_rate": 0.0001, "loss": 1.1635, "loss/crossentropy": 2.4566454887390137, "loss/hidden": 0.78515625, "loss/logits": 0.15830263495445251, "loss/reg": 0.02200442925095558, "step": 621 }, { "epoch": 0.07775, "grad_norm": 3.8364651203155518, "grad_norm_var": 0.3732032502257139, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.5934860706329346, "loss/hidden": 0.9296875, "loss/logits": 0.17167437076568604, "loss/reg": 0.021995313465595245, "step": 622 }, { "epoch": 0.077875, "grad_norm": 2.9136173725128174, "grad_norm_var": 0.37696739372618043, "learning_rate": 0.0001, "loss": 1.1191, "loss/crossentropy": 2.1625287532806396, "loss/hidden": 0.75, "loss/logits": 0.1492336541414261, "loss/reg": 0.021986283361911774, "step": 623 }, { "epoch": 0.078, "grad_norm": 3.3136067390441895, "grad_norm_var": 0.3298862344756988, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.3915464878082275, "loss/hidden": 0.7890625, "loss/logits": 0.1398892104625702, "loss/reg": 0.021976841613650322, "step": 624 }, { "epoch": 0.078125, "grad_norm": 3.1731748580932617, "grad_norm_var": 0.3283984444798058, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.53873610496521, "loss/hidden": 0.7109375, "loss/logits": 0.16392138600349426, "loss/reg": 0.021966535598039627, "step": 625 }, { "epoch": 0.07825, "grad_norm": 4.172556400299072, "grad_norm_var": 0.3630228628347131, "learning_rate": 0.0001, "loss": 1.1888, "loss/crossentropy": 2.4219956398010254, "loss/hidden": 0.79296875, "loss/logits": 0.17627781629562378, "loss/reg": 0.021956363692879677, "step": 626 }, { "epoch": 0.078375, "grad_norm": 3.99817156791687, "grad_norm_var": 0.3819004722530487, "learning_rate": 0.0001, "loss": 1.3074, "loss/crossentropy": 2.2407050132751465, "loss/hidden": 0.8828125, "loss/logits": 0.20513707399368286, "loss/reg": 0.02194611169397831, "step": 627 }, { "epoch": 0.0785, "grad_norm": 3.2869558334350586, "grad_norm_var": 0.3819405314837089, "learning_rate": 0.0001, "loss": 0.923, "loss/crossentropy": 2.5824477672576904, "loss/hidden": 0.59375, "loss/logits": 0.10985252261161804, "loss/reg": 0.021935785189270973, "step": 628 }, { "epoch": 0.078625, "grad_norm": 3.0037360191345215, "grad_norm_var": 0.35855200267849247, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.4916722774505615, "loss/hidden": 0.7109375, "loss/logits": 0.13343587517738342, "loss/reg": 0.021925168111920357, "step": 629 }, { "epoch": 0.07875, "grad_norm": 3.5274088382720947, "grad_norm_var": 0.3006291732182412, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 1.7279773950576782, "loss/hidden": 0.78515625, "loss/logits": 0.13975940644741058, "loss/reg": 0.02191445231437683, "step": 630 }, { "epoch": 0.078875, "grad_norm": 3.2895989418029785, "grad_norm_var": 0.29330515223301745, "learning_rate": 0.0001, "loss": 1.0394, "loss/crossentropy": 2.370850086212158, "loss/hidden": 0.68359375, "loss/logits": 0.13678821921348572, "loss/reg": 0.02190525084733963, "step": 631 }, { "epoch": 0.079, "grad_norm": 3.741135597229004, "grad_norm_var": 0.30599490652712474, "learning_rate": 0.0001, "loss": 1.2655, "loss/crossentropy": 2.365464210510254, "loss/hidden": 0.87890625, "loss/logits": 0.16766133904457092, "loss/reg": 0.02189476415514946, "step": 632 }, { "epoch": 0.079125, "grad_norm": 4.110158443450928, "grad_norm_var": 0.2706546096444164, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.658433437347412, "loss/hidden": 0.796875, "loss/logits": 0.1646716445684433, "loss/reg": 0.021885616704821587, "step": 633 }, { "epoch": 0.07925, "grad_norm": 3.7503654956817627, "grad_norm_var": 0.27446839769864156, "learning_rate": 0.0001, "loss": 1.1, "loss/crossentropy": 2.4457831382751465, "loss/hidden": 0.74609375, "loss/logits": 0.13510534167289734, "loss/reg": 0.021875550970435143, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.917835235595703, "grad_norm_var": 0.22584356567046956, "learning_rate": 0.0001, "loss": 1.1498, "loss/crossentropy": 2.3328001499176025, "loss/hidden": 0.77734375, "loss/logits": 0.1537725031375885, "loss/reg": 0.021865583956241608, "step": 635 }, { "epoch": 0.0795, "grad_norm": 2.907240152359009, "grad_norm_var": 0.19160647764443486, "learning_rate": 0.0001, "loss": 0.9073, "loss/crossentropy": 2.556042432785034, "loss/hidden": 0.59375, "loss/logits": 0.09503352642059326, "loss/reg": 0.02185530960559845, "step": 636 }, { "epoch": 0.079625, "grad_norm": 2.5937955379486084, "grad_norm_var": 0.2337615816576618, "learning_rate": 0.0001, "loss": 0.9198, "loss/crossentropy": 2.363370180130005, "loss/hidden": 0.5859375, "loss/logits": 0.11537887156009674, "loss/reg": 0.021845519542694092, "step": 637 }, { "epoch": 0.07975, "grad_norm": 3.9080708026885986, "grad_norm_var": 0.2381681132369368, "learning_rate": 0.0001, "loss": 1.1552, "loss/crossentropy": 2.5054194927215576, "loss/hidden": 0.81640625, "loss/logits": 0.12043754756450653, "loss/reg": 0.021835271269083023, "step": 638 }, { "epoch": 0.079875, "grad_norm": 2.8192946910858154, "grad_norm_var": 0.24500412598165416, "learning_rate": 0.0001, "loss": 0.9375, "loss/crossentropy": 2.6611719131469727, "loss/hidden": 0.60546875, "loss/logits": 0.11382012814283371, "loss/reg": 0.021824965253472328, "step": 639 }, { "epoch": 0.08, "grad_norm": 4.111716270446777, "grad_norm_var": 0.27486954530744445, "learning_rate": 0.0001, "loss": 1.2972, "loss/crossentropy": 2.472568988800049, "loss/hidden": 0.91015625, "loss/logits": 0.16890740394592285, "loss/reg": 0.02181575633585453, "step": 640 }, { "epoch": 0.080125, "grad_norm": 5.715061187744141, "grad_norm_var": 0.5825168124345791, "learning_rate": 0.0001, "loss": 1.0307, "loss/crossentropy": 2.2123966217041016, "loss/hidden": 0.68359375, "loss/logits": 0.12899596989154816, "loss/reg": 0.021806620061397552, "step": 641 }, { "epoch": 0.08025, "grad_norm": 3.8680315017700195, "grad_norm_var": 0.5657073815126407, "learning_rate": 0.0001, "loss": 1.053, "loss/crossentropy": 2.4526453018188477, "loss/hidden": 0.703125, "loss/logits": 0.13190723955631256, "loss/reg": 0.021797508001327515, "step": 642 }, { "epoch": 0.080375, "grad_norm": 4.992859363555908, "grad_norm_var": 0.680778895488037, "learning_rate": 0.0001, "loss": 1.1683, "loss/crossentropy": 2.3445968627929688, "loss/hidden": 0.82421875, "loss/logits": 0.12617573142051697, "loss/reg": 0.02178841643035412, "step": 643 }, { "epoch": 0.0805, "grad_norm": 4.339104175567627, "grad_norm_var": 0.6977811040599325, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 1.8577097654342651, "loss/hidden": 0.9453125, "loss/logits": 0.18017446994781494, "loss/reg": 0.02177964523434639, "step": 644 }, { "epoch": 0.080625, "grad_norm": 4.026562213897705, "grad_norm_var": 0.6648423545945282, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.415410041809082, "loss/hidden": 0.859375, "loss/logits": 0.24951061606407166, "loss/reg": 0.021770501509308815, "step": 645 }, { "epoch": 0.08075, "grad_norm": 2.8517775535583496, "grad_norm_var": 0.7169049906385166, "learning_rate": 0.0001, "loss": 0.9283, "loss/crossentropy": 2.6426851749420166, "loss/hidden": 0.6015625, "loss/logits": 0.10916159301996231, "loss/reg": 0.021761184558272362, "step": 646 }, { "epoch": 0.080875, "grad_norm": 3.3171377182006836, "grad_norm_var": 0.7152750431492562, "learning_rate": 0.0001, "loss": 1.0088, "loss/crossentropy": 2.638533115386963, "loss/hidden": 0.62890625, "loss/logits": 0.1623907834291458, "loss/reg": 0.02175196446478367, "step": 647 }, { "epoch": 0.081, "grad_norm": 3.653881311416626, "grad_norm_var": 0.7158322952113887, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.745229482650757, "loss/hidden": 0.765625, "loss/logits": 0.1301097273826599, "loss/reg": 0.021742329001426697, "step": 648 }, { "epoch": 0.081125, "grad_norm": 4.31439733505249, "grad_norm_var": 0.728446489341123, "learning_rate": 0.0001, "loss": 1.2706, "loss/crossentropy": 2.7004082202911377, "loss/hidden": 0.80078125, "loss/logits": 0.25250470638275146, "loss/reg": 0.021732579916715622, "step": 649 }, { "epoch": 0.08125, "grad_norm": 4.621687889099121, "grad_norm_var": 0.7753064642270201, "learning_rate": 0.0001, "loss": 1.4091, "loss/crossentropy": 2.38405179977417, "loss/hidden": 0.89453125, "loss/logits": 0.29737186431884766, "loss/reg": 0.02172265760600567, "step": 650 }, { "epoch": 0.081375, "grad_norm": 2.87424898147583, "grad_norm_var": 0.7806094534209419, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.0905954837799072, "loss/hidden": 0.8125, "loss/logits": 0.18066659569740295, "loss/reg": 0.021712414920330048, "step": 651 }, { "epoch": 0.0815, "grad_norm": 4.724517345428467, "grad_norm_var": 0.7689569917943563, "learning_rate": 0.0001, "loss": 1.2928, "loss/crossentropy": 2.500594139099121, "loss/hidden": 0.89453125, "loss/logits": 0.1812204122543335, "loss/reg": 0.021703310310840607, "step": 652 }, { "epoch": 0.081625, "grad_norm": 2.8673081398010254, "grad_norm_var": 0.7252403996552116, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.3599724769592285, "loss/hidden": 0.703125, "loss/logits": 0.13873916864395142, "loss/reg": 0.021693557500839233, "step": 653 }, { "epoch": 0.08175, "grad_norm": 3.099315643310547, "grad_norm_var": 0.7693322976491117, "learning_rate": 0.0001, "loss": 1.1961, "loss/crossentropy": 2.3167355060577393, "loss/hidden": 0.83203125, "loss/logits": 0.14727652072906494, "loss/reg": 0.021683741360902786, "step": 654 }, { "epoch": 0.081875, "grad_norm": 3.287602186203003, "grad_norm_var": 0.7163515778113151, "learning_rate": 0.0001, "loss": 1.0429, "loss/crossentropy": 2.7967636585235596, "loss/hidden": 0.6875, "loss/logits": 0.13863959908485413, "loss/reg": 0.021674364805221558, "step": 655 }, { "epoch": 0.082, "grad_norm": 3.487874746322632, "grad_norm_var": 0.7244436337536264, "learning_rate": 0.0001, "loss": 1.0883, "loss/crossentropy": 2.5409998893737793, "loss/hidden": 0.71875, "loss/logits": 0.1528949737548828, "loss/reg": 0.02166520059108734, "step": 656 }, { "epoch": 0.082125, "grad_norm": 3.6202752590179443, "grad_norm_var": 0.4854858648423857, "learning_rate": 0.0001, "loss": 1.2516, "loss/crossentropy": 2.3633673191070557, "loss/hidden": 0.88671875, "loss/logits": 0.14828172326087952, "loss/reg": 0.021655315533280373, "step": 657 }, { "epoch": 0.08225, "grad_norm": 3.7921783924102783, "grad_norm_var": 0.484617963461113, "learning_rate": 0.0001, "loss": 1.07, "loss/crossentropy": 2.497013807296753, "loss/hidden": 0.71484375, "loss/logits": 0.13874131441116333, "loss/reg": 0.021646033972501755, "step": 658 }, { "epoch": 0.082375, "grad_norm": 5.818857669830322, "grad_norm_var": 0.6650298211735747, "learning_rate": 0.0001, "loss": 1.2961, "loss/crossentropy": 2.6599833965301514, "loss/hidden": 0.91015625, "loss/logits": 0.16955840587615967, "loss/reg": 0.021636882796883583, "step": 659 }, { "epoch": 0.0825, "grad_norm": 3.465527057647705, "grad_norm_var": 0.6491808619433999, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.5972306728363037, "loss/hidden": 0.7734375, "loss/logits": 0.14674408733844757, "loss/reg": 0.02162766456604004, "step": 660 }, { "epoch": 0.082625, "grad_norm": 3.143159866333008, "grad_norm_var": 0.664078497493661, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.5945792198181152, "loss/hidden": 0.83203125, "loss/logits": 0.17588043212890625, "loss/reg": 0.021618474274873734, "step": 661 }, { "epoch": 0.08275, "grad_norm": 3.7091660499572754, "grad_norm_var": 0.6149151800980365, "learning_rate": 0.0001, "loss": 1.1901, "loss/crossentropy": 2.631565809249878, "loss/hidden": 0.8125, "loss/logits": 0.16149985790252686, "loss/reg": 0.021609637886285782, "step": 662 }, { "epoch": 0.082875, "grad_norm": 3.1449224948883057, "grad_norm_var": 0.6264170707357067, "learning_rate": 0.0001, "loss": 1.1491, "loss/crossentropy": 2.2890501022338867, "loss/hidden": 0.78515625, "loss/logits": 0.14797118306159973, "loss/reg": 0.021600957959890366, "step": 663 }, { "epoch": 0.083, "grad_norm": 3.096752405166626, "grad_norm_var": 0.6512152784754629, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.767069101333618, "loss/hidden": 0.7421875, "loss/logits": 0.1343650072813034, "loss/reg": 0.021592585369944572, "step": 664 }, { "epoch": 0.083125, "grad_norm": 3.8935883045196533, "grad_norm_var": 0.6273466460071402, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.4779109954833984, "loss/hidden": 0.7578125, "loss/logits": 0.12599951028823853, "loss/reg": 0.021583350375294685, "step": 665 }, { "epoch": 0.08325, "grad_norm": 3.6881141662597656, "grad_norm_var": 0.5627883047301658, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.1603968143463135, "loss/hidden": 0.76171875, "loss/logits": 0.14095276594161987, "loss/reg": 0.021574225276708603, "step": 666 }, { "epoch": 0.083375, "grad_norm": 2.9750800132751465, "grad_norm_var": 0.5535713466115603, "learning_rate": 0.0001, "loss": 1.1215, "loss/crossentropy": 2.4817895889282227, "loss/hidden": 0.75390625, "loss/logits": 0.15194162726402283, "loss/reg": 0.02156493254005909, "step": 667 }, { "epoch": 0.0835, "grad_norm": 2.649543046951294, "grad_norm_var": 0.5152581471177806, "learning_rate": 0.0001, "loss": 0.918, "loss/crossentropy": 2.334226608276367, "loss/hidden": 0.59375, "loss/logits": 0.10870292782783508, "loss/reg": 0.02155502513051033, "step": 668 }, { "epoch": 0.083625, "grad_norm": 4.535495281219482, "grad_norm_var": 0.5520843285134825, "learning_rate": 0.0001, "loss": 1.168, "loss/crossentropy": 2.4633467197418213, "loss/hidden": 0.796875, "loss/logits": 0.15570059418678284, "loss/reg": 0.021545063704252243, "step": 669 }, { "epoch": 0.08375, "grad_norm": 3.3291449546813965, "grad_norm_var": 0.5404115229162234, "learning_rate": 0.0001, "loss": 1.1241, "loss/crossentropy": 2.317607879638672, "loss/hidden": 0.78515625, "loss/logits": 0.12363065779209137, "loss/reg": 0.021535001695156097, "step": 670 }, { "epoch": 0.083875, "grad_norm": 4.071917533874512, "grad_norm_var": 0.5459456401930327, "learning_rate": 0.0001, "loss": 1.1652, "loss/crossentropy": 2.4951958656311035, "loss/hidden": 0.83203125, "loss/logits": 0.11796893179416656, "loss/reg": 0.021524924784898758, "step": 671 }, { "epoch": 0.084, "grad_norm": 4.647782802581787, "grad_norm_var": 0.6047501670354971, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.6994447708129883, "loss/hidden": 0.78125, "loss/logits": 0.16059228777885437, "loss/reg": 0.021514689549803734, "step": 672 }, { "epoch": 0.084125, "grad_norm": 4.240062713623047, "grad_norm_var": 0.6201999433703328, "learning_rate": 0.0001, "loss": 1.0984, "loss/crossentropy": 2.592426300048828, "loss/hidden": 0.7734375, "loss/logits": 0.10989370942115784, "loss/reg": 0.021505359560251236, "step": 673 }, { "epoch": 0.08425, "grad_norm": 15.647865295410156, "grad_norm_var": 9.451818582857973, "learning_rate": 0.0001, "loss": 1.6418, "loss/crossentropy": 2.723198413848877, "loss/hidden": 1.078125, "loss/logits": 0.3486996293067932, "loss/reg": 0.02149534970521927, "step": 674 }, { "epoch": 0.084375, "grad_norm": 3.6480963230133057, "grad_norm_var": 9.365638761154676, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.618795871734619, "loss/hidden": 0.734375, "loss/logits": 0.1559816300868988, "loss/reg": 0.02148519456386566, "step": 675 }, { "epoch": 0.0845, "grad_norm": 3.5789527893066406, "grad_norm_var": 9.352796045350159, "learning_rate": 0.0001, "loss": 1.1092, "loss/crossentropy": 2.4817607402801514, "loss/hidden": 0.75, "loss/logits": 0.14444658160209656, "loss/reg": 0.021475963294506073, "step": 676 }, { "epoch": 0.084625, "grad_norm": 3.143718719482422, "grad_norm_var": 9.352704277495931, "learning_rate": 0.0001, "loss": 1.0394, "loss/crossentropy": 2.28035044670105, "loss/hidden": 0.70703125, "loss/logits": 0.11767329275608063, "loss/reg": 0.02146601676940918, "step": 677 }, { "epoch": 0.08475, "grad_norm": 4.2889580726623535, "grad_norm_var": 9.322240526517621, "learning_rate": 0.0001, "loss": 1.1067, "loss/crossentropy": 2.28658390045166, "loss/hidden": 0.7109375, "loss/logits": 0.18121060729026794, "loss/reg": 0.021456118673086166, "step": 678 }, { "epoch": 0.084875, "grad_norm": 2.800516128540039, "grad_norm_var": 9.387804829955044, "learning_rate": 0.0001, "loss": 0.975, "loss/crossentropy": 2.699044704437256, "loss/hidden": 0.63671875, "loss/logits": 0.12379397451877594, "loss/reg": 0.02144702896475792, "step": 679 }, { "epoch": 0.085, "grad_norm": 3.064215898513794, "grad_norm_var": 9.393480165725077, "learning_rate": 0.0001, "loss": 1.1186, "loss/crossentropy": 2.547557830810547, "loss/hidden": 0.76171875, "loss/logits": 0.14249341189861298, "loss/reg": 0.02143782004714012, "step": 680 }, { "epoch": 0.085125, "grad_norm": 3.7314915657043457, "grad_norm_var": 9.405801361337378, "learning_rate": 0.0001, "loss": 1.1376, "loss/crossentropy": 2.705601930618286, "loss/hidden": 0.7734375, "loss/logits": 0.14988452196121216, "loss/reg": 0.02142806351184845, "step": 681 }, { "epoch": 0.08525, "grad_norm": 2.256387948989868, "grad_norm_var": 9.665529326305519, "learning_rate": 0.0001, "loss": 0.945, "loss/crossentropy": 2.464989423751831, "loss/hidden": 0.62109375, "loss/logits": 0.10975323617458344, "loss/reg": 0.02141808532178402, "step": 682 }, { "epoch": 0.085375, "grad_norm": 3.150348424911499, "grad_norm_var": 9.636765682886749, "learning_rate": 0.0001, "loss": 1.0573, "loss/crossentropy": 2.2520275115966797, "loss/hidden": 0.72265625, "loss/logits": 0.12059713900089264, "loss/reg": 0.021408328786492348, "step": 683 }, { "epoch": 0.0855, "grad_norm": 2.973684549331665, "grad_norm_var": 9.572043410499656, "learning_rate": 0.0001, "loss": 1.0663, "loss/crossentropy": 2.4148662090301514, "loss/hidden": 0.73828125, "loss/logits": 0.11397817730903625, "loss/reg": 0.021399127319455147, "step": 684 }, { "epoch": 0.085625, "grad_norm": 4.39288330078125, "grad_norm_var": 9.56920341692891, "learning_rate": 0.0001, "loss": 1.206, "loss/crossentropy": 2.5025415420532227, "loss/hidden": 0.84765625, "loss/logits": 0.1444375216960907, "loss/reg": 0.021389208734035492, "step": 685 }, { "epoch": 0.08575, "grad_norm": 2.815019369125366, "grad_norm_var": 9.652987248771876, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.236358165740967, "loss/hidden": 0.72265625, "loss/logits": 0.1266387552022934, "loss/reg": 0.021379247307777405, "step": 686 }, { "epoch": 0.085875, "grad_norm": 3.6744375228881836, "grad_norm_var": 9.673796390527396, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.1673667430877686, "loss/hidden": 0.8671875, "loss/logits": 0.1714528650045395, "loss/reg": 0.02136901021003723, "step": 687 }, { "epoch": 0.086, "grad_norm": 3.169265031814575, "grad_norm_var": 9.732675648460468, "learning_rate": 0.0001, "loss": 1.1389, "loss/crossentropy": 2.45385479927063, "loss/hidden": 0.79296875, "loss/logits": 0.13236522674560547, "loss/reg": 0.021359853446483612, "step": 688 }, { "epoch": 0.086125, "grad_norm": 7.017651557922363, "grad_norm_var": 10.2441458601344, "learning_rate": 0.0001, "loss": 1.4071, "loss/crossentropy": 2.344740629196167, "loss/hidden": 1.0625, "loss/logits": 0.13107535243034363, "loss/reg": 0.021349839866161346, "step": 689 }, { "epoch": 0.08625, "grad_norm": 3.5523998737335205, "grad_norm_var": 1.142674868302701, "learning_rate": 0.0001, "loss": 1.1408, "loss/crossentropy": 2.323866367340088, "loss/hidden": 0.79296875, "loss/logits": 0.13442741334438324, "loss/reg": 0.021340306848287582, "step": 690 }, { "epoch": 0.086375, "grad_norm": 3.2477400302886963, "grad_norm_var": 1.1489843436981233, "learning_rate": 0.0001, "loss": 1.0067, "loss/crossentropy": 2.672182083129883, "loss/hidden": 0.67578125, "loss/logits": 0.11757320165634155, "loss/reg": 0.021331045776605606, "step": 691 }, { "epoch": 0.0865, "grad_norm": 3.1238696575164795, "grad_norm_var": 1.1603900529546722, "learning_rate": 0.0001, "loss": 1.1122, "loss/crossentropy": 2.510279893875122, "loss/hidden": 0.765625, "loss/logits": 0.13339656591415405, "loss/reg": 0.021321000531315804, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.552560329437256, "grad_norm_var": 1.2122975827491755, "learning_rate": 0.0001, "loss": 1.0396, "loss/crossentropy": 2.5303587913513184, "loss/hidden": 0.6953125, "loss/logits": 0.13117440044879913, "loss/reg": 0.021313220262527466, "step": 693 }, { "epoch": 0.08675, "grad_norm": 2.7075002193450928, "grad_norm_var": 1.1997649773340213, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.4859445095062256, "loss/hidden": 0.73828125, "loss/logits": 0.12738223373889923, "loss/reg": 0.021303845569491386, "step": 694 }, { "epoch": 0.086875, "grad_norm": 3.2640225887298584, "grad_norm_var": 1.1768004922088529, "learning_rate": 0.0001, "loss": 1.3383, "loss/crossentropy": 2.4017555713653564, "loss/hidden": 0.96875, "loss/logits": 0.15659019351005554, "loss/reg": 0.021294469013810158, "step": 695 }, { "epoch": 0.087, "grad_norm": 3.5289857387542725, "grad_norm_var": 1.1683562063707291, "learning_rate": 0.0001, "loss": 1.2607, "loss/crossentropy": 2.2716236114501953, "loss/hidden": 0.87109375, "loss/logits": 0.17679363489151, "loss/reg": 0.021284854039549828, "step": 696 }, { "epoch": 0.087125, "grad_norm": 8.924003601074219, "grad_norm_var": 3.0501856400161764, "learning_rate": 0.0001, "loss": 1.474, "loss/crossentropy": 2.3684239387512207, "loss/hidden": 1.0546875, "loss/logits": 0.2065410166978836, "loss/reg": 0.02127666585147381, "step": 697 }, { "epoch": 0.08725, "grad_norm": 4.2463788986206055, "grad_norm_var": 2.8955696375948605, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.234137535095215, "loss/hidden": 0.828125, "loss/logits": 0.1427059769630432, "loss/reg": 0.021268585696816444, "step": 698 }, { "epoch": 0.087375, "grad_norm": 3.0776546001434326, "grad_norm_var": 2.903130025314302, "learning_rate": 0.0001, "loss": 1.0559, "loss/crossentropy": 2.4638020992279053, "loss/hidden": 0.7265625, "loss/logits": 0.1167045459151268, "loss/reg": 0.02125934511423111, "step": 699 }, { "epoch": 0.0875, "grad_norm": 3.9374022483825684, "grad_norm_var": 2.843209099820052, "learning_rate": 0.0001, "loss": 0.9629, "loss/crossentropy": 2.88035249710083, "loss/hidden": 0.63671875, "loss/logits": 0.11369955539703369, "loss/reg": 0.021249722689390182, "step": 700 }, { "epoch": 0.087625, "grad_norm": 4.235450744628906, "grad_norm_var": 2.8355032825089417, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.2560718059539795, "loss/hidden": 0.8828125, "loss/logits": 0.1828073114156723, "loss/reg": 0.0212401133030653, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.3705031871795654, "grad_norm_var": 2.9146564397348773, "learning_rate": 0.0001, "loss": 0.9899, "loss/crossentropy": 2.531632661819458, "loss/hidden": 0.64453125, "loss/logits": 0.13305732607841492, "loss/reg": 0.021231388673186302, "step": 702 }, { "epoch": 0.087875, "grad_norm": 3.5063636302948, "grad_norm_var": 2.921798711310286, "learning_rate": 0.0001, "loss": 1.0524, "loss/crossentropy": 2.548243999481201, "loss/hidden": 0.72265625, "loss/logits": 0.11747653782367706, "loss/reg": 0.02122276835143566, "step": 703 }, { "epoch": 0.088, "grad_norm": 3.141618013381958, "grad_norm_var": 2.924554396554724, "learning_rate": 0.0001, "loss": 0.9087, "loss/crossentropy": 2.3723928928375244, "loss/hidden": 0.59375, "loss/logits": 0.10285645723342896, "loss/reg": 0.021213354542851448, "step": 704 }, { "epoch": 0.088125, "grad_norm": 4.195517063140869, "grad_norm_var": 2.2500098957229806, "learning_rate": 0.0001, "loss": 1.1263, "loss/crossentropy": 2.383502244949341, "loss/hidden": 0.765625, "loss/logits": 0.14867368340492249, "loss/reg": 0.021204529330134392, "step": 705 }, { "epoch": 0.08825, "grad_norm": 3.646482467651367, "grad_norm_var": 2.2483885758775686, "learning_rate": 0.0001, "loss": 1.1459, "loss/crossentropy": 2.239716053009033, "loss/hidden": 0.79296875, "loss/logits": 0.14099720120429993, "loss/reg": 0.021195242181420326, "step": 706 }, { "epoch": 0.088375, "grad_norm": 3.3426883220672607, "grad_norm_var": 2.242826109053838, "learning_rate": 0.0001, "loss": 1.0323, "loss/crossentropy": 2.270444869995117, "loss/hidden": 0.7109375, "loss/logits": 0.10951918363571167, "loss/reg": 0.021186839789152145, "step": 707 }, { "epoch": 0.0885, "grad_norm": 3.083806037902832, "grad_norm_var": 2.2462046620558, "learning_rate": 0.0001, "loss": 1.086, "loss/crossentropy": 2.5013253688812256, "loss/hidden": 0.75390625, "loss/logits": 0.12026840448379517, "loss/reg": 0.021178435534238815, "step": 708 }, { "epoch": 0.088625, "grad_norm": 3.71588134765625, "grad_norm_var": 2.147370219186798, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.342475175857544, "loss/hidden": 0.76953125, "loss/logits": 0.13124068081378937, "loss/reg": 0.021169869229197502, "step": 709 }, { "epoch": 0.08875, "grad_norm": 3.4402713775634766, "grad_norm_var": 2.0734307300644255, "learning_rate": 0.0001, "loss": 1.1229, "loss/crossentropy": 2.4113690853118896, "loss/hidden": 0.76171875, "loss/logits": 0.14960095286369324, "loss/reg": 0.021161576732993126, "step": 710 }, { "epoch": 0.088875, "grad_norm": 10.640914916992188, "grad_norm_var": 4.894724677276531, "learning_rate": 0.0001, "loss": 1.7804, "loss/crossentropy": 2.6555004119873047, "loss/hidden": 1.328125, "loss/logits": 0.240725576877594, "loss/reg": 0.02115357480943203, "step": 711 }, { "epoch": 0.089, "grad_norm": 3.23919415473938, "grad_norm_var": 4.930329406483421, "learning_rate": 0.0001, "loss": 1.1368, "loss/crossentropy": 2.4652106761932373, "loss/hidden": 0.77734375, "loss/logits": 0.14798855781555176, "loss/reg": 0.021144360303878784, "step": 712 }, { "epoch": 0.089125, "grad_norm": 2.8362622261047363, "grad_norm_var": 3.4904838717428524, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.216999053955078, "loss/hidden": 0.7265625, "loss/logits": 0.1288391500711441, "loss/reg": 0.021136239171028137, "step": 713 }, { "epoch": 0.08925, "grad_norm": 5.395811080932617, "grad_norm_var": 3.623687874884585, "learning_rate": 0.0001, "loss": 1.2022, "loss/crossentropy": 2.463663101196289, "loss/hidden": 0.82421875, "loss/logits": 0.1667168289422989, "loss/reg": 0.02112707309424877, "step": 714 }, { "epoch": 0.089375, "grad_norm": 3.475656032562256, "grad_norm_var": 3.585286252049487, "learning_rate": 0.0001, "loss": 1.1524, "loss/crossentropy": 2.4688560962677, "loss/hidden": 0.796875, "loss/logits": 0.14434993267059326, "loss/reg": 0.02111782319843769, "step": 715 }, { "epoch": 0.0895, "grad_norm": 4.29054594039917, "grad_norm_var": 3.5895333664829043, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.4820876121520996, "loss/hidden": 0.7578125, "loss/logits": 0.15247562527656555, "loss/reg": 0.021108638495206833, "step": 716 }, { "epoch": 0.089625, "grad_norm": 2.927002429962158, "grad_norm_var": 3.661532010616088, "learning_rate": 0.0001, "loss": 1.0904, "loss/crossentropy": 2.499390125274658, "loss/hidden": 0.734375, "loss/logits": 0.14508014917373657, "loss/reg": 0.021099381148815155, "step": 717 }, { "epoch": 0.08975, "grad_norm": 2.529557943344116, "grad_norm_var": 3.629551988733732, "learning_rate": 0.0001, "loss": 1.1194, "loss/crossentropy": 2.395061492919922, "loss/hidden": 0.76953125, "loss/logits": 0.13898837566375732, "loss/reg": 0.021089982241392136, "step": 718 }, { "epoch": 0.089875, "grad_norm": 3.2681446075439453, "grad_norm_var": 3.6476018392648397, "learning_rate": 0.0001, "loss": 1.0036, "loss/crossentropy": 2.65079927444458, "loss/hidden": 0.6640625, "loss/logits": 0.12875254452228546, "loss/reg": 0.021080130711197853, "step": 719 }, { "epoch": 0.09, "grad_norm": 2.6700246334075928, "grad_norm_var": 3.7122117675621022, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.428039789199829, "loss/hidden": 0.65234375, "loss/logits": 0.12986770272254944, "loss/reg": 0.021070368587970734, "step": 720 }, { "epoch": 0.090125, "grad_norm": 3.296482563018799, "grad_norm_var": 3.7295350110356558, "learning_rate": 0.0001, "loss": 1.1571, "loss/crossentropy": 2.317190408706665, "loss/hidden": 0.80078125, "loss/logits": 0.14570315182209015, "loss/reg": 0.02106117643415928, "step": 721 }, { "epoch": 0.09025, "grad_norm": 4.451722145080566, "grad_norm_var": 3.74687645800365, "learning_rate": 0.0001, "loss": 1.1385, "loss/crossentropy": 2.2958080768585205, "loss/hidden": 0.78125, "loss/logits": 0.146757572889328, "loss/reg": 0.021051928400993347, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.4288058280944824, "grad_norm_var": 3.8685376080960414, "learning_rate": 0.0001, "loss": 0.9788, "loss/crossentropy": 2.4692001342773438, "loss/hidden": 0.65234375, "loss/logits": 0.11606692522764206, "loss/reg": 0.021042969077825546, "step": 723 }, { "epoch": 0.0905, "grad_norm": 45.21147918701172, "grad_norm_var": 110.45448625779909, "learning_rate": 0.0001, "loss": 1.0823, "loss/crossentropy": 2.4428551197052, "loss/hidden": 0.7421875, "loss/logits": 0.12983539700508118, "loss/reg": 0.021031970158219337, "step": 724 }, { "epoch": 0.090625, "grad_norm": 3.7111551761627197, "grad_norm_var": 110.45623490585022, "learning_rate": 0.0001, "loss": 1.4512, "loss/crossentropy": 2.1149277687072754, "loss/hidden": 1.046875, "loss/logits": 0.1941366195678711, "loss/reg": 0.021020574495196342, "step": 725 }, { "epoch": 0.09075, "grad_norm": 3.3139920234680176, "grad_norm_var": 110.50855221427315, "learning_rate": 0.0001, "loss": 1.0686, "loss/crossentropy": 2.798910140991211, "loss/hidden": 0.71875, "loss/logits": 0.13971024751663208, "loss/reg": 0.021009519696235657, "step": 726 }, { "epoch": 0.090875, "grad_norm": 2.5206596851348877, "grad_norm_var": 110.12514261997971, "learning_rate": 0.0001, "loss": 1.0391, "loss/crossentropy": 2.400813579559326, "loss/hidden": 0.6953125, "loss/logits": 0.13384617865085602, "loss/reg": 0.020998528227210045, "step": 727 }, { "epoch": 0.091, "grad_norm": 2.698018789291382, "grad_norm_var": 110.34070270952832, "learning_rate": 0.0001, "loss": 0.928, "loss/crossentropy": 2.505309820175171, "loss/hidden": 0.61328125, "loss/logits": 0.10482652485370636, "loss/reg": 0.020989248529076576, "step": 728 }, { "epoch": 0.091125, "grad_norm": 6.086211204528809, "grad_norm_var": 109.65630388036335, "learning_rate": 0.0001, "loss": 1.4543, "loss/crossentropy": 2.065880537033081, "loss/hidden": 1.1171875, "loss/logits": 0.12730881571769714, "loss/reg": 0.020978538319468498, "step": 729 }, { "epoch": 0.09125, "grad_norm": 2.527377128601074, "grad_norm_var": 110.45601242879228, "learning_rate": 0.0001, "loss": 0.9281, "loss/crossentropy": 2.575125217437744, "loss/hidden": 0.60546875, "loss/logits": 0.11296658217906952, "loss/reg": 0.020967954769730568, "step": 730 }, { "epoch": 0.091375, "grad_norm": 2.405383825302124, "grad_norm_var": 110.88254605251709, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.1985392570495605, "loss/hidden": 0.671875, "loss/logits": 0.1190965548157692, "loss/reg": 0.02095715142786503, "step": 731 }, { "epoch": 0.0915, "grad_norm": 5.095945835113525, "grad_norm_var": 110.75067974759948, "learning_rate": 0.0001, "loss": 1.2535, "loss/crossentropy": 2.451446056365967, "loss/hidden": 0.88671875, "loss/logits": 0.15732397139072418, "loss/reg": 0.020946422591805458, "step": 732 }, { "epoch": 0.091625, "grad_norm": 2.497173547744751, "grad_norm_var": 110.93526847423995, "learning_rate": 0.0001, "loss": 1.1301, "loss/crossentropy": 2.4981319904327393, "loss/hidden": 0.76953125, "loss/logits": 0.15116086602210999, "loss/reg": 0.020936597138643265, "step": 733 }, { "epoch": 0.09175, "grad_norm": 4.977592468261719, "grad_norm_var": 110.20332761050602, "learning_rate": 0.0001, "loss": 1.0306, "loss/crossentropy": 2.37424898147583, "loss/hidden": 0.703125, "loss/logits": 0.11818103492259979, "loss/reg": 0.020926134660840034, "step": 734 }, { "epoch": 0.091875, "grad_norm": 2.8039772510528564, "grad_norm_var": 110.39035266849663, "learning_rate": 0.0001, "loss": 0.9389, "loss/crossentropy": 2.5503880977630615, "loss/hidden": 0.62109375, "loss/logits": 0.1085958182811737, "loss/reg": 0.02091672271490097, "step": 735 }, { "epoch": 0.092, "grad_norm": 3.418717861175537, "grad_norm_var": 110.08862675247053, "learning_rate": 0.0001, "loss": 0.9544, "loss/crossentropy": 2.6695592403411865, "loss/hidden": 0.62890625, "loss/logits": 0.11641087383031845, "loss/reg": 0.02090657874941826, "step": 736 }, { "epoch": 0.092125, "grad_norm": 2.807041883468628, "grad_norm_var": 110.28591938740921, "learning_rate": 0.0001, "loss": 1.1135, "loss/crossentropy": 2.4328622817993164, "loss/hidden": 0.76953125, "loss/logits": 0.13500146567821503, "loss/reg": 0.02089635282754898, "step": 737 }, { "epoch": 0.09225, "grad_norm": 3.713057518005371, "grad_norm_var": 110.4783888232826, "learning_rate": 0.0001, "loss": 1.1755, "loss/crossentropy": 2.0374624729156494, "loss/hidden": 0.83984375, "loss/logits": 0.12679257988929749, "loss/reg": 0.02088700234889984, "step": 738 }, { "epoch": 0.092375, "grad_norm": 3.618948459625244, "grad_norm_var": 109.99807079993953, "learning_rate": 0.0001, "loss": 0.9255, "loss/crossentropy": 2.536527395248413, "loss/hidden": 0.609375, "loss/logits": 0.10737244784832001, "loss/reg": 0.020877836272120476, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.815113067626953, "grad_norm_var": 1.1792510177041378, "learning_rate": 0.0001, "loss": 0.9755, "loss/crossentropy": 2.464996337890625, "loss/hidden": 0.640625, "loss/logits": 0.12616491317749023, "loss/reg": 0.020868681371212006, "step": 740 }, { "epoch": 0.092625, "grad_norm": 3.194117546081543, "grad_norm_var": 1.1771383378848062, "learning_rate": 0.0001, "loss": 1.1989, "loss/crossentropy": 2.325310707092285, "loss/hidden": 0.84375, "loss/logits": 0.14658081531524658, "loss/reg": 0.020859118551015854, "step": 741 }, { "epoch": 0.09275, "grad_norm": 2.970301628112793, "grad_norm_var": 1.1887296793511715, "learning_rate": 0.0001, "loss": 0.9939, "loss/crossentropy": 2.4747390747070312, "loss/hidden": 0.66015625, "loss/logits": 0.1252739280462265, "loss/reg": 0.020850006490945816, "step": 742 }, { "epoch": 0.092875, "grad_norm": 3.7745604515075684, "grad_norm_var": 1.1425983881417718, "learning_rate": 0.0001, "loss": 0.9835, "loss/crossentropy": 2.4191653728485107, "loss/hidden": 0.67578125, "loss/logits": 0.09930374473333359, "loss/reg": 0.020840618759393692, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.8048055171966553, "grad_norm_var": 1.132423092522494, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 2.764143228530884, "loss/hidden": 0.75390625, "loss/logits": 0.12075912207365036, "loss/reg": 0.020830942317843437, "step": 744 }, { "epoch": 0.093125, "grad_norm": 9.04628849029541, "grad_norm_var": 2.712848654929009, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.2979469299316406, "loss/hidden": 1.125, "loss/logits": 0.1446615606546402, "loss/reg": 0.02082117274403572, "step": 745 }, { "epoch": 0.09325, "grad_norm": 3.1999406814575195, "grad_norm_var": 2.6400540651182967, "learning_rate": 0.0001, "loss": 1.0424, "loss/crossentropy": 2.6157429218292236, "loss/hidden": 0.71875, "loss/logits": 0.11555971205234528, "loss/reg": 0.020811092108488083, "step": 746 }, { "epoch": 0.093375, "grad_norm": 3.6833674907684326, "grad_norm_var": 2.522139333113606, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.6110498905181885, "loss/hidden": 0.7578125, "loss/logits": 0.13881847262382507, "loss/reg": 0.020801017060875893, "step": 747 }, { "epoch": 0.0935, "grad_norm": 3.073922872543335, "grad_norm_var": 2.4218973518929827, "learning_rate": 0.0001, "loss": 0.9672, "loss/crossentropy": 2.0898826122283936, "loss/hidden": 0.65625, "loss/logits": 0.10303568840026855, "loss/reg": 0.020791731774806976, "step": 748 }, { "epoch": 0.093625, "grad_norm": 2.9481358528137207, "grad_norm_var": 2.365294319547022, "learning_rate": 0.0001, "loss": 0.9589, "loss/crossentropy": 2.477987766265869, "loss/hidden": 0.640625, "loss/logits": 0.11042475700378418, "loss/reg": 0.02078239433467388, "step": 749 }, { "epoch": 0.09375, "grad_norm": 5.792114734649658, "grad_norm_var": 2.5478865053407236, "learning_rate": 0.0001, "loss": 1.2395, "loss/crossentropy": 2.0092225074768066, "loss/hidden": 0.87109375, "loss/logits": 0.16063663363456726, "loss/reg": 0.020772725343704224, "step": 750 }, { "epoch": 0.093875, "grad_norm": 3.148350954055786, "grad_norm_var": 2.512823601683457, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.669532537460327, "loss/hidden": 0.78125, "loss/logits": 0.15109741687774658, "loss/reg": 0.020762871950864792, "step": 751 }, { "epoch": 0.094, "grad_norm": 3.2779595851898193, "grad_norm_var": 2.5202896391695124, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.477522373199463, "loss/hidden": 0.8125, "loss/logits": 0.1291784942150116, "loss/reg": 0.020753389224410057, "step": 752 }, { "epoch": 0.094125, "grad_norm": 2.3718457221984863, "grad_norm_var": 2.586364485192132, "learning_rate": 0.0001, "loss": 1.0399, "loss/crossentropy": 2.497688055038452, "loss/hidden": 0.69921875, "loss/logits": 0.13327988982200623, "loss/reg": 0.02074403502047062, "step": 753 }, { "epoch": 0.09425, "grad_norm": 4.972538948059082, "grad_norm_var": 2.6852568725766104, "learning_rate": 0.0001, "loss": 1.2511, "loss/crossentropy": 2.232241630554199, "loss/hidden": 0.90234375, "loss/logits": 0.1414394974708557, "loss/reg": 0.020734604448080063, "step": 754 }, { "epoch": 0.094375, "grad_norm": 2.6991426944732666, "grad_norm_var": 2.7595134043336267, "learning_rate": 0.0001, "loss": 1.001, "loss/crossentropy": 2.596348285675049, "loss/hidden": 0.6640625, "loss/logits": 0.1297152191400528, "loss/reg": 0.020725268870592117, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.8633017539978027, "grad_norm_var": 2.753743097466793, "learning_rate": 0.0001, "loss": 1.065, "loss/crossentropy": 2.3198070526123047, "loss/hidden": 0.74609375, "loss/logits": 0.11169925332069397, "loss/reg": 0.020716087892651558, "step": 756 }, { "epoch": 0.094625, "grad_norm": 3.628239154815674, "grad_norm_var": 2.733994536045853, "learning_rate": 0.0001, "loss": 1.3141, "loss/crossentropy": 2.1950411796569824, "loss/hidden": 0.92578125, "loss/logits": 0.1812862753868103, "loss/reg": 0.020707255229353905, "step": 757 }, { "epoch": 0.09475, "grad_norm": 2.805727958679199, "grad_norm_var": 2.753145827217212, "learning_rate": 0.0001, "loss": 1.109, "loss/crossentropy": 2.40027117729187, "loss/hidden": 0.76953125, "loss/logits": 0.13251210749149323, "loss/reg": 0.02069801278412342, "step": 758 }, { "epoch": 0.094875, "grad_norm": 3.1892051696777344, "grad_norm_var": 2.7730842000576295, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.2343788146972656, "loss/hidden": 0.77734375, "loss/logits": 0.1382053792476654, "loss/reg": 0.020689615979790688, "step": 759 }, { "epoch": 0.095, "grad_norm": 2.925913095474243, "grad_norm_var": 2.759237877311041, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.5283889770507812, "loss/hidden": 0.6640625, "loss/logits": 0.11487281322479248, "loss/reg": 0.02068025805056095, "step": 760 }, { "epoch": 0.095125, "grad_norm": 4.760406970977783, "grad_norm_var": 0.8673601536624308, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.8629026412963867, "loss/hidden": 0.9765625, "loss/logits": 0.1973596215248108, "loss/reg": 0.020673030987381935, "step": 761 }, { "epoch": 0.09525, "grad_norm": 2.6182596683502197, "grad_norm_var": 0.9085803501248163, "learning_rate": 0.0001, "loss": 0.9554, "loss/crossentropy": 2.6935925483703613, "loss/hidden": 0.6328125, "loss/logits": 0.11598716676235199, "loss/reg": 0.020663931965827942, "step": 762 }, { "epoch": 0.095375, "grad_norm": 3.404240131378174, "grad_norm_var": 0.9037375089777697, "learning_rate": 0.0001, "loss": 1.0173, "loss/crossentropy": 2.5958027839660645, "loss/hidden": 0.6953125, "loss/logits": 0.11542315781116486, "loss/reg": 0.020654823631048203, "step": 763 }, { "epoch": 0.0955, "grad_norm": 3.0021934509277344, "grad_norm_var": 0.9072250591900151, "learning_rate": 0.0001, "loss": 0.9811, "loss/crossentropy": 2.4611666202545166, "loss/hidden": 0.6640625, "loss/logits": 0.11060373485088348, "loss/reg": 0.020645687356591225, "step": 764 }, { "epoch": 0.095625, "grad_norm": 4.123987674713135, "grad_norm_var": 0.9227216736856043, "learning_rate": 0.0001, "loss": 1.2112, "loss/crossentropy": 2.327146530151367, "loss/hidden": 0.87109375, "loss/logits": 0.13372407853603363, "loss/reg": 0.020637821406126022, "step": 765 }, { "epoch": 0.09575, "grad_norm": 3.835116147994995, "grad_norm_var": 0.5572045887439032, "learning_rate": 0.0001, "loss": 1.0062, "loss/crossentropy": 2.5438876152038574, "loss/hidden": 0.67578125, "loss/logits": 0.12415439635515213, "loss/reg": 0.02063015103340149, "step": 766 }, { "epoch": 0.095875, "grad_norm": 3.2649309635162354, "grad_norm_var": 0.5548939110280107, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.6621615886688232, "loss/hidden": 0.80078125, "loss/logits": 0.15938454866409302, "loss/reg": 0.020622732117772102, "step": 767 }, { "epoch": 0.096, "grad_norm": 3.398061752319336, "grad_norm_var": 0.554498685348062, "learning_rate": 0.0001, "loss": 0.852, "loss/crossentropy": 2.3231897354125977, "loss/hidden": 0.5625, "loss/logits": 0.08329755067825317, "loss/reg": 0.0206154715269804, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.8744146823883057, "grad_norm_var": 0.5036373977995244, "learning_rate": 0.0001, "loss": 0.9094, "loss/crossentropy": 2.5140726566314697, "loss/hidden": 0.6171875, "loss/logits": 0.0860939770936966, "loss/reg": 0.020607706159353256, "step": 769 }, { "epoch": 0.09625, "grad_norm": 8.404803276062012, "grad_norm_var": 1.9605456650252857, "learning_rate": 0.0001, "loss": 1.614, "loss/crossentropy": 2.4072842597961426, "loss/hidden": 1.1328125, "loss/logits": 0.2751774787902832, "loss/reg": 0.020600339397788048, "step": 770 }, { "epoch": 0.096375, "grad_norm": 2.581511974334717, "grad_norm_var": 1.975733645477993, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.3724427223205566, "loss/hidden": 0.73828125, "loss/logits": 0.1358003169298172, "loss/reg": 0.02059323526918888, "step": 771 }, { "epoch": 0.0965, "grad_norm": 3.9817845821380615, "grad_norm_var": 1.9433082266341482, "learning_rate": 0.0001, "loss": 1.0926, "loss/crossentropy": 2.459740400314331, "loss/hidden": 0.765625, "loss/logits": 0.12109124660491943, "loss/reg": 0.02058546058833599, "step": 772 }, { "epoch": 0.096625, "grad_norm": 7.001491069793701, "grad_norm_var": 2.633487351928299, "learning_rate": 0.0001, "loss": 1.1253, "loss/crossentropy": 2.6342828273773193, "loss/hidden": 0.8125, "loss/logits": 0.10705184936523438, "loss/reg": 0.020576275885105133, "step": 773 }, { "epoch": 0.09675, "grad_norm": 2.4826319217681885, "grad_norm_var": 2.6865387021083524, "learning_rate": 0.0001, "loss": 1.1239, "loss/crossentropy": 2.537883996963501, "loss/hidden": 0.7734375, "loss/logits": 0.1447601616382599, "loss/reg": 0.02056770585477352, "step": 774 }, { "epoch": 0.096875, "grad_norm": 3.5470290184020996, "grad_norm_var": 2.6622723084153237, "learning_rate": 0.0001, "loss": 1.1572, "loss/crossentropy": 2.572312831878662, "loss/hidden": 0.80078125, "loss/logits": 0.15083444118499756, "loss/reg": 0.020558428019285202, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.8388142585754395, "grad_norm_var": 2.673918444962514, "learning_rate": 0.0001, "loss": 1.1426, "loss/crossentropy": 2.2697503566741943, "loss/hidden": 0.79296875, "loss/logits": 0.14412574470043182, "loss/reg": 0.020549749955534935, "step": 776 }, { "epoch": 0.097125, "grad_norm": 2.9166338443756104, "grad_norm_var": 2.670560695292122, "learning_rate": 0.0001, "loss": 0.8926, "loss/crossentropy": 2.4288480281829834, "loss/hidden": 0.58203125, "loss/logits": 0.10516718029975891, "loss/reg": 0.02054043672978878, "step": 777 }, { "epoch": 0.09725, "grad_norm": 3.565744161605835, "grad_norm_var": 2.58151597609506, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.444842576980591, "loss/hidden": 0.77734375, "loss/logits": 0.1472683846950531, "loss/reg": 0.020531047135591507, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.8829805850982666, "grad_norm_var": 2.627842889624617, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.548234462738037, "loss/hidden": 0.78515625, "loss/logits": 0.1478501409292221, "loss/reg": 0.02052178978919983, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.908299207687378, "grad_norm_var": 2.6383052442278556, "learning_rate": 0.0001, "loss": 1.069, "loss/crossentropy": 2.3388137817382812, "loss/hidden": 0.734375, "loss/logits": 0.12949424982070923, "loss/reg": 0.020512979477643967, "step": 780 }, { "epoch": 0.097625, "grad_norm": 2.5852208137512207, "grad_norm_var": 2.717361748364273, "learning_rate": 0.0001, "loss": 0.9782, "loss/crossentropy": 2.4337894916534424, "loss/hidden": 0.65625, "loss/logits": 0.11689651757478714, "loss/reg": 0.020503604784607887, "step": 781 }, { "epoch": 0.09775, "grad_norm": 5.328097820281982, "grad_norm_var": 2.885194693951976, "learning_rate": 0.0001, "loss": 1.2053, "loss/crossentropy": 2.5564181804656982, "loss/hidden": 0.80078125, "loss/logits": 0.19961750507354736, "loss/reg": 0.020494818687438965, "step": 782 }, { "epoch": 0.097875, "grad_norm": 3.647054672241211, "grad_norm_var": 2.8678156226553613, "learning_rate": 0.0001, "loss": 1.0039, "loss/crossentropy": 2.5830650329589844, "loss/hidden": 0.66796875, "loss/logits": 0.13103139400482178, "loss/reg": 0.020485466346144676, "step": 783 }, { "epoch": 0.098, "grad_norm": 4.480771541595459, "grad_norm_var": 2.8817531456144723, "learning_rate": 0.0001, "loss": 1.3811, "loss/crossentropy": 2.4550743103027344, "loss/hidden": 0.9765625, "loss/logits": 0.1997724324464798, "loss/reg": 0.020477164536714554, "step": 784 }, { "epoch": 0.098125, "grad_norm": 3.1495656967163086, "grad_norm_var": 2.8497140664534366, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.846536636352539, "loss/hidden": 0.76171875, "loss/logits": 0.1722334325313568, "loss/reg": 0.020469149574637413, "step": 785 }, { "epoch": 0.09825, "grad_norm": 3.611919641494751, "grad_norm_var": 1.4027508562338329, "learning_rate": 0.0001, "loss": 1.0783, "loss/crossentropy": 2.7328994274139404, "loss/hidden": 0.73046875, "loss/logits": 0.14321856200695038, "loss/reg": 0.020459884777665138, "step": 786 }, { "epoch": 0.098375, "grad_norm": 4.535567283630371, "grad_norm_var": 1.3775118805215696, "learning_rate": 0.0001, "loss": 1.3035, "loss/crossentropy": 2.616641044616699, "loss/hidden": 0.8515625, "loss/logits": 0.24746158719062805, "loss/reg": 0.020450593903660774, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.7010657787323, "grad_norm_var": 1.4347220572574786, "learning_rate": 0.0001, "loss": 0.9439, "loss/crossentropy": 2.7851712703704834, "loss/hidden": 0.62890625, "loss/logits": 0.11055716872215271, "loss/reg": 0.020441319793462753, "step": 788 }, { "epoch": 0.098625, "grad_norm": 3.3690221309661865, "grad_norm_var": 0.6296018822433316, "learning_rate": 0.0001, "loss": 0.9858, "loss/crossentropy": 2.5199151039123535, "loss/hidden": 0.6640625, "loss/logits": 0.11739970743656158, "loss/reg": 0.020432572811841965, "step": 789 }, { "epoch": 0.09875, "grad_norm": 3.5332016944885254, "grad_norm_var": 0.5687648370759459, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.3861191272735596, "loss/hidden": 0.80859375, "loss/logits": 0.19853942096233368, "loss/reg": 0.020423252135515213, "step": 790 }, { "epoch": 0.098875, "grad_norm": 3.4778192043304443, "grad_norm_var": 0.5684000998912779, "learning_rate": 0.0001, "loss": 1.3337, "loss/crossentropy": 2.177149772644043, "loss/hidden": 0.94921875, "loss/logits": 0.18037351965904236, "loss/reg": 0.020413951948285103, "step": 791 }, { "epoch": 0.099, "grad_norm": 3.1103479862213135, "grad_norm_var": 0.5501298461305394, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.381289005279541, "loss/hidden": 0.8125, "loss/logits": 0.16453775763511658, "loss/reg": 0.020404649898409843, "step": 792 }, { "epoch": 0.099125, "grad_norm": 2.7654690742492676, "grad_norm_var": 0.563068172749172, "learning_rate": 0.0001, "loss": 0.9788, "loss/crossentropy": 2.6645448207855225, "loss/hidden": 0.65234375, "loss/logits": 0.12248219549655914, "loss/reg": 0.020395854488015175, "step": 793 }, { "epoch": 0.09925, "grad_norm": 2.9942195415496826, "grad_norm_var": 0.5768165563916947, "learning_rate": 0.0001, "loss": 1.1054, "loss/crossentropy": 2.388904571533203, "loss/hidden": 0.73828125, "loss/logits": 0.16325941681861877, "loss/reg": 0.020387381315231323, "step": 794 }, { "epoch": 0.099375, "grad_norm": 3.1070520877838135, "grad_norm_var": 0.5632370819485725, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.4645302295684814, "loss/hidden": 0.74609375, "loss/logits": 0.13203924894332886, "loss/reg": 0.020378144457936287, "step": 795 }, { "epoch": 0.0995, "grad_norm": 3.5182595252990723, "grad_norm_var": 0.5419026805153273, "learning_rate": 0.0001, "loss": 1.3242, "loss/crossentropy": 2.337388038635254, "loss/hidden": 0.96875, "loss/logits": 0.15180304646492004, "loss/reg": 0.020368557423353195, "step": 796 }, { "epoch": 0.099625, "grad_norm": 2.4217121601104736, "grad_norm_var": 0.5634005753459926, "learning_rate": 0.0001, "loss": 0.976, "loss/crossentropy": 2.520176887512207, "loss/hidden": 0.6640625, "loss/logits": 0.1083153486251831, "loss/reg": 0.020359758287668228, "step": 797 }, { "epoch": 0.09975, "grad_norm": 3.818143129348755, "grad_norm_var": 0.33472096860269646, "learning_rate": 0.0001, "loss": 1.2273, "loss/crossentropy": 2.452592134475708, "loss/hidden": 0.86328125, "loss/logits": 0.1604856252670288, "loss/reg": 0.02035023830831051, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.8298075199127197, "grad_norm_var": 0.3484620943588491, "learning_rate": 0.0001, "loss": 1.094, "loss/crossentropy": 2.6919286251068115, "loss/hidden": 0.73828125, "loss/logits": 0.15227138996124268, "loss/reg": 0.020340625196695328, "step": 799 }, { "epoch": 0.1, "grad_norm": 45.50175094604492, "grad_norm_var": 111.76340644728633, "learning_rate": 0.0001, "loss": 1.6213, "loss/crossentropy": 2.2076241970062256, "loss/hidden": 1.125, "loss/logits": 0.2930048406124115, "loss/reg": 0.020331410691142082, "step": 800 }, { "epoch": 0.100125, "grad_norm": 3.073981523513794, "grad_norm_var": 111.7915103772579, "learning_rate": 0.0001, "loss": 1.1146, "loss/crossentropy": 2.5723347663879395, "loss/hidden": 0.77734375, "loss/logits": 0.13403840363025665, "loss/reg": 0.020322071388363838, "step": 801 }, { "epoch": 0.10025, "grad_norm": 4.337286472320557, "grad_norm_var": 111.60328751499571, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.7784037590026855, "loss/hidden": 0.83984375, "loss/logits": 0.17983263731002808, "loss/reg": 0.020312372595071793, "step": 802 }, { "epoch": 0.100375, "grad_norm": 3.656367778778076, "grad_norm_var": 111.81663718658596, "learning_rate": 0.0001, "loss": 1.1003, "loss/crossentropy": 2.4638426303863525, "loss/hidden": 0.76953125, "loss/logits": 0.12776704132556915, "loss/reg": 0.020302986726164818, "step": 803 }, { "epoch": 0.1005, "grad_norm": 3.38935923576355, "grad_norm_var": 111.55373057700994, "learning_rate": 0.0001, "loss": 0.9749, "loss/crossentropy": 2.4947969913482666, "loss/hidden": 0.6640625, "loss/logits": 0.10794391483068466, "loss/reg": 0.020293867215514183, "step": 804 }, { "epoch": 0.100625, "grad_norm": 3.2775380611419678, "grad_norm_var": 111.58551029522327, "learning_rate": 0.0001, "loss": 1.133, "loss/crossentropy": 2.4812419414520264, "loss/hidden": 0.77734375, "loss/logits": 0.15277597308158875, "loss/reg": 0.020284701138734818, "step": 805 }, { "epoch": 0.10075, "grad_norm": 2.996157646179199, "grad_norm_var": 111.77485823890761, "learning_rate": 0.0001, "loss": 1.0998, "loss/crossentropy": 2.516984224319458, "loss/hidden": 0.75, "loss/logits": 0.14709463715553284, "loss/reg": 0.020275365561246872, "step": 806 }, { "epoch": 0.100875, "grad_norm": 5.575490951538086, "grad_norm_var": 111.37459403701224, "learning_rate": 0.0001, "loss": 1.0629, "loss/crossentropy": 2.6265506744384766, "loss/hidden": 0.73828125, "loss/logits": 0.12198775261640549, "loss/reg": 0.02026602067053318, "step": 807 }, { "epoch": 0.101, "grad_norm": 2.653712749481201, "grad_norm_var": 111.56498102164149, "learning_rate": 0.0001, "loss": 0.957, "loss/crossentropy": 2.755121946334839, "loss/hidden": 0.6328125, "loss/logits": 0.12166983634233475, "loss/reg": 0.02025618776679039, "step": 808 }, { "epoch": 0.101125, "grad_norm": 2.5454790592193604, "grad_norm_var": 111.66272758702647, "learning_rate": 0.0001, "loss": 1.0209, "loss/crossentropy": 2.475360870361328, "loss/hidden": 0.6953125, "loss/logits": 0.1231006383895874, "loss/reg": 0.02024705521762371, "step": 809 }, { "epoch": 0.10125, "grad_norm": 12.002355575561523, "grad_norm_var": 113.1469842386682, "learning_rate": 0.0001, "loss": 1.1289, "loss/crossentropy": 2.485873222351074, "loss/hidden": 0.77734375, "loss/logits": 0.1492297500371933, "loss/reg": 0.020237451419234276, "step": 810 }, { "epoch": 0.101375, "grad_norm": 3.0118675231933594, "grad_norm_var": 113.19117010752396, "learning_rate": 0.0001, "loss": 1.0673, "loss/crossentropy": 2.610520124435425, "loss/hidden": 0.7265625, "loss/logits": 0.13847574591636658, "loss/reg": 0.020227529108524323, "step": 811 }, { "epoch": 0.1015, "grad_norm": 37.33988952636719, "grad_norm_var": 171.06705552642327, "learning_rate": 0.0001, "loss": 4.5438, "loss/crossentropy": 3.938170909881592, "loss/hidden": 2.203125, "loss/logits": 2.1385304927825928, "loss/reg": 0.02021711878478527, "step": 812 }, { "epoch": 0.101625, "grad_norm": 2.306735038757324, "grad_norm_var": 171.16339278078715, "learning_rate": 0.0001, "loss": 0.928, "loss/crossentropy": 2.3164937496185303, "loss/hidden": 0.625, "loss/logits": 0.10087703168392181, "loss/reg": 0.020207591354846954, "step": 813 }, { "epoch": 0.10175, "grad_norm": 3.00903058052063, "grad_norm_var": 171.72501112959992, "learning_rate": 0.0001, "loss": 1.1753, "loss/crossentropy": 2.4386990070343018, "loss/hidden": 0.828125, "loss/logits": 0.1452336609363556, "loss/reg": 0.02019745111465454, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.680140972137451, "grad_norm_var": 171.84144221114087, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.531343460083008, "loss/hidden": 0.796875, "loss/logits": 0.15824541449546814, "loss/reg": 0.020187031477689743, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.582740068435669, "grad_norm_var": 75.71062264039246, "learning_rate": 0.0001, "loss": 1.0065, "loss/crossentropy": 2.5258586406707764, "loss/hidden": 0.69140625, "loss/logits": 0.11330173909664154, "loss/reg": 0.02017681486904621, "step": 816 }, { "epoch": 0.102125, "grad_norm": 3.5809519290924072, "grad_norm_var": 75.53549752812218, "learning_rate": 0.0001, "loss": 1.2756, "loss/crossentropy": 2.080437183380127, "loss/hidden": 0.90625, "loss/logits": 0.16763544082641602, "loss/reg": 0.02016652189195156, "step": 817 }, { "epoch": 0.10225, "grad_norm": 3.093592405319214, "grad_norm_var": 75.89695881356819, "learning_rate": 0.0001, "loss": 1.1484, "loss/crossentropy": 2.630833625793457, "loss/hidden": 0.80078125, "loss/logits": 0.14600682258605957, "loss/reg": 0.020157409831881523, "step": 818 }, { "epoch": 0.102375, "grad_norm": 3.1120946407318115, "grad_norm_var": 76.0751246894024, "learning_rate": 0.0001, "loss": 1.038, "loss/crossentropy": 2.5411901473999023, "loss/hidden": 0.69140625, "loss/logits": 0.14511412382125854, "loss/reg": 0.020148303359746933, "step": 819 }, { "epoch": 0.1025, "grad_norm": 2.9903945922851562, "grad_norm_var": 76.21449508483448, "learning_rate": 0.0001, "loss": 1.0769, "loss/crossentropy": 2.556246042251587, "loss/hidden": 0.73828125, "loss/logits": 0.13721047341823578, "loss/reg": 0.020139139145612717, "step": 820 }, { "epoch": 0.102625, "grad_norm": 3.297760248184204, "grad_norm_var": 76.2077263993312, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.5704898834228516, "loss/hidden": 0.6796875, "loss/logits": 0.14399868249893188, "loss/reg": 0.020129989832639694, "step": 821 }, { "epoch": 0.10275, "grad_norm": 3.810601234436035, "grad_norm_var": 75.94485425030823, "learning_rate": 0.0001, "loss": 1.2142, "loss/crossentropy": 2.4684388637542725, "loss/hidden": 0.83984375, "loss/logits": 0.17318351566791534, "loss/reg": 0.020120643079280853, "step": 822 }, { "epoch": 0.102875, "grad_norm": 2.798835515975952, "grad_norm_var": 76.52818091118122, "learning_rate": 0.0001, "loss": 0.9558, "loss/crossentropy": 2.476940393447876, "loss/hidden": 0.6484375, "loss/logits": 0.10626688599586487, "loss/reg": 0.020111503079533577, "step": 823 }, { "epoch": 0.103, "grad_norm": 4.541812419891357, "grad_norm_var": 75.99013496754353, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.339744806289673, "loss/hidden": 0.7265625, "loss/logits": 0.2034740447998047, "loss/reg": 0.020102351903915405, "step": 824 }, { "epoch": 0.103125, "grad_norm": 2.6360268592834473, "grad_norm_var": 75.95142766348106, "learning_rate": 0.0001, "loss": 1.0849, "loss/crossentropy": 2.3795337677001953, "loss/hidden": 0.74609375, "loss/logits": 0.1378898024559021, "loss/reg": 0.02009383775293827, "step": 825 }, { "epoch": 0.10325, "grad_norm": 3.2206642627716064, "grad_norm_var": 73.50864103963063, "learning_rate": 0.0001, "loss": 1.09, "loss/crossentropy": 2.645775556564331, "loss/hidden": 0.7578125, "loss/logits": 0.13137856125831604, "loss/reg": 0.020084405317902565, "step": 826 }, { "epoch": 0.103375, "grad_norm": 3.2071568965911865, "grad_norm_var": 73.45272548167614, "learning_rate": 0.0001, "loss": 0.9383, "loss/crossentropy": 2.7224762439727783, "loss/hidden": 0.625, "loss/logits": 0.11251779645681381, "loss/reg": 0.020074598491191864, "step": 827 }, { "epoch": 0.1035, "grad_norm": 3.2850613594055176, "grad_norm_var": 0.2863261800221416, "learning_rate": 0.0001, "loss": 1.1194, "loss/crossentropy": 2.522642135620117, "loss/hidden": 0.7421875, "loss/logits": 0.17652815580368042, "loss/reg": 0.02006435953080654, "step": 828 }, { "epoch": 0.103625, "grad_norm": 3.5068256855010986, "grad_norm_var": 0.24387138774257647, "learning_rate": 0.0001, "loss": 0.9457, "loss/crossentropy": 2.6463444232940674, "loss/hidden": 0.6328125, "loss/logits": 0.11229754984378815, "loss/reg": 0.020054515451192856, "step": 829 }, { "epoch": 0.10375, "grad_norm": 4.16710090637207, "grad_norm_var": 0.29672115328222404, "learning_rate": 0.0001, "loss": 1.301, "loss/crossentropy": 2.519270420074463, "loss/hidden": 0.921875, "loss/logits": 0.1786651611328125, "loss/reg": 0.020044928416609764, "step": 830 }, { "epoch": 0.103875, "grad_norm": 3.5839273929595947, "grad_norm_var": 0.27524789373512704, "learning_rate": 0.0001, "loss": 0.9949, "loss/crossentropy": 2.0604701042175293, "loss/hidden": 0.68359375, "loss/logits": 0.1109282597899437, "loss/reg": 0.02003585919737816, "step": 831 }, { "epoch": 0.104, "grad_norm": 2.8121650218963623, "grad_norm_var": 0.25541980739101955, "learning_rate": 0.0001, "loss": 1.1057, "loss/crossentropy": 2.670191764831543, "loss/hidden": 0.75390625, "loss/logits": 0.1515616476535797, "loss/reg": 0.020026110112667084, "step": 832 }, { "epoch": 0.104125, "grad_norm": 3.116018295288086, "grad_norm_var": 0.2547872758708628, "learning_rate": 0.0001, "loss": 1.0301, "loss/crossentropy": 3.016143321990967, "loss/hidden": 0.6796875, "loss/logits": 0.1502160131931305, "loss/reg": 0.020016156136989594, "step": 833 }, { "epoch": 0.10425, "grad_norm": 6.202447414398193, "grad_norm_var": 0.7634439694535559, "learning_rate": 0.0001, "loss": 1.2045, "loss/crossentropy": 2.8383448123931885, "loss/hidden": 0.84765625, "loss/logits": 0.15678462386131287, "loss/reg": 0.020007088780403137, "step": 834 }, { "epoch": 0.104375, "grad_norm": 3.4904069900512695, "grad_norm_var": 0.7519116349075012, "learning_rate": 0.0001, "loss": 1.2658, "loss/crossentropy": 2.597175359725952, "loss/hidden": 0.84375, "loss/logits": 0.22209098935127258, "loss/reg": 0.019998185336589813, "step": 835 }, { "epoch": 0.1045, "grad_norm": 4.6242594718933105, "grad_norm_var": 0.7986550791863064, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.6017813682556152, "loss/hidden": 0.796875, "loss/logits": 0.14435435831546783, "loss/reg": 0.019988389685750008, "step": 836 }, { "epoch": 0.104625, "grad_norm": 3.2676827907562256, "grad_norm_var": 0.800099420481778, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.454334259033203, "loss/hidden": 0.82421875, "loss/logits": 0.16681547462940216, "loss/reg": 0.019979091361165047, "step": 837 }, { "epoch": 0.10475, "grad_norm": 3.4931089878082275, "grad_norm_var": 0.7992595598721048, "learning_rate": 0.0001, "loss": 1.1803, "loss/crossentropy": 2.2890915870666504, "loss/hidden": 0.828125, "loss/logits": 0.15247204899787903, "loss/reg": 0.019969170913100243, "step": 838 }, { "epoch": 0.104875, "grad_norm": 2.7106313705444336, "grad_norm_var": 0.8094277801425143, "learning_rate": 0.0001, "loss": 1.097, "loss/crossentropy": 2.56706166267395, "loss/hidden": 0.76953125, "loss/logits": 0.12783397734165192, "loss/reg": 0.019960079342126846, "step": 839 }, { "epoch": 0.105, "grad_norm": 3.2825114727020264, "grad_norm_var": 0.7531900707246374, "learning_rate": 0.0001, "loss": 1.2726, "loss/crossentropy": 2.324145555496216, "loss/hidden": 0.88671875, "loss/logits": 0.18636029958724976, "loss/reg": 0.019951237365603447, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.752570152282715, "grad_norm_var": 0.7400250579900473, "learning_rate": 0.0001, "loss": 1.0885, "loss/crossentropy": 2.4273452758789062, "loss/hidden": 0.7578125, "loss/logits": 0.1312946379184723, "loss/reg": 0.01994233950972557, "step": 841 }, { "epoch": 0.10525, "grad_norm": 3.451720714569092, "grad_norm_var": 0.733364881032247, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.5271100997924805, "loss/hidden": 0.76171875, "loss/logits": 0.1431449055671692, "loss/reg": 0.019932815805077553, "step": 842 }, { "epoch": 0.105375, "grad_norm": 3.0586564540863037, "grad_norm_var": 0.741721542830341, "learning_rate": 0.0001, "loss": 1.0412, "loss/crossentropy": 2.9447247982025146, "loss/hidden": 0.7109375, "loss/logits": 0.13100013136863708, "loss/reg": 0.019923273473978043, "step": 843 }, { "epoch": 0.1055, "grad_norm": 3.4214084148406982, "grad_norm_var": 0.7380611813534226, "learning_rate": 0.0001, "loss": 1.2281, "loss/crossentropy": 2.3005340099334717, "loss/hidden": 0.86328125, "loss/logits": 0.1657242327928543, "loss/reg": 0.01991339959204197, "step": 844 }, { "epoch": 0.105625, "grad_norm": 2.6607367992401123, "grad_norm_var": 0.7886706735221035, "learning_rate": 0.0001, "loss": 1.1345, "loss/crossentropy": 2.5435853004455566, "loss/hidden": 0.80078125, "loss/logits": 0.1347101330757141, "loss/reg": 0.019903138279914856, "step": 845 }, { "epoch": 0.10575, "grad_norm": 3.0965163707733154, "grad_norm_var": 0.7659307635756494, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.6083555221557617, "loss/hidden": 0.74609375, "loss/logits": 0.1505199819803238, "loss/reg": 0.019893797114491463, "step": 846 }, { "epoch": 0.105875, "grad_norm": 4.309004306793213, "grad_norm_var": 0.8127957898222188, "learning_rate": 0.0001, "loss": 1.078, "loss/crossentropy": 2.515653371810913, "loss/hidden": 0.76171875, "loss/logits": 0.11742238700389862, "loss/reg": 0.019883660599589348, "step": 847 }, { "epoch": 0.106, "grad_norm": 3.248624563217163, "grad_norm_var": 0.7855834171862691, "learning_rate": 0.0001, "loss": 1.0838, "loss/crossentropy": 2.593562126159668, "loss/hidden": 0.75390625, "loss/logits": 0.13119123876094818, "loss/reg": 0.019874349236488342, "step": 848 }, { "epoch": 0.106125, "grad_norm": 3.347926378250122, "grad_norm_var": 0.7767115778534122, "learning_rate": 0.0001, "loss": 1.0816, "loss/crossentropy": 2.3947036266326904, "loss/hidden": 0.75, "loss/logits": 0.13295108079910278, "loss/reg": 0.019864298403263092, "step": 849 }, { "epoch": 0.10625, "grad_norm": 4.369658946990967, "grad_norm_var": 0.33264170947598637, "learning_rate": 0.0001, "loss": 1.0708, "loss/crossentropy": 2.449446201324463, "loss/hidden": 0.7421875, "loss/logits": 0.13007891178131104, "loss/reg": 0.019854165613651276, "step": 850 }, { "epoch": 0.106375, "grad_norm": 3.361130475997925, "grad_norm_var": 0.332327660409797, "learning_rate": 0.0001, "loss": 0.9761, "loss/crossentropy": 2.566408157348633, "loss/hidden": 0.66796875, "loss/logits": 0.10966208577156067, "loss/reg": 0.019843947142362595, "step": 851 }, { "epoch": 0.1065, "grad_norm": 2.6885204315185547, "grad_norm_var": 0.25144665871681204, "learning_rate": 0.0001, "loss": 0.9331, "loss/crossentropy": 2.3620035648345947, "loss/hidden": 0.625, "loss/logits": 0.10973039269447327, "loss/reg": 0.019834715873003006, "step": 852 }, { "epoch": 0.106625, "grad_norm": 2.615734338760376, "grad_norm_var": 0.2793016853206145, "learning_rate": 0.0001, "loss": 1.0168, "loss/crossentropy": 2.605545997619629, "loss/hidden": 0.69140625, "loss/logits": 0.12715153396129608, "loss/reg": 0.01982559822499752, "step": 853 }, { "epoch": 0.10675, "grad_norm": 3.073012113571167, "grad_norm_var": 0.276254032788457, "learning_rate": 0.0001, "loss": 0.9855, "loss/crossentropy": 2.634042739868164, "loss/hidden": 0.671875, "loss/logits": 0.11542729288339615, "loss/reg": 0.019816165789961815, "step": 854 }, { "epoch": 0.106875, "grad_norm": 2.759152412414551, "grad_norm_var": 0.27313479552050995, "learning_rate": 0.0001, "loss": 0.9145, "loss/crossentropy": 2.4920616149902344, "loss/hidden": 0.61328125, "loss/logits": 0.10316716134548187, "loss/reg": 0.019806833937764168, "step": 855 }, { "epoch": 0.107, "grad_norm": 3.647084951400757, "grad_norm_var": 0.28455080731760823, "learning_rate": 0.0001, "loss": 1.1336, "loss/crossentropy": 2.34993314743042, "loss/hidden": 0.78515625, "loss/logits": 0.150485098361969, "loss/reg": 0.019797123968601227, "step": 856 }, { "epoch": 0.107125, "grad_norm": 5.309474468231201, "grad_norm_var": 0.5265287098230971, "learning_rate": 0.0001, "loss": 1.3513, "loss/crossentropy": 2.4511518478393555, "loss/hidden": 0.94140625, "loss/logits": 0.21205759048461914, "loss/reg": 0.01978708617389202, "step": 857 }, { "epoch": 0.10725, "grad_norm": 3.1982548236846924, "grad_norm_var": 0.5288348795583182, "learning_rate": 0.0001, "loss": 1.1847, "loss/crossentropy": 2.2767844200134277, "loss/hidden": 0.8359375, "loss/logits": 0.15097308158874512, "loss/reg": 0.01977648213505745, "step": 858 }, { "epoch": 0.107375, "grad_norm": 3.7261335849761963, "grad_norm_var": 0.5276094221235986, "learning_rate": 0.0001, "loss": 1.1138, "loss/crossentropy": 2.660844087600708, "loss/hidden": 0.78515625, "loss/logits": 0.13101539015769958, "loss/reg": 0.01976662687957287, "step": 859 }, { "epoch": 0.1075, "grad_norm": 3.3435637950897217, "grad_norm_var": 0.5280464375318101, "learning_rate": 0.0001, "loss": 1.2877, "loss/crossentropy": 2.4701263904571533, "loss/hidden": 0.90625, "loss/logits": 0.18388135731220245, "loss/reg": 0.019757471978664398, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.630688190460205, "grad_norm_var": 0.5311534898567278, "learning_rate": 0.0001, "loss": 1.063, "loss/crossentropy": 2.5359132289886475, "loss/hidden": 0.734375, "loss/logits": 0.1311398446559906, "loss/reg": 0.019748201593756676, "step": 861 }, { "epoch": 0.10775, "grad_norm": 2.5543456077575684, "grad_norm_var": 0.5729300014134933, "learning_rate": 0.0001, "loss": 0.987, "loss/crossentropy": 2.3763418197631836, "loss/hidden": 0.671875, "loss/logits": 0.11773102730512619, "loss/reg": 0.01973855495452881, "step": 862 }, { "epoch": 0.107875, "grad_norm": 2.8768351078033447, "grad_norm_var": 0.5249464789316676, "learning_rate": 0.0001, "loss": 1.11, "loss/crossentropy": 2.467682361602783, "loss/hidden": 0.7734375, "loss/logits": 0.13924749195575714, "loss/reg": 0.019729435443878174, "step": 863 }, { "epoch": 0.108, "grad_norm": 9.608988761901855, "grad_norm_var": 3.012409881249372, "learning_rate": 0.0001, "loss": 1.8371, "loss/crossentropy": 2.4410533905029297, "loss/hidden": 1.359375, "loss/logits": 0.280517578125, "loss/reg": 0.019719891250133514, "step": 864 }, { "epoch": 0.108125, "grad_norm": 2.6161296367645264, "grad_norm_var": 3.079687357926654, "learning_rate": 0.0001, "loss": 0.9254, "loss/crossentropy": 2.656090497970581, "loss/hidden": 0.6171875, "loss/logits": 0.11112320423126221, "loss/reg": 0.01971041038632393, "step": 865 }, { "epoch": 0.10825, "grad_norm": 6.282230377197266, "grad_norm_var": 3.4921671952336317, "learning_rate": 0.0001, "loss": 1.1269, "loss/crossentropy": 2.4799087047576904, "loss/hidden": 0.80078125, "loss/logits": 0.12914146482944489, "loss/reg": 0.01970127783715725, "step": 866 }, { "epoch": 0.108375, "grad_norm": 3.052783727645874, "grad_norm_var": 3.514845564297898, "learning_rate": 0.0001, "loss": 1.1383, "loss/crossentropy": 2.4700145721435547, "loss/hidden": 0.78515625, "loss/logits": 0.15620394051074982, "loss/reg": 0.019691679626703262, "step": 867 }, { "epoch": 0.1085, "grad_norm": 3.2105300426483154, "grad_norm_var": 3.458070348929603, "learning_rate": 0.0001, "loss": 1.067, "loss/crossentropy": 2.135777473449707, "loss/hidden": 0.7578125, "loss/logits": 0.11233663558959961, "loss/reg": 0.019682079553604126, "step": 868 }, { "epoch": 0.108625, "grad_norm": 2.7341737747192383, "grad_norm_var": 3.4405364793380135, "learning_rate": 0.0001, "loss": 1.0495, "loss/crossentropy": 2.4592809677124023, "loss/hidden": 0.73046875, "loss/logits": 0.1223248764872551, "loss/reg": 0.019672293215990067, "step": 869 }, { "epoch": 0.10875, "grad_norm": 4.007035732269287, "grad_norm_var": 3.4058996890488658, "learning_rate": 0.0001, "loss": 1.1341, "loss/crossentropy": 2.5008175373077393, "loss/hidden": 0.80078125, "loss/logits": 0.13670845329761505, "loss/reg": 0.01966211199760437, "step": 870 }, { "epoch": 0.108875, "grad_norm": 3.146348476409912, "grad_norm_var": 3.359090924722083, "learning_rate": 0.0001, "loss": 1.2459, "loss/crossentropy": 2.0440711975097656, "loss/hidden": 0.8984375, "loss/logits": 0.15090960264205933, "loss/reg": 0.019652366638183594, "step": 871 }, { "epoch": 0.109, "grad_norm": 4.632335662841797, "grad_norm_var": 3.3902752468766453, "learning_rate": 0.0001, "loss": 1.1446, "loss/crossentropy": 2.4574947357177734, "loss/hidden": 0.80859375, "loss/logits": 0.13958214223384857, "loss/reg": 0.01964336633682251, "step": 872 }, { "epoch": 0.109125, "grad_norm": 5.440892219543457, "grad_norm_var": 3.415471723579617, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.565347671508789, "loss/hidden": 0.8046875, "loss/logits": 0.10876456648111343, "loss/reg": 0.01963435485959053, "step": 873 }, { "epoch": 0.10925, "grad_norm": 2.873284101486206, "grad_norm_var": 3.454269091905695, "learning_rate": 0.0001, "loss": 1.0607, "loss/crossentropy": 2.561459541320801, "loss/hidden": 0.72265625, "loss/logits": 0.14177533984184265, "loss/reg": 0.01962495781481266, "step": 874 }, { "epoch": 0.109375, "grad_norm": 3.0819554328918457, "grad_norm_var": 3.496943197417559, "learning_rate": 0.0001, "loss": 1.2422, "loss/crossentropy": 2.2944936752319336, "loss/hidden": 0.8828125, "loss/logits": 0.16319133341312408, "loss/reg": 0.019616009667515755, "step": 875 }, { "epoch": 0.1095, "grad_norm": 4.361453533172607, "grad_norm_var": 3.488792217244569, "learning_rate": 0.0001, "loss": 1.3828, "loss/crossentropy": 2.4624311923980713, "loss/hidden": 1.0078125, "loss/logits": 0.1789003610610962, "loss/reg": 0.019607286900281906, "step": 876 }, { "epoch": 0.109625, "grad_norm": 3.34078049659729, "grad_norm_var": 3.3959280790074216, "learning_rate": 0.0001, "loss": 0.9589, "loss/crossentropy": 2.4549739360809326, "loss/hidden": 0.64453125, "loss/logits": 0.11839590966701508, "loss/reg": 0.019598115235567093, "step": 877 }, { "epoch": 0.10975, "grad_norm": 3.434715986251831, "grad_norm_var": 3.2759937907981884, "learning_rate": 0.0001, "loss": 1.1192, "loss/crossentropy": 2.571798086166382, "loss/hidden": 0.7578125, "loss/logits": 0.16550706326961517, "loss/reg": 0.01958884485065937, "step": 878 }, { "epoch": 0.109875, "grad_norm": 3.593993663787842, "grad_norm_var": 3.196554005024426, "learning_rate": 0.0001, "loss": 1.1792, "loss/crossentropy": 2.447843551635742, "loss/hidden": 0.83203125, "loss/logits": 0.15137381851673126, "loss/reg": 0.019579457119107246, "step": 879 }, { "epoch": 0.11, "grad_norm": 3.3654563426971436, "grad_norm_var": 1.0373482238282006, "learning_rate": 0.0001, "loss": 1.0326, "loss/crossentropy": 2.6210246086120605, "loss/hidden": 0.71484375, "loss/logits": 0.12209475785493851, "loss/reg": 0.019569827243685722, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.6370863914489746, "grad_norm_var": 1.0343516088559113, "learning_rate": 0.0001, "loss": 1.1442, "loss/crossentropy": 2.2681713104248047, "loss/hidden": 0.81640625, "loss/logits": 0.132216215133667, "loss/reg": 0.019560784101486206, "step": 881 }, { "epoch": 0.11025, "grad_norm": 2.152343273162842, "grad_norm_var": 0.6782700998492743, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.437594175338745, "loss/hidden": 0.70703125, "loss/logits": 0.11108942329883575, "loss/reg": 0.019551947712898254, "step": 882 }, { "epoch": 0.110375, "grad_norm": 3.2755074501037598, "grad_norm_var": 0.6698247850929626, "learning_rate": 0.0001, "loss": 0.9972, "loss/crossentropy": 2.481198310852051, "loss/hidden": 0.6796875, "loss/logits": 0.12207823246717453, "loss/reg": 0.01954270713031292, "step": 883 }, { "epoch": 0.1105, "grad_norm": 3.770535707473755, "grad_norm_var": 0.6711344077538037, "learning_rate": 0.0001, "loss": 1.0475, "loss/crossentropy": 2.434851884841919, "loss/hidden": 0.71875, "loss/logits": 0.13345816731452942, "loss/reg": 0.019533507525920868, "step": 884 }, { "epoch": 0.110625, "grad_norm": 2.7666234970092773, "grad_norm_var": 0.6679279033368438, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.433852434158325, "loss/hidden": 0.78125, "loss/logits": 0.1401294767856598, "loss/reg": 0.01952442154288292, "step": 885 }, { "epoch": 0.11075, "grad_norm": 2.7105534076690674, "grad_norm_var": 0.6840409496040507, "learning_rate": 0.0001, "loss": 1.0924, "loss/crossentropy": 2.219019889831543, "loss/hidden": 0.765625, "loss/logits": 0.13160449266433716, "loss/reg": 0.019515201449394226, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.8931920528411865, "grad_norm_var": 0.6969961519386968, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.511371612548828, "loss/hidden": 0.78125, "loss/logits": 0.13708999752998352, "loss/reg": 0.019505700096488, "step": 887 }, { "epoch": 0.111, "grad_norm": 2.821718215942383, "grad_norm_var": 0.6033415037749854, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.389580011367798, "loss/hidden": 0.8828125, "loss/logits": 0.16678079962730408, "loss/reg": 0.019496839493513107, "step": 888 }, { "epoch": 0.111125, "grad_norm": 2.833461284637451, "grad_norm_var": 0.2778808504856213, "learning_rate": 0.0001, "loss": 1.029, "loss/crossentropy": 2.1618683338165283, "loss/hidden": 0.70703125, "loss/logits": 0.12710769474506378, "loss/reg": 0.019487854093313217, "step": 889 }, { "epoch": 0.11125, "grad_norm": 3.3806753158569336, "grad_norm_var": 0.2773113837378702, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.656121253967285, "loss/hidden": 0.79296875, "loss/logits": 0.1471368372440338, "loss/reg": 0.019478676840662956, "step": 890 }, { "epoch": 0.111375, "grad_norm": 2.932758092880249, "grad_norm_var": 0.2800811641911004, "learning_rate": 0.0001, "loss": 1.1, "loss/crossentropy": 2.5325851440429688, "loss/hidden": 0.76171875, "loss/logits": 0.14360320568084717, "loss/reg": 0.019469575956463814, "step": 891 }, { "epoch": 0.1115, "grad_norm": 2.3603265285491943, "grad_norm_var": 0.20497304301798067, "learning_rate": 0.0001, "loss": 1.0739, "loss/crossentropy": 2.3697421550750732, "loss/hidden": 0.75, "loss/logits": 0.1292482614517212, "loss/reg": 0.01946048066020012, "step": 892 }, { "epoch": 0.111625, "grad_norm": 4.056443214416504, "grad_norm_var": 0.2678930990243863, "learning_rate": 0.0001, "loss": 1.0008, "loss/crossentropy": 2.7553114891052246, "loss/hidden": 0.68359375, "loss/logits": 0.12268239259719849, "loss/reg": 0.019451187923550606, "step": 893 }, { "epoch": 0.11175, "grad_norm": 3.346064329147339, "grad_norm_var": 0.2639738255705176, "learning_rate": 0.0001, "loss": 1.1791, "loss/crossentropy": 2.4348855018615723, "loss/hidden": 0.7890625, "loss/logits": 0.19565626978874207, "loss/reg": 0.019442636519670486, "step": 894 }, { "epoch": 0.111875, "grad_norm": 6.878971576690674, "grad_norm_var": 1.1740357353356365, "learning_rate": 0.0001, "loss": 1.7085, "loss/crossentropy": 2.9299001693725586, "loss/hidden": 1.1640625, "loss/logits": 0.3501082956790924, "loss/reg": 0.019433531910181046, "step": 895 }, { "epoch": 0.112, "grad_norm": 3.1834142208099365, "grad_norm_var": 1.1735802221223497, "learning_rate": 0.0001, "loss": 1.2953, "loss/crossentropy": 2.4165709018707275, "loss/hidden": 0.92578125, "loss/logits": 0.17523059248924255, "loss/reg": 0.019424354657530785, "step": 896 }, { "epoch": 0.112125, "grad_norm": 2.5551555156707764, "grad_norm_var": 1.180695081530242, "learning_rate": 0.0001, "loss": 1.0171, "loss/crossentropy": 2.4345083236694336, "loss/hidden": 0.6953125, "loss/logits": 0.1276446282863617, "loss/reg": 0.019415004178881645, "step": 897 }, { "epoch": 0.11225, "grad_norm": 3.5786855220794678, "grad_norm_var": 1.1000748366509354, "learning_rate": 0.0001, "loss": 0.9925, "loss/crossentropy": 2.464219808578491, "loss/hidden": 0.69140625, "loss/logits": 0.10706112533807755, "loss/reg": 0.01940576173365116, "step": 898 }, { "epoch": 0.112375, "grad_norm": 8.784457206726074, "grad_norm_var": 2.9538895197120096, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.3273468017578125, "loss/hidden": 1.2265625, "loss/logits": 0.25934016704559326, "loss/reg": 0.019396713003516197, "step": 899 }, { "epoch": 0.1125, "grad_norm": 4.44765567779541, "grad_norm_var": 2.990871190956643, "learning_rate": 0.0001, "loss": 1.0685, "loss/crossentropy": 2.5760438442230225, "loss/hidden": 0.73046875, "loss/logits": 0.14414295554161072, "loss/reg": 0.019387517124414444, "step": 900 }, { "epoch": 0.112625, "grad_norm": 4.9651265144348145, "grad_norm_var": 3.013306784613239, "learning_rate": 0.0001, "loss": 1.1917, "loss/crossentropy": 2.5148589611053467, "loss/hidden": 0.828125, "loss/logits": 0.1697455495595932, "loss/reg": 0.01937839388847351, "step": 901 }, { "epoch": 0.11275, "grad_norm": 3.8322513103485107, "grad_norm_var": 2.9203267227302745, "learning_rate": 0.0001, "loss": 1.2309, "loss/crossentropy": 2.536973476409912, "loss/hidden": 0.8828125, "loss/logits": 0.15436521172523499, "loss/reg": 0.019369108602404594, "step": 902 }, { "epoch": 0.112875, "grad_norm": 3.5994656085968018, "grad_norm_var": 2.8540415836766555, "learning_rate": 0.0001, "loss": 1.0761, "loss/crossentropy": 2.3160693645477295, "loss/hidden": 0.76171875, "loss/logits": 0.12077254056930542, "loss/reg": 0.019359666854143143, "step": 903 }, { "epoch": 0.113, "grad_norm": 4.031139373779297, "grad_norm_var": 2.7599236229360753, "learning_rate": 0.0001, "loss": 1.0988, "loss/crossentropy": 2.5636203289031982, "loss/hidden": 0.78515625, "loss/logits": 0.12018904089927673, "loss/reg": 0.019350115209817886, "step": 904 }, { "epoch": 0.113125, "grad_norm": 3.2178378105163574, "grad_norm_var": 2.7069185907568794, "learning_rate": 0.0001, "loss": 1.0472, "loss/crossentropy": 2.3457703590393066, "loss/hidden": 0.72265625, "loss/logits": 0.13114379346370697, "loss/reg": 0.019340479746460915, "step": 905 }, { "epoch": 0.11325, "grad_norm": 3.096679925918579, "grad_norm_var": 2.7381334427643536, "learning_rate": 0.0001, "loss": 0.9662, "loss/crossentropy": 2.3510820865631104, "loss/hidden": 0.671875, "loss/logits": 0.10106582939624786, "loss/reg": 0.019330844283103943, "step": 906 }, { "epoch": 0.113375, "grad_norm": 7.590158462524414, "grad_norm_var": 3.3974738441650887, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.3520147800445557, "loss/hidden": 1.2265625, "loss/logits": 0.2468346357345581, "loss/reg": 0.019321195781230927, "step": 907 }, { "epoch": 0.1135, "grad_norm": 3.3216323852539062, "grad_norm_var": 3.20081618522182, "learning_rate": 0.0001, "loss": 1.2612, "loss/crossentropy": 2.330040216445923, "loss/hidden": 0.90234375, "loss/logits": 0.16571447253227234, "loss/reg": 0.019311606884002686, "step": 908 }, { "epoch": 0.113625, "grad_norm": 3.4920501708984375, "grad_norm_var": 3.246978809627046, "learning_rate": 0.0001, "loss": 1.1161, "loss/crossentropy": 2.598465919494629, "loss/hidden": 0.76953125, "loss/logits": 0.15353354811668396, "loss/reg": 0.019302019849419594, "step": 909 }, { "epoch": 0.11375, "grad_norm": 3.0414321422576904, "grad_norm_var": 3.294370585536838, "learning_rate": 0.0001, "loss": 1.0693, "loss/crossentropy": 2.459712028503418, "loss/hidden": 0.7421875, "loss/logits": 0.13418710231781006, "loss/reg": 0.019292324781417847, "step": 910 }, { "epoch": 0.113875, "grad_norm": 3.131361722946167, "grad_norm_var": 2.908980195007492, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.268738269805908, "loss/hidden": 0.875, "loss/logits": 0.16223490238189697, "loss/reg": 0.019283456727862358, "step": 911 }, { "epoch": 0.114, "grad_norm": 3.160707950592041, "grad_norm_var": 2.9118381902992776, "learning_rate": 0.0001, "loss": 1.3173, "loss/crossentropy": 2.2185730934143066, "loss/hidden": 0.9453125, "loss/logits": 0.17927365005016327, "loss/reg": 0.019274268299341202, "step": 912 }, { "epoch": 0.114125, "grad_norm": 14.604296684265137, "grad_norm_var": 9.479147248477696, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.2313976287841797, "loss/hidden": 1.078125, "loss/logits": 0.15699920058250427, "loss/reg": 0.019266733899712563, "step": 913 }, { "epoch": 0.11425, "grad_norm": 3.744290590286255, "grad_norm_var": 9.452382803070204, "learning_rate": 0.0001, "loss": 1.1776, "loss/crossentropy": 2.689699172973633, "loss/hidden": 0.83203125, "loss/logits": 0.15302729606628418, "loss/reg": 0.019258547574281693, "step": 914 }, { "epoch": 0.114375, "grad_norm": 2.999354839324951, "grad_norm_var": 8.531466626401157, "learning_rate": 0.0001, "loss": 1.3107, "loss/crossentropy": 1.8664510250091553, "loss/hidden": 0.9453125, "loss/logits": 0.1728517711162567, "loss/reg": 0.01925109326839447, "step": 915 }, { "epoch": 0.1145, "grad_norm": 3.194913148880005, "grad_norm_var": 8.64117053500835, "learning_rate": 0.0001, "loss": 1.3016, "loss/crossentropy": 2.25707745552063, "loss/hidden": 0.95703125, "loss/logits": 0.15214993059635162, "loss/reg": 0.019243914633989334, "step": 916 }, { "epoch": 0.114625, "grad_norm": 2.335845708847046, "grad_norm_var": 8.88876728908843, "learning_rate": 0.0001, "loss": 0.9484, "loss/crossentropy": 2.6386678218841553, "loss/hidden": 0.63671875, "loss/logits": 0.11931365728378296, "loss/reg": 0.019234785810112953, "step": 917 }, { "epoch": 0.11475, "grad_norm": 3.6922056674957275, "grad_norm_var": 8.898252742921356, "learning_rate": 0.0001, "loss": 1.0667, "loss/crossentropy": 2.5539944171905518, "loss/hidden": 0.73046875, "loss/logits": 0.14395025372505188, "loss/reg": 0.019226964563131332, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.946389675140381, "grad_norm_var": 8.982934878513694, "learning_rate": 0.0001, "loss": 1.0911, "loss/crossentropy": 2.3651351928710938, "loss/hidden": 0.76171875, "loss/logits": 0.1371677815914154, "loss/reg": 0.019219111651182175, "step": 919 }, { "epoch": 0.115, "grad_norm": 3.050600051879883, "grad_norm_var": 9.068373446668678, "learning_rate": 0.0001, "loss": 1.0419, "loss/crossentropy": 2.3772196769714355, "loss/hidden": 0.734375, "loss/logits": 0.11545050889253616, "loss/reg": 0.019211286678910255, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.954526424407959, "grad_norm_var": 9.105915478669973, "learning_rate": 0.0001, "loss": 1.0807, "loss/crossentropy": 2.2472290992736816, "loss/hidden": 0.75390625, "loss/logits": 0.13482055068016052, "loss/reg": 0.019202249124646187, "step": 921 }, { "epoch": 0.11525, "grad_norm": 6.214420795440674, "grad_norm_var": 9.27670245999236, "learning_rate": 0.0001, "loss": 1.8562, "loss/crossentropy": 2.9924590587615967, "loss/hidden": 1.2734375, "loss/logits": 0.3907894492149353, "loss/reg": 0.019193273037672043, "step": 922 }, { "epoch": 0.115375, "grad_norm": 4.534095287322998, "grad_norm_var": 8.536934613221737, "learning_rate": 0.0001, "loss": 1.4885, "loss/crossentropy": 2.394090414047241, "loss/hidden": 1.078125, "loss/logits": 0.2185368537902832, "loss/reg": 0.019184142351150513, "step": 923 }, { "epoch": 0.1155, "grad_norm": 3.6761627197265625, "grad_norm_var": 8.505579278095963, "learning_rate": 0.0001, "loss": 1.1306, "loss/crossentropy": 2.3566806316375732, "loss/hidden": 0.7890625, "loss/logits": 0.14977289736270905, "loss/reg": 0.019175738096237183, "step": 924 }, { "epoch": 0.115625, "grad_norm": 4.592898368835449, "grad_norm_var": 8.481328607270026, "learning_rate": 0.0001, "loss": 1.2942, "loss/crossentropy": 2.7458250522613525, "loss/hidden": 0.953125, "loss/logits": 0.14940449595451355, "loss/reg": 0.01916695386171341, "step": 925 }, { "epoch": 0.11575, "grad_norm": 2.7483372688293457, "grad_norm_var": 8.533618684340597, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.4450652599334717, "loss/hidden": 0.75390625, "loss/logits": 0.12724441289901733, "loss/reg": 0.019158538430929184, "step": 926 }, { "epoch": 0.115875, "grad_norm": 3.075195074081421, "grad_norm_var": 8.541996814909611, "learning_rate": 0.0001, "loss": 1.1251, "loss/crossentropy": 2.3960814476013184, "loss/hidden": 0.76171875, "loss/logits": 0.1718631386756897, "loss/reg": 0.019150495529174805, "step": 927 }, { "epoch": 0.116, "grad_norm": 3.313359022140503, "grad_norm_var": 8.521887542243068, "learning_rate": 0.0001, "loss": 1.1316, "loss/crossentropy": 2.3464736938476562, "loss/hidden": 0.8125, "loss/logits": 0.1276848018169403, "loss/reg": 0.019141457974910736, "step": 928 }, { "epoch": 0.116125, "grad_norm": 4.252639293670654, "grad_norm_var": 0.9000980544993648, "learning_rate": 0.0001, "loss": 1.1818, "loss/crossentropy": 2.523444414138794, "loss/hidden": 0.83203125, "loss/logits": 0.15845248103141785, "loss/reg": 0.01913331262767315, "step": 929 }, { "epoch": 0.11625, "grad_norm": 3.097456455230713, "grad_norm_var": 0.9123223599265882, "learning_rate": 0.0001, "loss": 1.1264, "loss/crossentropy": 2.595120668411255, "loss/hidden": 0.7734375, "loss/logits": 0.16165900230407715, "loss/reg": 0.019125619903206825, "step": 930 }, { "epoch": 0.116375, "grad_norm": 3.292982816696167, "grad_norm_var": 0.8964505136113113, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.4093523025512695, "loss/hidden": 0.77734375, "loss/logits": 0.14468303322792053, "loss/reg": 0.019116582348942757, "step": 931 }, { "epoch": 0.1165, "grad_norm": 2.559980869293213, "grad_norm_var": 0.952617731514821, "learning_rate": 0.0001, "loss": 0.9697, "loss/crossentropy": 2.5491104125976562, "loss/hidden": 0.6484375, "loss/logits": 0.13018551468849182, "loss/reg": 0.019108334556221962, "step": 932 }, { "epoch": 0.116625, "grad_norm": 3.058579683303833, "grad_norm_var": 0.871050822267735, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.303837537765503, "loss/hidden": 0.9296875, "loss/logits": 0.17123734951019287, "loss/reg": 0.019099365919828415, "step": 933 }, { "epoch": 0.11675, "grad_norm": 4.866446495056152, "grad_norm_var": 0.9769503909617764, "learning_rate": 0.0001, "loss": 1.1876, "loss/crossentropy": 2.364741325378418, "loss/hidden": 0.8359375, "loss/logits": 0.1607905626296997, "loss/reg": 0.019090238958597183, "step": 934 }, { "epoch": 0.116875, "grad_norm": 5.912527561187744, "grad_norm_var": 1.252657817578581, "learning_rate": 0.0001, "loss": 1.4831, "loss/crossentropy": 2.641162633895874, "loss/hidden": 1.09375, "loss/logits": 0.19855040311813354, "loss/reg": 0.019081177189946175, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.8301663398742676, "grad_norm_var": 1.2784556528629765, "learning_rate": 0.0001, "loss": 1.0878, "loss/crossentropy": 2.277157783508301, "loss/hidden": 0.765625, "loss/logits": 0.13142800331115723, "loss/reg": 0.019072722643613815, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.9044833183288574, "grad_norm_var": 1.2843284928455583, "learning_rate": 0.0001, "loss": 1.1381, "loss/crossentropy": 2.3311665058135986, "loss/hidden": 0.796875, "loss/logits": 0.1506001055240631, "loss/reg": 0.01906409114599228, "step": 937 }, { "epoch": 0.11725, "grad_norm": 3.78202223777771, "grad_norm_var": 0.8736988295376342, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.4287989139556885, "loss/hidden": 0.8984375, "loss/logits": 0.197871595621109, "loss/reg": 0.019055521115660667, "step": 938 }, { "epoch": 0.117375, "grad_norm": 3.1086387634277344, "grad_norm_var": 0.8338185014655397, "learning_rate": 0.0001, "loss": 1.0576, "loss/crossentropy": 2.335935354232788, "loss/hidden": 0.73828125, "loss/logits": 0.1288556456565857, "loss/reg": 0.019047552719712257, "step": 939 }, { "epoch": 0.1175, "grad_norm": 4.869723320007324, "grad_norm_var": 0.9402287231159085, "learning_rate": 0.0001, "loss": 1.0378, "loss/crossentropy": 2.9672138690948486, "loss/hidden": 0.7109375, "loss/logits": 0.13644808530807495, "loss/reg": 0.01903851516544819, "step": 940 }, { "epoch": 0.117625, "grad_norm": 2.8750882148742676, "grad_norm_var": 0.9067692046415797, "learning_rate": 0.0001, "loss": 1.0037, "loss/crossentropy": 2.3138959407806396, "loss/hidden": 0.69140625, "loss/logits": 0.12202918529510498, "loss/reg": 0.019029438495635986, "step": 941 }, { "epoch": 0.11775, "grad_norm": 3.044121742248535, "grad_norm_var": 0.8812433820018912, "learning_rate": 0.0001, "loss": 1.0173, "loss/crossentropy": 2.4543912410736084, "loss/hidden": 0.70703125, "loss/logits": 0.12005805224180222, "loss/reg": 0.01902030035853386, "step": 942 }, { "epoch": 0.117875, "grad_norm": 2.945160150527954, "grad_norm_var": 0.8905794039935603, "learning_rate": 0.0001, "loss": 1.1688, "loss/crossentropy": 2.3482954502105713, "loss/hidden": 0.828125, "loss/logits": 0.15057498216629028, "loss/reg": 0.019011201336979866, "step": 943 }, { "epoch": 0.118, "grad_norm": 3.0109965801239014, "grad_norm_var": 0.9056152589294107, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.353516101837158, "loss/hidden": 0.984375, "loss/logits": 0.1977054923772812, "loss/reg": 0.0190016757696867, "step": 944 }, { "epoch": 0.118125, "grad_norm": 3.424039363861084, "grad_norm_var": 0.8682128423744849, "learning_rate": 0.0001, "loss": 1.1647, "loss/crossentropy": 2.513850450515747, "loss/hidden": 0.81640625, "loss/logits": 0.15838034451007843, "loss/reg": 0.018992552533745766, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.7656776905059814, "grad_norm_var": 0.8917454992029661, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.650881767272949, "loss/hidden": 0.75, "loss/logits": 0.1420745849609375, "loss/reg": 0.01898341253399849, "step": 946 }, { "epoch": 0.118375, "grad_norm": 4.2130560874938965, "grad_norm_var": 0.9250033835133629, "learning_rate": 0.0001, "loss": 1.2469, "loss/crossentropy": 2.683312177658081, "loss/hidden": 0.90625, "loss/logits": 0.15088841319084167, "loss/reg": 0.018973875790834427, "step": 947 }, { "epoch": 0.1185, "grad_norm": 3.834226608276367, "grad_norm_var": 0.8649633510209883, "learning_rate": 0.0001, "loss": 0.9767, "loss/crossentropy": 2.5706145763397217, "loss/hidden": 0.6640625, "loss/logits": 0.12295837700366974, "loss/reg": 0.018963845446705818, "step": 948 }, { "epoch": 0.118625, "grad_norm": 3.3009235858917236, "grad_norm_var": 0.8514524765901378, "learning_rate": 0.0001, "loss": 1.1857, "loss/crossentropy": 2.6797893047332764, "loss/hidden": 0.82421875, "loss/logits": 0.17191748321056366, "loss/reg": 0.018954817205667496, "step": 949 }, { "epoch": 0.11875, "grad_norm": 6.452317237854004, "grad_norm_var": 1.2752747995844325, "learning_rate": 0.0001, "loss": 1.4528, "loss/crossentropy": 2.7505640983581543, "loss/hidden": 1.0546875, "loss/logits": 0.20869939029216766, "loss/reg": 0.018945740535855293, "step": 950 }, { "epoch": 0.118875, "grad_norm": 3.405714273452759, "grad_norm_var": 0.9300412257064863, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.802401065826416, "loss/hidden": 0.7578125, "loss/logits": 0.14506830275058746, "loss/reg": 0.018936749547719955, "step": 951 }, { "epoch": 0.119, "grad_norm": 9.662891387939453, "grad_norm_var": 3.194050081601077, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.700052499771118, "loss/hidden": 1.1171875, "loss/logits": 0.19381779432296753, "loss/reg": 0.01892753876745701, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.8670711517333984, "grad_norm_var": 3.199477320796336, "learning_rate": 0.0001, "loss": 1.0153, "loss/crossentropy": 2.490739107131958, "loss/hidden": 0.69921875, "loss/logits": 0.12687504291534424, "loss/reg": 0.01891852729022503, "step": 953 }, { "epoch": 0.11925, "grad_norm": 4.066836357116699, "grad_norm_var": 3.1973098694543274, "learning_rate": 0.0001, "loss": 1.2704, "loss/crossentropy": 2.4122891426086426, "loss/hidden": 0.9140625, "loss/logits": 0.1672634333372116, "loss/reg": 0.018909232690930367, "step": 954 }, { "epoch": 0.119375, "grad_norm": 2.8044497966766357, "grad_norm_var": 3.2388562001879793, "learning_rate": 0.0001, "loss": 1.0385, "loss/crossentropy": 2.473945379257202, "loss/hidden": 0.71484375, "loss/logits": 0.13464638590812683, "loss/reg": 0.018900100141763687, "step": 955 }, { "epoch": 0.1195, "grad_norm": 3.7214255332946777, "grad_norm_var": 3.1837278954586044, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.6148641109466553, "loss/hidden": 0.75390625, "loss/logits": 0.1403505802154541, "loss/reg": 0.01889113523066044, "step": 956 }, { "epoch": 0.119625, "grad_norm": 3.0579674243927, "grad_norm_var": 3.160836005262268, "learning_rate": 0.0001, "loss": 1.1761, "loss/crossentropy": 2.5016398429870605, "loss/hidden": 0.83984375, "loss/logits": 0.1474056839942932, "loss/reg": 0.0188821442425251, "step": 957 }, { "epoch": 0.11975, "grad_norm": 2.429112672805786, "grad_norm_var": 3.255565314691306, "learning_rate": 0.0001, "loss": 0.9853, "loss/crossentropy": 2.3262507915496826, "loss/hidden": 0.66796875, "loss/logits": 0.1286056935787201, "loss/reg": 0.018873048946261406, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.9561123847961426, "grad_norm_var": 3.2542184489207098, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.5669143199920654, "loss/hidden": 0.65234375, "loss/logits": 0.13291960954666138, "loss/reg": 0.018863873556256294, "step": 959 }, { "epoch": 0.12, "grad_norm": 3.1904752254486084, "grad_norm_var": 3.2355963683487268, "learning_rate": 0.0001, "loss": 1.074, "loss/crossentropy": 2.5522408485412598, "loss/hidden": 0.75390625, "loss/logits": 0.1315881609916687, "loss/reg": 0.018855126574635506, "step": 960 }, { "epoch": 0.120125, "grad_norm": 7.435352802276611, "grad_norm_var": 3.9949775747956586, "learning_rate": 0.0001, "loss": 1.6007, "loss/crossentropy": 2.7422168254852295, "loss/hidden": 1.1875, "loss/logits": 0.2246999442577362, "loss/reg": 0.018846556544303894, "step": 961 }, { "epoch": 0.12025, "grad_norm": 4.614249229431152, "grad_norm_var": 3.8709926395951384, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.356748104095459, "loss/hidden": 0.953125, "loss/logits": 0.24928607046604156, "loss/reg": 0.018837420269846916, "step": 962 }, { "epoch": 0.120375, "grad_norm": 4.978577136993408, "grad_norm_var": 3.903770487124885, "learning_rate": 0.0001, "loss": 1.4333, "loss/crossentropy": 2.2456307411193848, "loss/hidden": 1.0703125, "loss/logits": 0.17467612028121948, "loss/reg": 0.01882883533835411, "step": 963 }, { "epoch": 0.1205, "grad_norm": 5.51161003112793, "grad_norm_var": 3.97576236618078, "learning_rate": 0.0001, "loss": 1.2552, "loss/crossentropy": 2.7595887184143066, "loss/hidden": 0.8828125, "loss/logits": 0.18415382504463196, "loss/reg": 0.01882052607834339, "step": 964 }, { "epoch": 0.120625, "grad_norm": 3.2307283878326416, "grad_norm_var": 3.9863892013288393, "learning_rate": 0.0001, "loss": 1.2728, "loss/crossentropy": 2.48502779006958, "loss/hidden": 0.88671875, "loss/logits": 0.19795754551887512, "loss/reg": 0.01881156861782074, "step": 965 }, { "epoch": 0.12075, "grad_norm": 3.441767454147339, "grad_norm_var": 3.7286595116638916, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.508680582046509, "loss/hidden": 0.8359375, "loss/logits": 0.15802894532680511, "loss/reg": 0.018803071230649948, "step": 966 }, { "epoch": 0.120875, "grad_norm": 4.063246726989746, "grad_norm_var": 3.685090208705704, "learning_rate": 0.0001, "loss": 1.3354, "loss/crossentropy": 2.47729229927063, "loss/hidden": 0.984375, "loss/logits": 0.16310644149780273, "loss/reg": 0.018794314935803413, "step": 967 }, { "epoch": 0.121, "grad_norm": 5.8381171226501465, "grad_norm_var": 1.8400005684538645, "learning_rate": 0.0001, "loss": 1.1025, "loss/crossentropy": 2.5666096210479736, "loss/hidden": 0.77734375, "loss/logits": 0.1373094618320465, "loss/reg": 0.018785255029797554, "step": 968 }, { "epoch": 0.121125, "grad_norm": 2.7132527828216553, "grad_norm_var": 1.8649801572693356, "learning_rate": 0.0001, "loss": 1.0912, "loss/crossentropy": 2.4335010051727295, "loss/hidden": 0.76953125, "loss/logits": 0.13391214609146118, "loss/reg": 0.018776265904307365, "step": 969 }, { "epoch": 0.12125, "grad_norm": 2.9154088497161865, "grad_norm_var": 1.938092020210782, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.523137331008911, "loss/hidden": 0.80078125, "loss/logits": 0.1429774910211563, "loss/reg": 0.01876768097281456, "step": 970 }, { "epoch": 0.121375, "grad_norm": 3.3754584789276123, "grad_norm_var": 1.8726730225127388, "learning_rate": 0.0001, "loss": 1.0747, "loss/crossentropy": 2.6033432483673096, "loss/hidden": 0.74609375, "loss/logits": 0.14106187224388123, "loss/reg": 0.018759164959192276, "step": 971 }, { "epoch": 0.1215, "grad_norm": 2.7394940853118896, "grad_norm_var": 1.9650935524715956, "learning_rate": 0.0001, "loss": 1.0871, "loss/crossentropy": 2.2976291179656982, "loss/hidden": 0.76953125, "loss/logits": 0.1300249695777893, "loss/reg": 0.01874978095293045, "step": 972 }, { "epoch": 0.121625, "grad_norm": 2.9742624759674072, "grad_norm_var": 1.9749925269591906, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.7049221992492676, "loss/hidden": 0.79296875, "loss/logits": 0.1416921317577362, "loss/reg": 0.018740687519311905, "step": 973 }, { "epoch": 0.12175, "grad_norm": 2.927666187286377, "grad_norm_var": 1.892721758937742, "learning_rate": 0.0001, "loss": 1.0252, "loss/crossentropy": 2.6492226123809814, "loss/hidden": 0.71875, "loss/logits": 0.11909263581037521, "loss/reg": 0.018731672316789627, "step": 974 }, { "epoch": 0.121875, "grad_norm": 16.453693389892578, "grad_norm_var": 11.523681815422924, "learning_rate": 0.0001, "loss": 1.5788, "loss/crossentropy": 2.375779867172241, "loss/hidden": 1.171875, "loss/logits": 0.2197396457195282, "loss/reg": 0.01872306317090988, "step": 975 }, { "epoch": 0.122, "grad_norm": 2.928443193435669, "grad_norm_var": 11.583339951760102, "learning_rate": 0.0001, "loss": 1.0995, "loss/crossentropy": 2.8136162757873535, "loss/hidden": 0.77734375, "loss/logits": 0.13505280017852783, "loss/reg": 0.018714020028710365, "step": 976 }, { "epoch": 0.122125, "grad_norm": 4.540535926818848, "grad_norm_var": 11.07401646408874, "learning_rate": 0.0001, "loss": 1.1806, "loss/crossentropy": 2.5312135219573975, "loss/hidden": 0.83984375, "loss/logits": 0.15369677543640137, "loss/reg": 0.01870504766702652, "step": 977 }, { "epoch": 0.12225, "grad_norm": 2.4433298110961914, "grad_norm_var": 11.358052675820652, "learning_rate": 0.0001, "loss": 1.0167, "loss/crossentropy": 2.578800678253174, "loss/hidden": 0.6953125, "loss/logits": 0.13442841172218323, "loss/reg": 0.018696293234825134, "step": 978 }, { "epoch": 0.122375, "grad_norm": 3.341182231903076, "grad_norm_var": 11.408522912728658, "learning_rate": 0.0001, "loss": 1.1346, "loss/crossentropy": 2.4482579231262207, "loss/hidden": 0.8046875, "loss/logits": 0.14306378364562988, "loss/reg": 0.018688105046749115, "step": 979 }, { "epoch": 0.1225, "grad_norm": 2.433337450027466, "grad_norm_var": 11.51984045745032, "learning_rate": 0.0001, "loss": 0.9764, "loss/crossentropy": 2.321781873703003, "loss/hidden": 0.65625, "loss/logits": 0.13331879675388336, "loss/reg": 0.018679112195968628, "step": 980 }, { "epoch": 0.122625, "grad_norm": 3.6000678539276123, "grad_norm_var": 11.483219758864427, "learning_rate": 0.0001, "loss": 1.1731, "loss/crossentropy": 2.5233397483825684, "loss/hidden": 0.8203125, "loss/logits": 0.16610421240329742, "loss/reg": 0.018670594319701195, "step": 981 }, { "epoch": 0.12275, "grad_norm": 2.8748631477355957, "grad_norm_var": 11.558394893606714, "learning_rate": 0.0001, "loss": 1.0362, "loss/crossentropy": 2.4155187606811523, "loss/hidden": 0.7265625, "loss/logits": 0.12304510176181793, "loss/reg": 0.018661517649888992, "step": 982 }, { "epoch": 0.122875, "grad_norm": 2.2293035984039307, "grad_norm_var": 11.786185692154314, "learning_rate": 0.0001, "loss": 1.0077, "loss/crossentropy": 2.618363618850708, "loss/hidden": 0.6875, "loss/logits": 0.13366106152534485, "loss/reg": 0.01865258812904358, "step": 983 }, { "epoch": 0.123, "grad_norm": 3.615424633026123, "grad_norm_var": 11.556298836968558, "learning_rate": 0.0001, "loss": 1.0544, "loss/crossentropy": 2.224966526031494, "loss/hidden": 0.734375, "loss/logits": 0.1335442066192627, "loss/reg": 0.01864360086619854, "step": 984 }, { "epoch": 0.123125, "grad_norm": 2.93835186958313, "grad_norm_var": 11.524399601900045, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.348299026489258, "loss/hidden": 0.87890625, "loss/logits": 0.1667439341545105, "loss/reg": 0.018634630367159843, "step": 985 }, { "epoch": 0.12325, "grad_norm": 3.2750635147094727, "grad_norm_var": 11.485476360611186, "learning_rate": 0.0001, "loss": 1.0639, "loss/crossentropy": 2.5712990760803223, "loss/hidden": 0.75, "loss/logits": 0.12762659788131714, "loss/reg": 0.018625380471348763, "step": 986 }, { "epoch": 0.123375, "grad_norm": 3.81864595413208, "grad_norm_var": 11.465683474564775, "learning_rate": 0.0001, "loss": 1.1284, "loss/crossentropy": 2.4274511337280273, "loss/hidden": 0.7890625, "loss/logits": 0.15321871638298035, "loss/reg": 0.01861615665256977, "step": 987 }, { "epoch": 0.1235, "grad_norm": 3.2677698135375977, "grad_norm_var": 11.398153583229371, "learning_rate": 0.0001, "loss": 1.1315, "loss/crossentropy": 2.656604290008545, "loss/hidden": 0.8046875, "loss/logits": 0.14076048135757446, "loss/reg": 0.018606893718242645, "step": 988 }, { "epoch": 0.123625, "grad_norm": 3.561713457107544, "grad_norm_var": 11.341034456038914, "learning_rate": 0.0001, "loss": 1.0152, "loss/crossentropy": 2.6205251216888428, "loss/hidden": 0.70703125, "loss/logits": 0.12218683958053589, "loss/reg": 0.01859763078391552, "step": 989 }, { "epoch": 0.12375, "grad_norm": 2.146240472793579, "grad_norm_var": 11.49254916357394, "learning_rate": 0.0001, "loss": 1.0449, "loss/crossentropy": 2.4804821014404297, "loss/hidden": 0.71484375, "loss/logits": 0.14422178268432617, "loss/reg": 0.018588390201330185, "step": 990 }, { "epoch": 0.123875, "grad_norm": 2.6142220497131348, "grad_norm_var": 0.4215380256098586, "learning_rate": 0.0001, "loss": 1.0371, "loss/crossentropy": 2.3303298950195312, "loss/hidden": 0.73828125, "loss/logits": 0.11297546327114105, "loss/reg": 0.018579507246613503, "step": 991 }, { "epoch": 0.124, "grad_norm": 4.17028284072876, "grad_norm_var": 0.4892223582938262, "learning_rate": 0.0001, "loss": 1.6898, "loss/crossentropy": 2.250098943710327, "loss/hidden": 1.203125, "loss/logits": 0.30099910497665405, "loss/reg": 0.018569782376289368, "step": 992 }, { "epoch": 0.124125, "grad_norm": 2.3814339637756348, "grad_norm_var": 0.3887345955884323, "learning_rate": 0.0001, "loss": 1.0837, "loss/crossentropy": 2.6447057723999023, "loss/hidden": 0.74609375, "loss/logits": 0.15198630094528198, "loss/reg": 0.01855996623635292, "step": 993 }, { "epoch": 0.12425, "grad_norm": 4.670334815979004, "grad_norm_var": 0.5202129226035586, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.3941245079040527, "loss/hidden": 0.85546875, "loss/logits": 0.1498216688632965, "loss/reg": 0.018550006672739983, "step": 994 }, { "epoch": 0.124375, "grad_norm": 2.498332977294922, "grad_norm_var": 0.5469080049785059, "learning_rate": 0.0001, "loss": 0.9226, "loss/crossentropy": 2.489830255508423, "loss/hidden": 0.63671875, "loss/logits": 0.10051175206899643, "loss/reg": 0.01854090392589569, "step": 995 }, { "epoch": 0.1245, "grad_norm": 2.987192392349243, "grad_norm_var": 0.5145625202891589, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.094637632369995, "loss/hidden": 0.8515625, "loss/logits": 0.12690997123718262, "loss/reg": 0.018531804904341698, "step": 996 }, { "epoch": 0.124625, "grad_norm": 2.738851547241211, "grad_norm_var": 0.5110263660772335, "learning_rate": 0.0001, "loss": 1.1959, "loss/crossentropy": 2.1406211853027344, "loss/hidden": 0.84375, "loss/logits": 0.16688010096549988, "loss/reg": 0.018522722646594048, "step": 997 }, { "epoch": 0.12475, "grad_norm": 5.02874231338501, "grad_norm_var": 0.732945509426732, "learning_rate": 0.0001, "loss": 1.089, "loss/crossentropy": 2.442840337753296, "loss/hidden": 0.76171875, "loss/logits": 0.142162024974823, "loss/reg": 0.018513953313231468, "step": 998 }, { "epoch": 0.124875, "grad_norm": 2.947786331176758, "grad_norm_var": 0.6677765621166297, "learning_rate": 0.0001, "loss": 1.3459, "loss/crossentropy": 2.483854055404663, "loss/hidden": 0.96875, "loss/logits": 0.19213837385177612, "loss/reg": 0.018505612388253212, "step": 999 }, { "epoch": 0.125, "grad_norm": 4.166240692138672, "grad_norm_var": 0.7105452516630361, "learning_rate": 0.0001, "loss": 1.2402, "loss/crossentropy": 2.454993963241577, "loss/hidden": 0.87890625, "loss/logits": 0.17627671360969543, "loss/reg": 0.018497284501791, "step": 1000 }, { "epoch": 0.125125, "grad_norm": 2.7403693199157715, "grad_norm_var": 0.723220167440765, "learning_rate": 0.0001, "loss": 1.0711, "loss/crossentropy": 2.3623507022857666, "loss/hidden": 0.75, "loss/logits": 0.1362101435661316, "loss/reg": 0.018489044159650803, "step": 1001 }, { "epoch": 0.12525, "grad_norm": 4.017945766448975, "grad_norm_var": 0.7539223188067105, "learning_rate": 0.0001, "loss": 1.2595, "loss/crossentropy": 2.199939727783203, "loss/hidden": 0.93359375, "loss/logits": 0.14108574390411377, "loss/reg": 0.018480053171515465, "step": 1002 }, { "epoch": 0.125375, "grad_norm": 3.8252949714660645, "grad_norm_var": 0.7543319037149537, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.4151408672332764, "loss/hidden": 0.91796875, "loss/logits": 0.1738898754119873, "loss/reg": 0.018471699208021164, "step": 1003 }, { "epoch": 0.1255, "grad_norm": 3.2934703826904297, "grad_norm_var": 0.754056547294514, "learning_rate": 0.0001, "loss": 1.1773, "loss/crossentropy": 2.4902610778808594, "loss/hidden": 0.83984375, "loss/logits": 0.1528603434562683, "loss/reg": 0.018463551998138428, "step": 1004 }, { "epoch": 0.125625, "grad_norm": 2.8537371158599854, "grad_norm_var": 0.7665102142099857, "learning_rate": 0.0001, "loss": 1.0491, "loss/crossentropy": 2.5475528240203857, "loss/hidden": 0.734375, "loss/logits": 0.1301451176404953, "loss/reg": 0.01845443621277809, "step": 1005 }, { "epoch": 0.12575, "grad_norm": 10.271280288696289, "grad_norm_var": 3.623624147504192, "learning_rate": 0.0001, "loss": 1.9301, "loss/crossentropy": 2.789032459259033, "loss/hidden": 1.359375, "loss/logits": 0.386294960975647, "loss/reg": 0.018445348367094994, "step": 1006 }, { "epoch": 0.125875, "grad_norm": 3.5450735092163086, "grad_norm_var": 3.5274627001684156, "learning_rate": 0.0001, "loss": 1.0646, "loss/crossentropy": 2.7634024620056152, "loss/hidden": 0.73828125, "loss/logits": 0.1419609636068344, "loss/reg": 0.01843627728521824, "step": 1007 }, { "epoch": 0.126, "grad_norm": 3.8805432319641113, "grad_norm_var": 3.521631426981449, "learning_rate": 0.0001, "loss": 1.2255, "loss/crossentropy": 2.313877820968628, "loss/hidden": 0.859375, "loss/logits": 0.18188925087451935, "loss/reg": 0.018427575007081032, "step": 1008 }, { "epoch": 0.126125, "grad_norm": 3.56695556640625, "grad_norm_var": 3.3749006612486068, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 3.036876678466797, "loss/hidden": 0.8125, "loss/logits": 0.1664760708808899, "loss/reg": 0.018418410792946815, "step": 1009 }, { "epoch": 0.12625, "grad_norm": 3.4208462238311768, "grad_norm_var": 3.350722625996883, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.335106372833252, "loss/hidden": 1.0, "loss/logits": 0.16979742050170898, "loss/reg": 0.018409088253974915, "step": 1010 }, { "epoch": 0.126375, "grad_norm": 4.440703392028809, "grad_norm_var": 3.2335077439479125, "learning_rate": 0.0001, "loss": 1.5031, "loss/crossentropy": 1.8046929836273193, "loss/hidden": 1.0703125, "loss/logits": 0.2487386018037796, "loss/reg": 0.018400251865386963, "step": 1011 }, { "epoch": 0.1265, "grad_norm": 3.6042182445526123, "grad_norm_var": 3.175392851042456, "learning_rate": 0.0001, "loss": 1.0341, "loss/crossentropy": 2.732485294342041, "loss/hidden": 0.70703125, "loss/logits": 0.14311768114566803, "loss/reg": 0.018391618505120277, "step": 1012 }, { "epoch": 0.126625, "grad_norm": 3.0008747577667236, "grad_norm_var": 3.1348769442621385, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.4243671894073486, "loss/hidden": 0.8984375, "loss/logits": 0.19954946637153625, "loss/reg": 0.01838279701769352, "step": 1013 }, { "epoch": 0.12675, "grad_norm": 3.548919677734375, "grad_norm_var": 3.076212765414932, "learning_rate": 0.0001, "loss": 1.0597, "loss/crossentropy": 2.646744966506958, "loss/hidden": 0.7578125, "loss/logits": 0.11818855255842209, "loss/reg": 0.018373781815171242, "step": 1014 }, { "epoch": 0.126875, "grad_norm": 3.4126574993133545, "grad_norm_var": 3.027892721971917, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.2376763820648193, "loss/hidden": 0.90234375, "loss/logits": 0.15857847034931183, "loss/reg": 0.018364954739809036, "step": 1015 }, { "epoch": 0.127, "grad_norm": 2.8473548889160156, "grad_norm_var": 3.1028595438739974, "learning_rate": 0.0001, "loss": 1.171, "loss/crossentropy": 2.4969823360443115, "loss/hidden": 0.80859375, "loss/logits": 0.17886783182621002, "loss/reg": 0.01835593394935131, "step": 1016 }, { "epoch": 0.127125, "grad_norm": 3.4492313861846924, "grad_norm_var": 3.0254289441294913, "learning_rate": 0.0001, "loss": 1.1521, "loss/crossentropy": 2.6258046627044678, "loss/hidden": 0.828125, "loss/logits": 0.14048799872398376, "loss/reg": 0.018347129225730896, "step": 1017 }, { "epoch": 0.12725, "grad_norm": 2.7578630447387695, "grad_norm_var": 3.1109318052612775, "learning_rate": 0.0001, "loss": 1.1909, "loss/crossentropy": 2.5233936309814453, "loss/hidden": 0.84375, "loss/logits": 0.16377520561218262, "loss/reg": 0.018338393419981003, "step": 1018 }, { "epoch": 0.127375, "grad_norm": 4.195294380187988, "grad_norm_var": 3.117902257815278, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 1.8772588968276978, "loss/hidden": 0.984375, "loss/logits": 0.1546936333179474, "loss/reg": 0.01832934282720089, "step": 1019 }, { "epoch": 0.1275, "grad_norm": 3.7895658016204834, "grad_norm_var": 3.0944502488485717, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.3982863426208496, "loss/hidden": 0.84375, "loss/logits": 0.17036345601081848, "loss/reg": 0.01832025870680809, "step": 1020 }, { "epoch": 0.127625, "grad_norm": 6.021730422973633, "grad_norm_var": 3.274883958363541, "learning_rate": 0.0001, "loss": 1.5031, "loss/crossentropy": 2.428899049758911, "loss/hidden": 1.109375, "loss/logits": 0.21065470576286316, "loss/reg": 0.01831124909222126, "step": 1021 }, { "epoch": 0.12775, "grad_norm": 4.54056453704834, "grad_norm_var": 0.6193178360798726, "learning_rate": 0.0001, "loss": 1.6077, "loss/crossentropy": 2.3732786178588867, "loss/hidden": 1.1953125, "loss/logits": 0.22934843599796295, "loss/reg": 0.01830223761498928, "step": 1022 }, { "epoch": 0.127875, "grad_norm": 4.612347602844238, "grad_norm_var": 0.6611490686092556, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.393007516860962, "loss/hidden": 0.8046875, "loss/logits": 0.13689909875392914, "loss/reg": 0.0182929839938879, "step": 1023 }, { "epoch": 0.128, "grad_norm": 4.8808794021606445, "grad_norm_var": 0.7320190710671227, "learning_rate": 0.0001, "loss": 1.6498, "loss/crossentropy": 1.8295843601226807, "loss/hidden": 1.265625, "loss/logits": 0.2013380229473114, "loss/reg": 0.01828295737504959, "step": 1024 }, { "epoch": 0.128125, "grad_norm": 6.514127731323242, "grad_norm_var": 1.1516245124796, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.6622331142425537, "loss/hidden": 1.0390625, "loss/logits": 0.20514041185379028, "loss/reg": 0.018273649737238884, "step": 1025 }, { "epoch": 0.12825, "grad_norm": 3.6572396755218506, "grad_norm_var": 1.1348195216000618, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.410374164581299, "loss/hidden": 0.9140625, "loss/logits": 0.17059262096881866, "loss/reg": 0.018263790756464005, "step": 1026 }, { "epoch": 0.128375, "grad_norm": 2.8987033367156982, "grad_norm_var": 1.209186568114529, "learning_rate": 0.0001, "loss": 1.2157, "loss/crossentropy": 2.236950397491455, "loss/hidden": 0.8828125, "loss/logits": 0.15036781132221222, "loss/reg": 0.018253570422530174, "step": 1027 }, { "epoch": 0.1285, "grad_norm": 3.276984453201294, "grad_norm_var": 1.2324156239644137, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.683966636657715, "loss/hidden": 0.90234375, "loss/logits": 0.16460160911083221, "loss/reg": 0.0182446651160717, "step": 1028 }, { "epoch": 0.128625, "grad_norm": 5.419056415557861, "grad_norm_var": 1.2877520831133287, "learning_rate": 0.0001, "loss": 1.4861, "loss/crossentropy": 2.306896209716797, "loss/hidden": 1.1015625, "loss/logits": 0.20220112800598145, "loss/reg": 0.018234653398394585, "step": 1029 }, { "epoch": 0.12875, "grad_norm": 3.4227371215820312, "grad_norm_var": 1.298252758406007, "learning_rate": 0.0001, "loss": 1.1322, "loss/crossentropy": 2.6958277225494385, "loss/hidden": 0.78125, "loss/logits": 0.16871951520442963, "loss/reg": 0.018225453794002533, "step": 1030 }, { "epoch": 0.128875, "grad_norm": 2.5001378059387207, "grad_norm_var": 1.4346570797964828, "learning_rate": 0.0001, "loss": 1.0835, "loss/crossentropy": 2.743760824203491, "loss/hidden": 0.75390625, "loss/logits": 0.14742383360862732, "loss/reg": 0.01821640320122242, "step": 1031 }, { "epoch": 0.129, "grad_norm": 3.422490119934082, "grad_norm_var": 1.3631839436174078, "learning_rate": 0.0001, "loss": 1.213, "loss/crossentropy": 2.6142783164978027, "loss/hidden": 0.8671875, "loss/logits": 0.1637578308582306, "loss/reg": 0.01820731721818447, "step": 1032 }, { "epoch": 0.129125, "grad_norm": 3.200204849243164, "grad_norm_var": 1.3881674273527314, "learning_rate": 0.0001, "loss": 1.28, "loss/crossentropy": 2.4325146675109863, "loss/hidden": 0.91796875, "loss/logits": 0.18004098534584045, "loss/reg": 0.018198398873209953, "step": 1033 }, { "epoch": 0.12925, "grad_norm": 3.3802099227905273, "grad_norm_var": 1.3035463186707592, "learning_rate": 0.0001, "loss": 1.1984, "loss/crossentropy": 2.424464225769043, "loss/hidden": 0.8671875, "loss/logits": 0.14932216703891754, "loss/reg": 0.018189454451203346, "step": 1034 }, { "epoch": 0.129375, "grad_norm": 3.2020263671875, "grad_norm_var": 1.3536821307745555, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.539475440979004, "loss/hidden": 0.77734375, "loss/logits": 0.1258580982685089, "loss/reg": 0.018180513754487038, "step": 1035 }, { "epoch": 0.1295, "grad_norm": 3.5899553298950195, "grad_norm_var": 1.3630023284114352, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.277602434158325, "loss/hidden": 0.78125, "loss/logits": 0.15075945854187012, "loss/reg": 0.018171606585383415, "step": 1036 }, { "epoch": 0.129625, "grad_norm": 3.1815500259399414, "grad_norm_var": 1.1143223174368395, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.5251145362854004, "loss/hidden": 0.91015625, "loss/logits": 0.19092242419719696, "loss/reg": 0.018162967637181282, "step": 1037 }, { "epoch": 0.12975, "grad_norm": 3.6182315349578857, "grad_norm_var": 1.083329466089586, "learning_rate": 0.0001, "loss": 1.5649, "loss/crossentropy": 2.0126724243164062, "loss/hidden": 1.1796875, "loss/logits": 0.20370075106620789, "loss/reg": 0.018154380843043327, "step": 1038 }, { "epoch": 0.129875, "grad_norm": 3.239778518676758, "grad_norm_var": 1.0521445613054454, "learning_rate": 0.0001, "loss": 1.1521, "loss/crossentropy": 2.286173105239868, "loss/hidden": 0.8203125, "loss/logits": 0.1503629982471466, "loss/reg": 0.01814563386142254, "step": 1039 }, { "epoch": 0.13, "grad_norm": 5.980926990509033, "grad_norm_var": 1.2991062966869322, "learning_rate": 0.0001, "loss": 1.4125, "loss/crossentropy": 2.5735340118408203, "loss/hidden": 0.87109375, "loss/logits": 0.36005836725234985, "loss/reg": 0.018136821687221527, "step": 1040 }, { "epoch": 0.130125, "grad_norm": 3.8425281047821045, "grad_norm_var": 0.7718063043351868, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.002030372619629, "loss/hidden": 0.9921875, "loss/logits": 0.16016829013824463, "loss/reg": 0.01812821812927723, "step": 1041 }, { "epoch": 0.13025, "grad_norm": 2.6303024291992188, "grad_norm_var": 0.8318731912874341, "learning_rate": 0.0001, "loss": 1.0674, "loss/crossentropy": 2.5561156272888184, "loss/hidden": 0.75390625, "loss/logits": 0.13234438002109528, "loss/reg": 0.01811978593468666, "step": 1042 }, { "epoch": 0.130375, "grad_norm": 4.081605911254883, "grad_norm_var": 0.816546710723537, "learning_rate": 0.0001, "loss": 1.1508, "loss/crossentropy": 2.5789449214935303, "loss/hidden": 0.83203125, "loss/logits": 0.13762518763542175, "loss/reg": 0.01811092346906662, "step": 1043 }, { "epoch": 0.1305, "grad_norm": 3.746345281600952, "grad_norm_var": 0.808580216385354, "learning_rate": 0.0001, "loss": 1.0868, "loss/crossentropy": 2.2884302139282227, "loss/hidden": 0.7578125, "loss/logits": 0.14801663160324097, "loss/reg": 0.018101971596479416, "step": 1044 }, { "epoch": 0.130625, "grad_norm": 3.1717352867126465, "grad_norm_var": 0.5952362637338294, "learning_rate": 0.0001, "loss": 1.0943, "loss/crossentropy": 2.4526915550231934, "loss/hidden": 0.76953125, "loss/logits": 0.1438664197921753, "loss/reg": 0.018093010410666466, "step": 1045 }, { "epoch": 0.13075, "grad_norm": 5.281366348266602, "grad_norm_var": 0.7887311446436568, "learning_rate": 0.0001, "loss": 1.1712, "loss/crossentropy": 2.400421142578125, "loss/hidden": 0.8359375, "loss/logits": 0.1543923020362854, "loss/reg": 0.01808418706059456, "step": 1046 }, { "epoch": 0.130875, "grad_norm": 5.1911540031433105, "grad_norm_var": 0.8361699826228097, "learning_rate": 0.0001, "loss": 1.2646, "loss/crossentropy": 2.6032376289367676, "loss/hidden": 0.91796875, "loss/logits": 0.16584512591362, "loss/reg": 0.01807507872581482, "step": 1047 }, { "epoch": 0.131, "grad_norm": 2.902432680130005, "grad_norm_var": 0.8790790548012068, "learning_rate": 0.0001, "loss": 1.2484, "loss/crossentropy": 2.4166929721832275, "loss/hidden": 0.90234375, "loss/logits": 0.1653715819120407, "loss/reg": 0.018065867945551872, "step": 1048 }, { "epoch": 0.131125, "grad_norm": 3.408224582672119, "grad_norm_var": 0.8661178167301301, "learning_rate": 0.0001, "loss": 1.1045, "loss/crossentropy": 2.5960233211517334, "loss/hidden": 0.78515625, "loss/logits": 0.13876128196716309, "loss/reg": 0.01805703714489937, "step": 1049 }, { "epoch": 0.13125, "grad_norm": 2.8764824867248535, "grad_norm_var": 0.9086952536899503, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.280080556869507, "loss/hidden": 0.875, "loss/logits": 0.1491965353488922, "loss/reg": 0.018048083409667015, "step": 1050 }, { "epoch": 0.131375, "grad_norm": 4.48457670211792, "grad_norm_var": 0.9183881653108216, "learning_rate": 0.0001, "loss": 1.0941, "loss/crossentropy": 2.5797057151794434, "loss/hidden": 0.7421875, "loss/logits": 0.17152699828147888, "loss/reg": 0.018038896843791008, "step": 1051 }, { "epoch": 0.1315, "grad_norm": 3.204270839691162, "grad_norm_var": 0.9398596856778226, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.1369965076446533, "loss/hidden": 1.0625, "loss/logits": 0.18023288249969482, "loss/reg": 0.018029624596238136, "step": 1052 }, { "epoch": 0.131625, "grad_norm": 3.225297212600708, "grad_norm_var": 0.9363567728280238, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.5327794551849365, "loss/hidden": 0.79296875, "loss/logits": 0.1567188799381256, "loss/reg": 0.018020475283265114, "step": 1053 }, { "epoch": 0.13175, "grad_norm": 2.6159298419952393, "grad_norm_var": 1.0241485205327874, "learning_rate": 0.0001, "loss": 1.0505, "loss/crossentropy": 2.5744223594665527, "loss/hidden": 0.73828125, "loss/logits": 0.1320829838514328, "loss/reg": 0.018011758103966713, "step": 1054 }, { "epoch": 0.131875, "grad_norm": 4.383845806121826, "grad_norm_var": 1.0292396555670902, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.4903180599212646, "loss/hidden": 0.90625, "loss/logits": 0.22538328170776367, "loss/reg": 0.0180036798119545, "step": 1055 }, { "epoch": 0.132, "grad_norm": 2.4342551231384277, "grad_norm_var": 0.7907919306719045, "learning_rate": 0.0001, "loss": 1.0574, "loss/crossentropy": 2.3163890838623047, "loss/hidden": 0.75, "loss/logits": 0.12748895585536957, "loss/reg": 0.01799560710787773, "step": 1056 }, { "epoch": 0.132125, "grad_norm": 3.542881727218628, "grad_norm_var": 0.7864152227556851, "learning_rate": 0.0001, "loss": 1.294, "loss/crossentropy": 2.3338446617126465, "loss/hidden": 0.93359375, "loss/logits": 0.18051280081272125, "loss/reg": 0.017986783757805824, "step": 1057 }, { "epoch": 0.13225, "grad_norm": 4.0569915771484375, "grad_norm_var": 0.7341544247946139, "learning_rate": 0.0001, "loss": 1.3818, "loss/crossentropy": 2.330078363418579, "loss/hidden": 1.015625, "loss/logits": 0.18636882305145264, "loss/reg": 0.017977885901927948, "step": 1058 }, { "epoch": 0.132375, "grad_norm": 3.663426637649536, "grad_norm_var": 0.7217416281732681, "learning_rate": 0.0001, "loss": 1.1508, "loss/crossentropy": 2.5138370990753174, "loss/hidden": 0.796875, "loss/logits": 0.1742265522480011, "loss/reg": 0.017969388514757156, "step": 1059 }, { "epoch": 0.1325, "grad_norm": 2.9652159214019775, "grad_norm_var": 0.7484703245510673, "learning_rate": 0.0001, "loss": 1.1313, "loss/crossentropy": 2.2722136974334717, "loss/hidden": 0.80859375, "loss/logits": 0.1430921107530594, "loss/reg": 0.0179609302431345, "step": 1060 }, { "epoch": 0.132625, "grad_norm": 5.577633857727051, "grad_norm_var": 0.9767082401115801, "learning_rate": 0.0001, "loss": 1.7627, "loss/crossentropy": 2.562012195587158, "loss/hidden": 1.390625, "loss/logits": 0.19251598417758942, "loss/reg": 0.01795242354273796, "step": 1061 }, { "epoch": 0.13275, "grad_norm": 5.364041328430176, "grad_norm_var": 0.9941443511093354, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.507934808731079, "loss/hidden": 1.125, "loss/logits": 0.22217199206352234, "loss/reg": 0.017944158986210823, "step": 1062 }, { "epoch": 0.132875, "grad_norm": 6.644399642944336, "grad_norm_var": 1.4066377839550304, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.4112086296081543, "loss/hidden": 1.109375, "loss/logits": 0.24593724310398102, "loss/reg": 0.017935609444975853, "step": 1063 }, { "epoch": 0.133, "grad_norm": 2.846679210662842, "grad_norm_var": 1.4137598873748003, "learning_rate": 0.0001, "loss": 1.1476, "loss/crossentropy": 2.2182581424713135, "loss/hidden": 0.81640625, "loss/logits": 0.1519462913274765, "loss/reg": 0.01792830042541027, "step": 1064 }, { "epoch": 0.133125, "grad_norm": 6.004873752593994, "grad_norm_var": 1.688838288773405, "learning_rate": 0.0001, "loss": 1.6923, "loss/crossentropy": 2.5660815238952637, "loss/hidden": 1.2578125, "loss/logits": 0.25529351830482483, "loss/reg": 0.01792137697339058, "step": 1065 }, { "epoch": 0.13325, "grad_norm": 2.905362844467163, "grad_norm_var": 1.684590354160513, "learning_rate": 0.0001, "loss": 1.1266, "loss/crossentropy": 2.679647207260132, "loss/hidden": 0.78125, "loss/logits": 0.16615256667137146, "loss/reg": 0.017914744094014168, "step": 1066 }, { "epoch": 0.133375, "grad_norm": 2.774467945098877, "grad_norm_var": 1.7557347328903679, "learning_rate": 0.0001, "loss": 0.9917, "loss/crossentropy": 2.4071364402770996, "loss/hidden": 0.69921875, "loss/logits": 0.11337558180093765, "loss/reg": 0.017907986417412758, "step": 1067 }, { "epoch": 0.1335, "grad_norm": 3.0080766677856445, "grad_norm_var": 1.7760288881986261, "learning_rate": 0.0001, "loss": 1.1718, "loss/crossentropy": 2.4299240112304688, "loss/hidden": 0.83984375, "loss/logits": 0.15295377373695374, "loss/reg": 0.01789918728172779, "step": 1068 }, { "epoch": 0.133625, "grad_norm": 3.646827220916748, "grad_norm_var": 1.750571466335814, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.461158514022827, "loss/hidden": 0.93359375, "loss/logits": 0.17990370094776154, "loss/reg": 0.01789136230945587, "step": 1069 }, { "epoch": 0.13375, "grad_norm": 4.164073944091797, "grad_norm_var": 1.6348612297712255, "learning_rate": 0.0001, "loss": 1.171, "loss/crossentropy": 2.4830503463745117, "loss/hidden": 0.82421875, "loss/logits": 0.16796636581420898, "loss/reg": 0.017882652580738068, "step": 1070 }, { "epoch": 0.133875, "grad_norm": 3.7383811473846436, "grad_norm_var": 1.6277745939444626, "learning_rate": 0.0001, "loss": 1.2749, "loss/crossentropy": 2.3813910484313965, "loss/hidden": 0.94921875, "loss/logits": 0.14690269529819489, "loss/reg": 0.017873771488666534, "step": 1071 }, { "epoch": 0.134, "grad_norm": 4.526171684265137, "grad_norm_var": 1.476108335704568, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.1495041847229004, "loss/hidden": 1.234375, "loss/logits": 0.22847777605056763, "loss/reg": 0.017865996807813644, "step": 1072 }, { "epoch": 0.134125, "grad_norm": 3.382908582687378, "grad_norm_var": 1.4893637052056738, "learning_rate": 0.0001, "loss": 1.0784, "loss/crossentropy": 2.468477725982666, "loss/hidden": 0.765625, "loss/logits": 0.13421444594860077, "loss/reg": 0.01785840094089508, "step": 1073 }, { "epoch": 0.13425, "grad_norm": 5.163125991821289, "grad_norm_var": 1.5625376434966463, "learning_rate": 0.0001, "loss": 1.2499, "loss/crossentropy": 2.2978320121765137, "loss/hidden": 0.91796875, "loss/logits": 0.1534654200077057, "loss/reg": 0.01785091683268547, "step": 1074 }, { "epoch": 0.134375, "grad_norm": 3.1810903549194336, "grad_norm_var": 1.6082726182831408, "learning_rate": 0.0001, "loss": 1.0638, "loss/crossentropy": 2.5080108642578125, "loss/hidden": 0.7578125, "loss/logits": 0.1275557279586792, "loss/reg": 0.017842529341578484, "step": 1075 }, { "epoch": 0.1345, "grad_norm": 2.6275007724761963, "grad_norm_var": 1.6673241917556567, "learning_rate": 0.0001, "loss": 1.023, "loss/crossentropy": 2.725398302078247, "loss/hidden": 0.7265625, "loss/logits": 0.11811243742704391, "loss/reg": 0.01783500798046589, "step": 1076 }, { "epoch": 0.134625, "grad_norm": 2.4731173515319824, "grad_norm_var": 1.656907168261868, "learning_rate": 0.0001, "loss": 1.1398, "loss/crossentropy": 2.2775444984436035, "loss/hidden": 0.8125, "loss/logits": 0.14902615547180176, "loss/reg": 0.01782614178955555, "step": 1077 }, { "epoch": 0.13475, "grad_norm": 2.6200878620147705, "grad_norm_var": 1.593020801522063, "learning_rate": 0.0001, "loss": 1.0744, "loss/crossentropy": 2.6091089248657227, "loss/hidden": 0.74609375, "loss/logits": 0.1501138061285019, "loss/reg": 0.017817262560129166, "step": 1078 }, { "epoch": 0.134875, "grad_norm": 2.637160062789917, "grad_norm_var": 1.040390657280697, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.6109707355499268, "loss/hidden": 0.734375, "loss/logits": 0.1408311128616333, "loss/reg": 0.01780843175947666, "step": 1079 }, { "epoch": 0.135, "grad_norm": 2.4874284267425537, "grad_norm_var": 1.0788527015533946, "learning_rate": 0.0001, "loss": 1.0129, "loss/crossentropy": 2.4900012016296387, "loss/hidden": 0.71484375, "loss/logits": 0.12004341185092926, "loss/reg": 0.017799846827983856, "step": 1080 }, { "epoch": 0.135125, "grad_norm": 2.8923330307006836, "grad_norm_var": 0.6277088581300307, "learning_rate": 0.0001, "loss": 1.0085, "loss/crossentropy": 2.877713441848755, "loss/hidden": 0.69140625, "loss/logits": 0.1391412615776062, "loss/reg": 0.017792070284485817, "step": 1081 }, { "epoch": 0.13525, "grad_norm": 3.0681328773498535, "grad_norm_var": 0.6215757739924422, "learning_rate": 0.0001, "loss": 1.1841, "loss/crossentropy": 2.4870967864990234, "loss/hidden": 0.83984375, "loss/logits": 0.16639447212219238, "loss/reg": 0.017783144488930702, "step": 1082 }, { "epoch": 0.135375, "grad_norm": 2.943787097930908, "grad_norm_var": 0.6120804925495794, "learning_rate": 0.0001, "loss": 0.9977, "loss/crossentropy": 2.4002597332000732, "loss/hidden": 0.69921875, "loss/logits": 0.12076494097709656, "loss/reg": 0.017775090411305428, "step": 1083 }, { "epoch": 0.1355, "grad_norm": 3.1173973083496094, "grad_norm_var": 0.6087907870581594, "learning_rate": 0.0001, "loss": 1.1729, "loss/crossentropy": 2.6328957080841064, "loss/hidden": 0.85546875, "loss/logits": 0.13975805044174194, "loss/reg": 0.017766445875167847, "step": 1084 }, { "epoch": 0.135625, "grad_norm": 4.231754779815674, "grad_norm_var": 0.6578597190419477, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.6625559329986572, "loss/hidden": 1.109375, "loss/logits": 0.2174825519323349, "loss/reg": 0.01775727979838848, "step": 1085 }, { "epoch": 0.13575, "grad_norm": 3.1539108753204346, "grad_norm_var": 0.6090813956553401, "learning_rate": 0.0001, "loss": 1.0553, "loss/crossentropy": 2.685743808746338, "loss/hidden": 0.74609375, "loss/logits": 0.1317606419324875, "loss/reg": 0.017748642712831497, "step": 1086 }, { "epoch": 0.135875, "grad_norm": 3.386591672897339, "grad_norm_var": 0.5946246391748462, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.3431785106658936, "loss/hidden": 1.0625, "loss/logits": 0.23952066898345947, "loss/reg": 0.01773969829082489, "step": 1087 }, { "epoch": 0.136, "grad_norm": 2.8175418376922607, "grad_norm_var": 0.4848234667031941, "learning_rate": 0.0001, "loss": 1.0416, "loss/crossentropy": 2.676400899887085, "loss/hidden": 0.73046875, "loss/logits": 0.13378942012786865, "loss/reg": 0.01773088052868843, "step": 1088 }, { "epoch": 0.136125, "grad_norm": 3.5386815071105957, "grad_norm_var": 0.4914580502239192, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.6673851013183594, "loss/hidden": 0.81640625, "loss/logits": 0.14545656740665436, "loss/reg": 0.01772209256887436, "step": 1089 }, { "epoch": 0.13625, "grad_norm": 2.944378614425659, "grad_norm_var": 0.2024704804136301, "learning_rate": 0.0001, "loss": 1.0518, "loss/crossentropy": 2.3069140911102295, "loss/hidden": 0.7578125, "loss/logits": 0.11681585013866425, "loss/reg": 0.017712950706481934, "step": 1090 }, { "epoch": 0.136375, "grad_norm": 2.4177603721618652, "grad_norm_var": 0.221225648364459, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.5605850219726562, "loss/hidden": 0.75390625, "loss/logits": 0.15096938610076904, "loss/reg": 0.01770433411002159, "step": 1091 }, { "epoch": 0.1365, "grad_norm": 2.8017842769622803, "grad_norm_var": 0.2154010561648003, "learning_rate": 0.0001, "loss": 1.0446, "loss/crossentropy": 2.837974786758423, "loss/hidden": 0.7421875, "loss/logits": 0.12545213103294373, "loss/reg": 0.017695914953947067, "step": 1092 }, { "epoch": 0.136625, "grad_norm": 2.30322265625, "grad_norm_var": 0.2284775401570935, "learning_rate": 0.0001, "loss": 1.0164, "loss/crossentropy": 2.645153760910034, "loss/hidden": 0.7109375, "loss/logits": 0.1285744607448578, "loss/reg": 0.017688019201159477, "step": 1093 }, { "epoch": 0.13675, "grad_norm": 3.5715980529785156, "grad_norm_var": 0.24192379822145516, "learning_rate": 0.0001, "loss": 1.1766, "loss/crossentropy": 2.2281081676483154, "loss/hidden": 0.8359375, "loss/logits": 0.16386280953884125, "loss/reg": 0.017679255455732346, "step": 1094 }, { "epoch": 0.136875, "grad_norm": 3.5261693000793457, "grad_norm_var": 0.24598854725778285, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.311053514480591, "loss/hidden": 1.03125, "loss/logits": 0.22651731967926025, "loss/reg": 0.017671290785074234, "step": 1095 }, { "epoch": 0.137, "grad_norm": 4.736113548278809, "grad_norm_var": 0.3858102993473473, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.483963966369629, "loss/hidden": 0.8046875, "loss/logits": 0.1318715214729309, "loss/reg": 0.01766252890229225, "step": 1096 }, { "epoch": 0.137125, "grad_norm": 3.013045310974121, "grad_norm_var": 0.3815164758052994, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.3578484058380127, "loss/hidden": 0.75390625, "loss/logits": 0.12263330817222595, "loss/reg": 0.017653891816735268, "step": 1097 }, { "epoch": 0.13725, "grad_norm": 3.655559539794922, "grad_norm_var": 0.3909346674988434, "learning_rate": 0.0001, "loss": 1.0558, "loss/crossentropy": 2.7702715396881104, "loss/hidden": 0.7421875, "loss/logits": 0.13715983927249908, "loss/reg": 0.01764553412795067, "step": 1098 }, { "epoch": 0.137375, "grad_norm": 4.493204593658447, "grad_norm_var": 0.47566105167650385, "learning_rate": 0.0001, "loss": 1.1521, "loss/crossentropy": 3.274940013885498, "loss/hidden": 0.765625, "loss/logits": 0.21013642847537994, "loss/reg": 0.01763724535703659, "step": 1099 }, { "epoch": 0.1375, "grad_norm": 2.9585206508636475, "grad_norm_var": 0.482309950085633, "learning_rate": 0.0001, "loss": 1.0561, "loss/crossentropy": 2.3324267864227295, "loss/hidden": 0.75, "loss/logits": 0.12980124354362488, "loss/reg": 0.017628395929932594, "step": 1100 }, { "epoch": 0.137625, "grad_norm": 4.262050628662109, "grad_norm_var": 0.4859417805331835, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.4208176136016846, "loss/hidden": 0.80078125, "loss/logits": 0.12208649516105652, "loss/reg": 0.017619585618376732, "step": 1101 }, { "epoch": 0.13775, "grad_norm": 2.6123526096343994, "grad_norm_var": 0.5183416158931512, "learning_rate": 0.0001, "loss": 0.9988, "loss/crossentropy": 2.6298210620880127, "loss/hidden": 0.69140625, "loss/logits": 0.13125354051589966, "loss/reg": 0.017610682174563408, "step": 1102 }, { "epoch": 0.137875, "grad_norm": 3.968358039855957, "grad_norm_var": 0.5450550638000642, "learning_rate": 0.0001, "loss": 1.1401, "loss/crossentropy": 2.4403319358825684, "loss/hidden": 0.80859375, "loss/logits": 0.15549315512180328, "loss/reg": 0.01760167069733143, "step": 1103 }, { "epoch": 0.138, "grad_norm": 2.6694836616516113, "grad_norm_var": 0.5569615426057339, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.35309100151062, "loss/hidden": 0.8984375, "loss/logits": 0.1771024614572525, "loss/reg": 0.01759263686835766, "step": 1104 }, { "epoch": 0.138125, "grad_norm": 2.5527474880218506, "grad_norm_var": 0.5918626570387079, "learning_rate": 0.0001, "loss": 0.9445, "loss/crossentropy": 2.5089659690856934, "loss/hidden": 0.65625, "loss/logits": 0.11239723861217499, "loss/reg": 0.017583860084414482, "step": 1105 }, { "epoch": 0.13825, "grad_norm": 3.055800676345825, "grad_norm_var": 0.5876466077321024, "learning_rate": 0.0001, "loss": 0.9616, "loss/crossentropy": 3.0698087215423584, "loss/hidden": 0.66796875, "loss/logits": 0.11786890029907227, "loss/reg": 0.01757502555847168, "step": 1106 }, { "epoch": 0.138375, "grad_norm": 2.3708860874176025, "grad_norm_var": 0.5932188518407192, "learning_rate": 0.0001, "loss": 1.1363, "loss/crossentropy": 2.364453077316284, "loss/hidden": 0.82421875, "loss/logits": 0.13644903898239136, "loss/reg": 0.017566362395882607, "step": 1107 }, { "epoch": 0.1385, "grad_norm": 3.3530497550964355, "grad_norm_var": 0.5767366681943767, "learning_rate": 0.0001, "loss": 1.1719, "loss/crossentropy": 2.339834451675415, "loss/hidden": 0.86328125, "loss/logits": 0.13302001357078552, "loss/reg": 0.017557917162775993, "step": 1108 }, { "epoch": 0.138625, "grad_norm": 4.9073686599731445, "grad_norm_var": 0.6479273995103992, "learning_rate": 0.0001, "loss": 1.144, "loss/crossentropy": 2.608236074447632, "loss/hidden": 0.8359375, "loss/logits": 0.1325811743736267, "loss/reg": 0.017549151554703712, "step": 1109 }, { "epoch": 0.13875, "grad_norm": 4.318938732147217, "grad_norm_var": 0.6917982612527337, "learning_rate": 0.0001, "loss": 1.0295, "loss/crossentropy": 2.6003918647766113, "loss/hidden": 0.73046875, "loss/logits": 0.12366551160812378, "loss/reg": 0.017540371045470238, "step": 1110 }, { "epoch": 0.138875, "grad_norm": 3.439058303833008, "grad_norm_var": 0.692297895774635, "learning_rate": 0.0001, "loss": 1.1891, "loss/crossentropy": 2.3828976154327393, "loss/hidden": 0.859375, "loss/logits": 0.1544424593448639, "loss/reg": 0.01753184385597706, "step": 1111 }, { "epoch": 0.139, "grad_norm": 3.090857744216919, "grad_norm_var": 0.5953394071265308, "learning_rate": 0.0001, "loss": 1.1475, "loss/crossentropy": 2.2759549617767334, "loss/hidden": 0.8125, "loss/logits": 0.1597966104745865, "loss/reg": 0.017523299902677536, "step": 1112 }, { "epoch": 0.139125, "grad_norm": 2.421586513519287, "grad_norm_var": 0.6493026217043355, "learning_rate": 0.0001, "loss": 1.0266, "loss/crossentropy": 2.580230236053467, "loss/hidden": 0.7265625, "loss/logits": 0.12484898418188095, "loss/reg": 0.017514871433377266, "step": 1113 }, { "epoch": 0.13925, "grad_norm": 2.3141372203826904, "grad_norm_var": 0.7130373793975477, "learning_rate": 0.0001, "loss": 1.0674, "loss/crossentropy": 2.7415308952331543, "loss/hidden": 0.75, "loss/logits": 0.14231345057487488, "loss/reg": 0.01750599592924118, "step": 1114 }, { "epoch": 0.139375, "grad_norm": 3.1447672843933105, "grad_norm_var": 0.6120215321394992, "learning_rate": 0.0001, "loss": 1.0712, "loss/crossentropy": 2.4831013679504395, "loss/hidden": 0.76171875, "loss/logits": 0.13447736203670502, "loss/reg": 0.017497511580586433, "step": 1115 }, { "epoch": 0.1395, "grad_norm": 3.2429728507995605, "grad_norm_var": 0.6073512012070741, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.567009925842285, "loss/hidden": 0.80078125, "loss/logits": 0.1554429829120636, "loss/reg": 0.01748875342309475, "step": 1116 }, { "epoch": 0.139625, "grad_norm": 3.240558624267578, "grad_norm_var": 0.532380465942029, "learning_rate": 0.0001, "loss": 1.1099, "loss/crossentropy": 2.860992431640625, "loss/hidden": 0.79296875, "loss/logits": 0.1421348601579666, "loss/reg": 0.01748001016676426, "step": 1117 }, { "epoch": 0.13975, "grad_norm": 3.012918710708618, "grad_norm_var": 0.5126825052838881, "learning_rate": 0.0001, "loss": 1.1268, "loss/crossentropy": 2.4657137393951416, "loss/hidden": 0.8125, "loss/logits": 0.13955920934677124, "loss/reg": 0.017471779137849808, "step": 1118 }, { "epoch": 0.139875, "grad_norm": 2.7599992752075195, "grad_norm_var": 0.47917524489163554, "learning_rate": 0.0001, "loss": 1.0584, "loss/crossentropy": 2.366868019104004, "loss/hidden": 0.73828125, "loss/logits": 0.1455003321170807, "loss/reg": 0.017463646829128265, "step": 1119 }, { "epoch": 0.14, "grad_norm": 3.73720645904541, "grad_norm_var": 0.48651163922628105, "learning_rate": 0.0001, "loss": 1.2863, "loss/crossentropy": 2.9208223819732666, "loss/hidden": 0.93359375, "loss/logits": 0.17812727391719818, "loss/reg": 0.017455046996474266, "step": 1120 }, { "epoch": 0.140125, "grad_norm": 2.549912691116333, "grad_norm_var": 0.48675118323949296, "learning_rate": 0.0001, "loss": 1.0037, "loss/crossentropy": 2.217289447784424, "loss/hidden": 0.70703125, "loss/logits": 0.12221944332122803, "loss/reg": 0.017446177080273628, "step": 1121 }, { "epoch": 0.14025, "grad_norm": 3.35829758644104, "grad_norm_var": 0.48725917149039616, "learning_rate": 0.0001, "loss": 0.9516, "loss/crossentropy": 2.5682172775268555, "loss/hidden": 0.66015625, "loss/logits": 0.1170571893453598, "loss/reg": 0.017437297850847244, "step": 1122 }, { "epoch": 0.140375, "grad_norm": 3.384793519973755, "grad_norm_var": 0.4388955051274242, "learning_rate": 0.0001, "loss": 1.107, "loss/crossentropy": 2.8147690296173096, "loss/hidden": 0.78515625, "loss/logits": 0.14758194983005524, "loss/reg": 0.017428115010261536, "step": 1123 }, { "epoch": 0.1405, "grad_norm": 2.5823116302490234, "grad_norm_var": 0.4672083375473524, "learning_rate": 0.0001, "loss": 1.0724, "loss/crossentropy": 2.696993827819824, "loss/hidden": 0.75390625, "loss/logits": 0.14429491758346558, "loss/reg": 0.017419347539544106, "step": 1124 }, { "epoch": 0.140625, "grad_norm": 5.868249416351318, "grad_norm_var": 0.7412100386445967, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.6103923320770264, "loss/hidden": 0.8984375, "loss/logits": 0.16118671000003815, "loss/reg": 0.017410660162568092, "step": 1125 }, { "epoch": 0.14075, "grad_norm": 2.4435040950775146, "grad_norm_var": 0.7010336436922195, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.3706815242767334, "loss/hidden": 0.6640625, "loss/logits": 0.12864476442337036, "loss/reg": 0.01740197278559208, "step": 1126 }, { "epoch": 0.140875, "grad_norm": 2.7513833045959473, "grad_norm_var": 0.7051812497689844, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.4533605575561523, "loss/hidden": 0.81640625, "loss/logits": 0.15479305386543274, "loss/reg": 0.017393220216035843, "step": 1127 }, { "epoch": 0.141, "grad_norm": 2.8634872436523438, "grad_norm_var": 0.7092644673154382, "learning_rate": 0.0001, "loss": 1.1193, "loss/crossentropy": 2.1886956691741943, "loss/hidden": 0.8046875, "loss/logits": 0.14075767993927002, "loss/reg": 0.01738525740802288, "step": 1128 }, { "epoch": 0.141125, "grad_norm": 2.704045057296753, "grad_norm_var": 0.6885219755537205, "learning_rate": 0.0001, "loss": 0.973, "loss/crossentropy": 2.4812088012695312, "loss/hidden": 0.65625, "loss/logits": 0.1429394781589508, "loss/reg": 0.01737692952156067, "step": 1129 }, { "epoch": 0.14125, "grad_norm": 2.6875903606414795, "grad_norm_var": 0.656991790963667, "learning_rate": 0.0001, "loss": 1.0697, "loss/crossentropy": 2.3832883834838867, "loss/hidden": 0.76953125, "loss/logits": 0.126431405544281, "loss/reg": 0.017368923872709274, "step": 1130 }, { "epoch": 0.141375, "grad_norm": 7.774624824523926, "grad_norm_var": 1.9961090220751256, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.5491535663604736, "loss/hidden": 1.0390625, "loss/logits": 0.1702873259782791, "loss/reg": 0.01736014150083065, "step": 1131 }, { "epoch": 0.1415, "grad_norm": 3.5188465118408203, "grad_norm_var": 1.9937980339815395, "learning_rate": 0.0001, "loss": 1.1816, "loss/crossentropy": 2.6813807487487793, "loss/hidden": 0.84375, "loss/logits": 0.16434520483016968, "loss/reg": 0.017351284623146057, "step": 1132 }, { "epoch": 0.141625, "grad_norm": 30.482587814331055, "grad_norm_var": 47.607494749063044, "learning_rate": 0.0001, "loss": 1.0592, "loss/crossentropy": 2.607267379760742, "loss/hidden": 0.75390625, "loss/logits": 0.13182219862937927, "loss/reg": 0.01734289340674877, "step": 1133 }, { "epoch": 0.14175, "grad_norm": 3.333674907684326, "grad_norm_var": 47.52231423180526, "learning_rate": 0.0001, "loss": 1.1791, "loss/crossentropy": 2.5332696437835693, "loss/hidden": 0.85546875, "loss/logits": 0.15026560425758362, "loss/reg": 0.017334245145320892, "step": 1134 }, { "epoch": 0.141875, "grad_norm": 2.8526668548583984, "grad_norm_var": 47.4930115697571, "learning_rate": 0.0001, "loss": 1.1235, "loss/crossentropy": 2.583221673965454, "loss/hidden": 0.8125, "loss/logits": 0.13772843778133392, "loss/reg": 0.017325541004538536, "step": 1135 }, { "epoch": 0.142, "grad_norm": 3.022386074066162, "grad_norm_var": 47.66253737043978, "learning_rate": 0.0001, "loss": 1.2714, "loss/crossentropy": 2.3197121620178223, "loss/hidden": 0.9453125, "loss/logits": 0.15295147895812988, "loss/reg": 0.017316767945885658, "step": 1136 }, { "epoch": 0.142125, "grad_norm": 3.3717892169952393, "grad_norm_var": 47.421346164152666, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.543901205062866, "loss/hidden": 0.8203125, "loss/logits": 0.13759344816207886, "loss/reg": 0.017307989299297333, "step": 1137 }, { "epoch": 0.14225, "grad_norm": 3.5415263175964355, "grad_norm_var": 47.37875577313657, "learning_rate": 0.0001, "loss": 1.0293, "loss/crossentropy": 2.4758806228637695, "loss/hidden": 0.73046875, "loss/logits": 0.12582623958587646, "loss/reg": 0.01730157807469368, "step": 1138 }, { "epoch": 0.142375, "grad_norm": 2.9773669242858887, "grad_norm_var": 47.48768287025903, "learning_rate": 0.0001, "loss": 1.0163, "loss/crossentropy": 2.638218402862549, "loss/hidden": 0.71484375, "loss/logits": 0.12850065529346466, "loss/reg": 0.017293203622102737, "step": 1139 }, { "epoch": 0.1425, "grad_norm": 2.8083419799804688, "grad_norm_var": 47.41278427285148, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.503974676132202, "loss/hidden": 0.7578125, "loss/logits": 0.1638229787349701, "loss/reg": 0.01728462241590023, "step": 1140 }, { "epoch": 0.142625, "grad_norm": 2.425929069519043, "grad_norm_var": 47.84099408884167, "learning_rate": 0.0001, "loss": 1.0449, "loss/crossentropy": 2.3581292629241943, "loss/hidden": 0.75, "loss/logits": 0.12217840552330017, "loss/reg": 0.017276806756854057, "step": 1141 }, { "epoch": 0.14275, "grad_norm": 39.945011138916016, "grad_norm_var": 123.09327375389867, "learning_rate": 0.0001, "loss": 1.0453, "loss/crossentropy": 2.5906147956848145, "loss/hidden": 0.75390625, "loss/logits": 0.11872614920139313, "loss/reg": 0.017268478870391846, "step": 1142 }, { "epoch": 0.142875, "grad_norm": 3.0662145614624023, "grad_norm_var": 122.90784367859821, "learning_rate": 0.0001, "loss": 1.0595, "loss/crossentropy": 2.2695305347442627, "loss/hidden": 0.7578125, "loss/logits": 0.1291281282901764, "loss/reg": 0.017260266467928886, "step": 1143 }, { "epoch": 0.143, "grad_norm": 4.583246231079102, "grad_norm_var": 122.06713805652959, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.480984926223755, "loss/hidden": 0.8125, "loss/logits": 0.16442613303661346, "loss/reg": 0.017252640798687935, "step": 1144 }, { "epoch": 0.143125, "grad_norm": 2.981353521347046, "grad_norm_var": 121.89670586108006, "learning_rate": 0.0001, "loss": 1.1551, "loss/crossentropy": 2.2689995765686035, "loss/hidden": 0.85546875, "loss/logits": 0.12721604108810425, "loss/reg": 0.01724405400454998, "step": 1145 }, { "epoch": 0.14325, "grad_norm": 2.3794004917144775, "grad_norm_var": 122.09878373545942, "learning_rate": 0.0001, "loss": 1.0441, "loss/crossentropy": 2.5703084468841553, "loss/hidden": 0.73828125, "loss/logits": 0.13347238302230835, "loss/reg": 0.017235582694411278, "step": 1146 }, { "epoch": 0.143375, "grad_norm": 2.4713571071624756, "grad_norm_var": 123.62106362597764, "learning_rate": 0.0001, "loss": 0.9492, "loss/crossentropy": 2.447471857070923, "loss/hidden": 0.65625, "loss/logits": 0.12064240872859955, "loss/reg": 0.017227739095687866, "step": 1147 }, { "epoch": 0.1435, "grad_norm": 2.6587135791778564, "grad_norm_var": 124.07916434426059, "learning_rate": 0.0001, "loss": 1.0818, "loss/crossentropy": 2.6435859203338623, "loss/hidden": 0.75390625, "loss/logits": 0.15571220219135284, "loss/reg": 0.017219962552189827, "step": 1148 }, { "epoch": 0.143625, "grad_norm": 2.5300068855285645, "grad_norm_var": 85.6034890468201, "learning_rate": 0.0001, "loss": 1.0955, "loss/crossentropy": 2.370417356491089, "loss/hidden": 0.7890625, "loss/logits": 0.13428199291229248, "loss/reg": 0.017211301252245903, "step": 1149 }, { "epoch": 0.14375, "grad_norm": 2.821120023727417, "grad_norm_var": 85.75492487355808, "learning_rate": 0.0001, "loss": 1.1666, "loss/crossentropy": 2.331489324569702, "loss/hidden": 0.8359375, "loss/logits": 0.15861350297927856, "loss/reg": 0.017202915623784065, "step": 1150 }, { "epoch": 0.143875, "grad_norm": 2.5067081451416016, "grad_norm_var": 85.8742473316794, "learning_rate": 0.0001, "loss": 0.9844, "loss/crossentropy": 2.466968297958374, "loss/hidden": 0.69140625, "loss/logits": 0.12101612985134125, "loss/reg": 0.017194462940096855, "step": 1151 }, { "epoch": 0.144, "grad_norm": 3.062662124633789, "grad_norm_var": 85.8623557526669, "learning_rate": 0.0001, "loss": 1.0339, "loss/crossentropy": 2.409207820892334, "loss/hidden": 0.7265625, "loss/logits": 0.13548848032951355, "loss/reg": 0.017186442390084267, "step": 1152 }, { "epoch": 0.144125, "grad_norm": 3.1716883182525635, "grad_norm_var": 85.9151871866652, "learning_rate": 0.0001, "loss": 1.0846, "loss/crossentropy": 2.4929358959198, "loss/hidden": 0.75390625, "loss/logits": 0.15896344184875488, "loss/reg": 0.0171778816729784, "step": 1153 }, { "epoch": 0.14425, "grad_norm": 9.434453010559082, "grad_norm_var": 86.74661652378359, "learning_rate": 0.0001, "loss": 2.0602, "loss/crossentropy": 2.4357495307922363, "loss/hidden": 1.6015625, "loss/logits": 0.28698208928108215, "loss/reg": 0.017169814556837082, "step": 1154 }, { "epoch": 0.144375, "grad_norm": 3.7739198207855225, "grad_norm_var": 86.506246361283, "learning_rate": 0.0001, "loss": 1.0704, "loss/crossentropy": 2.6996419429779053, "loss/hidden": 0.75390625, "loss/logits": 0.1448526829481125, "loss/reg": 0.017161287367343903, "step": 1155 }, { "epoch": 0.1445, "grad_norm": 4.359219551086426, "grad_norm_var": 86.06611929299979, "learning_rate": 0.0001, "loss": 1.2325, "loss/crossentropy": 2.506620168685913, "loss/hidden": 0.87109375, "loss/logits": 0.18992647528648376, "loss/reg": 0.017152708023786545, "step": 1156 }, { "epoch": 0.144625, "grad_norm": 3.83707594871521, "grad_norm_var": 85.56313319362654, "learning_rate": 0.0001, "loss": 1.1268, "loss/crossentropy": 2.392063856124878, "loss/hidden": 0.80859375, "loss/logits": 0.14678636193275452, "loss/reg": 0.017143724486231804, "step": 1157 }, { "epoch": 0.14475, "grad_norm": 2.9818544387817383, "grad_norm_var": 2.9152543868814065, "learning_rate": 0.0001, "loss": 0.9894, "loss/crossentropy": 2.590273380279541, "loss/hidden": 0.6953125, "loss/logits": 0.1227588877081871, "loss/reg": 0.01713474653661251, "step": 1158 }, { "epoch": 0.144875, "grad_norm": 2.5520756244659424, "grad_norm_var": 2.964164435968295, "learning_rate": 0.0001, "loss": 1.1502, "loss/crossentropy": 2.3040196895599365, "loss/hidden": 0.83203125, "loss/logits": 0.1469496786594391, "loss/reg": 0.017125625163316727, "step": 1159 }, { "epoch": 0.145, "grad_norm": 3.3840601444244385, "grad_norm_var": 2.881888386237869, "learning_rate": 0.0001, "loss": 1.0954, "loss/crossentropy": 2.4384047985076904, "loss/hidden": 0.78515625, "loss/logits": 0.13907559216022491, "loss/reg": 0.017117124050855637, "step": 1160 }, { "epoch": 0.145125, "grad_norm": 6.576678276062012, "grad_norm_var": 3.47394619120245, "learning_rate": 0.0001, "loss": 1.7801, "loss/crossentropy": 2.66701078414917, "loss/hidden": 1.2578125, "loss/logits": 0.35119134187698364, "loss/reg": 0.017108280211687088, "step": 1161 }, { "epoch": 0.14525, "grad_norm": 2.326526165008545, "grad_norm_var": 3.483123034262428, "learning_rate": 0.0001, "loss": 1.0016, "loss/crossentropy": 2.351609706878662, "loss/hidden": 0.70703125, "loss/logits": 0.12352467328310013, "loss/reg": 0.01709994673728943, "step": 1162 }, { "epoch": 0.145375, "grad_norm": 9.206579208374023, "grad_norm_var": 5.257167082686383, "learning_rate": 0.0001, "loss": 1.8176, "loss/crossentropy": 2.4763855934143066, "loss/hidden": 1.3515625, "loss/logits": 0.29511886835098267, "loss/reg": 0.017091669142246246, "step": 1163 }, { "epoch": 0.1455, "grad_norm": 2.9634292125701904, "grad_norm_var": 5.205470661734309, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.5422465801239014, "loss/hidden": 0.828125, "loss/logits": 0.14583711326122284, "loss/reg": 0.01708296500146389, "step": 1164 }, { "epoch": 0.145625, "grad_norm": 3.018620014190674, "grad_norm_var": 5.118565326969832, "learning_rate": 0.0001, "loss": 1.1524, "loss/crossentropy": 2.559739828109741, "loss/hidden": 0.82421875, "loss/logits": 0.15742075443267822, "loss/reg": 0.01707414537668228, "step": 1165 }, { "epoch": 0.14575, "grad_norm": 8.845293045043945, "grad_norm_var": 6.340596335373259, "learning_rate": 0.0001, "loss": 1.558, "loss/crossentropy": 2.8445377349853516, "loss/hidden": 1.140625, "loss/logits": 0.24674071371555328, "loss/reg": 0.017065750434994698, "step": 1166 }, { "epoch": 0.145875, "grad_norm": 4.030366897583008, "grad_norm_var": 6.0807354199013615, "learning_rate": 0.0001, "loss": 1.2387, "loss/crossentropy": 2.525634765625, "loss/hidden": 0.8828125, "loss/logits": 0.18527851998806, "loss/reg": 0.017056919634342194, "step": 1167 }, { "epoch": 0.146, "grad_norm": 4.10527229309082, "grad_norm_var": 5.935618580704363, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.657388687133789, "loss/hidden": 0.9765625, "loss/logits": 0.24190323054790497, "loss/reg": 0.01704811304807663, "step": 1168 }, { "epoch": 0.146125, "grad_norm": 3.8508963584899902, "grad_norm_var": 5.829627947687252, "learning_rate": 0.0001, "loss": 1.2453, "loss/crossentropy": 2.405853509902954, "loss/hidden": 0.90234375, "loss/logits": 0.17257630825042725, "loss/reg": 0.017038943246006966, "step": 1169 }, { "epoch": 0.14625, "grad_norm": 13.325060844421387, "grad_norm_var": 9.23016466799555, "learning_rate": 0.0001, "loss": 1.3331, "loss/crossentropy": 2.4716832637786865, "loss/hidden": 0.94140625, "loss/logits": 0.2213602215051651, "loss/reg": 0.01702967844903469, "step": 1170 }, { "epoch": 0.146375, "grad_norm": 6.264571189880371, "grad_norm_var": 9.228622011623397, "learning_rate": 0.0001, "loss": 1.1745, "loss/crossentropy": 2.685593605041504, "loss/hidden": 0.83984375, "loss/logits": 0.16446860134601593, "loss/reg": 0.017020443454384804, "step": 1171 }, { "epoch": 0.1465, "grad_norm": 4.161008358001709, "grad_norm_var": 9.25070050922357, "learning_rate": 0.0001, "loss": 1.2714, "loss/crossentropy": 2.329845428466797, "loss/hidden": 0.91796875, "loss/logits": 0.18332120776176453, "loss/reg": 0.01701117865741253, "step": 1172 }, { "epoch": 0.146625, "grad_norm": 2.7158303260803223, "grad_norm_var": 9.516487065581881, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.3517086505889893, "loss/hidden": 0.8203125, "loss/logits": 0.11249354481697083, "loss/reg": 0.01700259931385517, "step": 1173 }, { "epoch": 0.14675, "grad_norm": 4.040686130523682, "grad_norm_var": 9.298921738225228, "learning_rate": 0.0001, "loss": 1.4656, "loss/crossentropy": 1.98357093334198, "loss/hidden": 1.1328125, "loss/logits": 0.16282862424850464, "loss/reg": 0.01699363812804222, "step": 1174 }, { "epoch": 0.146875, "grad_norm": 3.484989643096924, "grad_norm_var": 9.038196456147912, "learning_rate": 0.0001, "loss": 1.1281, "loss/crossentropy": 2.7924225330352783, "loss/hidden": 0.83984375, "loss/logits": 0.11844426393508911, "loss/reg": 0.016985056921839714, "step": 1175 }, { "epoch": 0.147, "grad_norm": 2.9667844772338867, "grad_norm_var": 9.146981868389197, "learning_rate": 0.0001, "loss": 1.202, "loss/crossentropy": 2.405456781387329, "loss/hidden": 0.875, "loss/logits": 0.15722399950027466, "loss/reg": 0.01697635091841221, "step": 1176 }, { "epoch": 0.147125, "grad_norm": 5.164290904998779, "grad_norm_var": 8.996899765603606, "learning_rate": 0.0001, "loss": 1.1927, "loss/crossentropy": 2.6594996452331543, "loss/hidden": 0.85546875, "loss/logits": 0.16758760809898376, "loss/reg": 0.01696733944118023, "step": 1177 }, { "epoch": 0.14725, "grad_norm": 3.2219250202178955, "grad_norm_var": 8.724323229467464, "learning_rate": 0.0001, "loss": 1.216, "loss/crossentropy": 2.5696029663085938, "loss/hidden": 0.8828125, "loss/logits": 0.16358155012130737, "loss/reg": 0.016958681866526604, "step": 1178 }, { "epoch": 0.147375, "grad_norm": 2.848184108734131, "grad_norm_var": 7.7572272221691225, "learning_rate": 0.0001, "loss": 1.0605, "loss/crossentropy": 2.317523241043091, "loss/hidden": 0.7578125, "loss/logits": 0.1332237273454666, "loss/reg": 0.016949651762843132, "step": 1179 }, { "epoch": 0.1475, "grad_norm": 5.728630065917969, "grad_norm_var": 7.5993034900551715, "learning_rate": 0.0001, "loss": 1.1138, "loss/crossentropy": 2.771470308303833, "loss/hidden": 0.80859375, "loss/logits": 0.13583886623382568, "loss/reg": 0.016941089183092117, "step": 1180 }, { "epoch": 0.147625, "grad_norm": 3.538257122039795, "grad_norm_var": 7.488546256518004, "learning_rate": 0.0001, "loss": 1.0898, "loss/crossentropy": 2.7069919109344482, "loss/hidden": 0.77734375, "loss/logits": 0.1431659609079361, "loss/reg": 0.01693253219127655, "step": 1181 }, { "epoch": 0.14775, "grad_norm": 3.1879353523254395, "grad_norm_var": 6.50782164977926, "learning_rate": 0.0001, "loss": 1.0471, "loss/crossentropy": 2.6199276447296143, "loss/hidden": 0.73828125, "loss/logits": 0.13955026865005493, "loss/reg": 0.01692408137023449, "step": 1182 }, { "epoch": 0.147875, "grad_norm": 4.523275375366211, "grad_norm_var": 6.489534724108, "learning_rate": 0.0001, "loss": 1.2453, "loss/crossentropy": 2.120800018310547, "loss/hidden": 0.92578125, "loss/logits": 0.1503942906856537, "loss/reg": 0.016915684565901756, "step": 1183 }, { "epoch": 0.148, "grad_norm": 3.735788345336914, "grad_norm_var": 6.520985106875844, "learning_rate": 0.0001, "loss": 1.0708, "loss/crossentropy": 2.6711180210113525, "loss/hidden": 0.78515625, "loss/logits": 0.11657284200191498, "loss/reg": 0.016907010227441788, "step": 1184 }, { "epoch": 0.148125, "grad_norm": 5.269720077514648, "grad_norm_var": 6.515042671209793, "learning_rate": 0.0001, "loss": 1.2068, "loss/crossentropy": 2.2848961353302, "loss/hidden": 0.875, "loss/logits": 0.1628110110759735, "loss/reg": 0.01689821295440197, "step": 1185 }, { "epoch": 0.14825, "grad_norm": 3.4472532272338867, "grad_norm_var": 1.169463016821809, "learning_rate": 0.0001, "loss": 1.0442, "loss/crossentropy": 2.6003897190093994, "loss/hidden": 0.7421875, "loss/logits": 0.13315384089946747, "loss/reg": 0.016889235004782677, "step": 1186 }, { "epoch": 0.148375, "grad_norm": 3.267479658126831, "grad_norm_var": 0.8333935781311614, "learning_rate": 0.0001, "loss": 1.135, "loss/crossentropy": 2.3998661041259766, "loss/hidden": 0.83203125, "loss/logits": 0.1341383457183838, "loss/reg": 0.01688062585890293, "step": 1187 }, { "epoch": 0.1485, "grad_norm": 3.262617826461792, "grad_norm_var": 0.8443526957342724, "learning_rate": 0.0001, "loss": 1.0085, "loss/crossentropy": 2.5633139610290527, "loss/hidden": 0.72265625, "loss/logits": 0.11711567640304565, "loss/reg": 0.016872059553861618, "step": 1188 }, { "epoch": 0.148625, "grad_norm": 4.104423522949219, "grad_norm_var": 0.7687216542662267, "learning_rate": 0.0001, "loss": 1.2662, "loss/crossentropy": 2.4028804302215576, "loss/hidden": 0.9296875, "loss/logits": 0.16792353987693787, "loss/reg": 0.016863279044628143, "step": 1189 }, { "epoch": 0.14875, "grad_norm": 3.4790539741516113, "grad_norm_var": 0.7750564154152146, "learning_rate": 0.0001, "loss": 1.3253, "loss/crossentropy": 2.6870651245117188, "loss/hidden": 0.9140625, "loss/logits": 0.24266795814037323, "loss/reg": 0.016854623332619667, "step": 1190 }, { "epoch": 0.148875, "grad_norm": 3.920074701309204, "grad_norm_var": 0.7670521683778655, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.006319761276245, "loss/hidden": 1.296875, "loss/logits": 0.27303898334503174, "loss/reg": 0.016845691949129105, "step": 1191 }, { "epoch": 0.149, "grad_norm": 3.1460444927215576, "grad_norm_var": 0.7478523869744016, "learning_rate": 0.0001, "loss": 1.1471, "loss/crossentropy": 2.322908401489258, "loss/hidden": 0.828125, "loss/logits": 0.15057553350925446, "loss/reg": 0.016836855560541153, "step": 1192 }, { "epoch": 0.149125, "grad_norm": 2.8855838775634766, "grad_norm_var": 0.6777176205160399, "learning_rate": 0.0001, "loss": 1.1942, "loss/crossentropy": 2.3052327632904053, "loss/hidden": 0.859375, "loss/logits": 0.16649743914604187, "loss/reg": 0.016828451305627823, "step": 1193 }, { "epoch": 0.14925, "grad_norm": 5.441497325897217, "grad_norm_var": 0.837366755929357, "learning_rate": 0.0001, "loss": 1.2583, "loss/crossentropy": 2.8180274963378906, "loss/hidden": 0.91015625, "loss/logits": 0.1799638569355011, "loss/reg": 0.016819985583424568, "step": 1194 }, { "epoch": 0.149375, "grad_norm": 2.6638519763946533, "grad_norm_var": 0.864398086647378, "learning_rate": 0.0001, "loss": 1.2269, "loss/crossentropy": 2.1777124404907227, "loss/hidden": 0.8984375, "loss/logits": 0.1603836864233017, "loss/reg": 0.016811655834317207, "step": 1195 }, { "epoch": 0.1495, "grad_norm": 3.1290552616119385, "grad_norm_var": 0.6356402025764699, "learning_rate": 0.0001, "loss": 1.063, "loss/crossentropy": 2.31772518157959, "loss/hidden": 0.76171875, "loss/logits": 0.13322719931602478, "loss/reg": 0.016803618520498276, "step": 1196 }, { "epoch": 0.149625, "grad_norm": 2.8640289306640625, "grad_norm_var": 0.6774789250719541, "learning_rate": 0.0001, "loss": 1.2119, "loss/crossentropy": 2.603073835372925, "loss/hidden": 0.8828125, "loss/logits": 0.16111129522323608, "loss/reg": 0.016795063391327858, "step": 1197 }, { "epoch": 0.14975, "grad_norm": 2.704012155532837, "grad_norm_var": 0.7216374904878362, "learning_rate": 0.0001, "loss": 1.0581, "loss/crossentropy": 2.5079526901245117, "loss/hidden": 0.76171875, "loss/logits": 0.12854108214378357, "loss/reg": 0.016786765307188034, "step": 1198 }, { "epoch": 0.149875, "grad_norm": 3.260575532913208, "grad_norm_var": 0.6684105203930727, "learning_rate": 0.0001, "loss": 1.1244, "loss/crossentropy": 2.43037486076355, "loss/hidden": 0.80859375, "loss/logits": 0.14800330996513367, "loss/reg": 0.01677859202027321, "step": 1199 }, { "epoch": 0.15, "grad_norm": 2.9981093406677246, "grad_norm_var": 0.6828016535629103, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.5041277408599854, "loss/hidden": 1.0625, "loss/logits": 0.2263844907283783, "loss/reg": 0.016770560294389725, "step": 1200 }, { "epoch": 0.150125, "grad_norm": 3.0788891315460205, "grad_norm_var": 0.4629717181381217, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.586244821548462, "loss/hidden": 0.76953125, "loss/logits": 0.1478727161884308, "loss/reg": 0.016761835664510727, "step": 1201 }, { "epoch": 0.15025, "grad_norm": 2.979407787322998, "grad_norm_var": 0.47078996164599, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.566535472869873, "loss/hidden": 0.79296875, "loss/logits": 0.1514027863740921, "loss/reg": 0.016753433272242546, "step": 1202 }, { "epoch": 0.150375, "grad_norm": 3.7025930881500244, "grad_norm_var": 0.4793410999759106, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.696070909500122, "loss/hidden": 0.80078125, "loss/logits": 0.14427737891674042, "loss/reg": 0.016745014116168022, "step": 1203 }, { "epoch": 0.1505, "grad_norm": 3.7526400089263916, "grad_norm_var": 0.48855855062033865, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.3625986576080322, "loss/hidden": 1.0859375, "loss/logits": 0.21884310245513916, "loss/reg": 0.016736432909965515, "step": 1204 }, { "epoch": 0.150625, "grad_norm": 2.6353213787078857, "grad_norm_var": 0.4819149135234577, "learning_rate": 0.0001, "loss": 1.0125, "loss/crossentropy": 2.5642147064208984, "loss/hidden": 0.72265625, "loss/logits": 0.1225152313709259, "loss/reg": 0.01672798953950405, "step": 1205 }, { "epoch": 0.15075, "grad_norm": 2.7348835468292236, "grad_norm_var": 0.4977728974757267, "learning_rate": 0.0001, "loss": 1.0998, "loss/crossentropy": 2.2675602436065674, "loss/hidden": 0.80078125, "loss/logits": 0.13179215788841248, "loss/reg": 0.016719957813620567, "step": 1206 }, { "epoch": 0.150875, "grad_norm": 2.585419178009033, "grad_norm_var": 0.4887115845786629, "learning_rate": 0.0001, "loss": 1.0439, "loss/crossentropy": 2.3835902214050293, "loss/hidden": 0.73828125, "loss/logits": 0.13845473527908325, "loss/reg": 0.01671142503619194, "step": 1207 }, { "epoch": 0.151, "grad_norm": 2.457369804382324, "grad_norm_var": 0.519646055542415, "learning_rate": 0.0001, "loss": 1.0538, "loss/crossentropy": 2.719109296798706, "loss/hidden": 0.7578125, "loss/logits": 0.129006028175354, "loss/reg": 0.01670280657708645, "step": 1208 }, { "epoch": 0.151125, "grad_norm": 2.794466733932495, "grad_norm_var": 0.5229773551564895, "learning_rate": 0.0001, "loss": 1.1287, "loss/crossentropy": 2.4790573120117188, "loss/hidden": 0.8203125, "loss/logits": 0.1414172500371933, "loss/reg": 0.016694119200110435, "step": 1209 }, { "epoch": 0.15125, "grad_norm": 2.713892698287964, "grad_norm_var": 0.14054897219938098, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.6969070434570312, "loss/hidden": 0.84765625, "loss/logits": 0.1394297033548355, "loss/reg": 0.01668514870107174, "step": 1210 }, { "epoch": 0.151375, "grad_norm": 3.024172782897949, "grad_norm_var": 0.13535290931035035, "learning_rate": 0.0001, "loss": 0.9908, "loss/crossentropy": 2.757359027862549, "loss/hidden": 0.703125, "loss/logits": 0.1209060549736023, "loss/reg": 0.016675440594553947, "step": 1211 }, { "epoch": 0.1515, "grad_norm": 2.4915895462036133, "grad_norm_var": 0.14667295132687527, "learning_rate": 0.0001, "loss": 1.0311, "loss/crossentropy": 2.311927080154419, "loss/hidden": 0.734375, "loss/logits": 0.13000784814357758, "loss/reg": 0.01666680909693241, "step": 1212 }, { "epoch": 0.151625, "grad_norm": 3.0987884998321533, "grad_norm_var": 0.1482532510455627, "learning_rate": 0.0001, "loss": 0.9663, "loss/crossentropy": 2.665640354156494, "loss/hidden": 0.68359375, "loss/logits": 0.11612722277641296, "loss/reg": 0.016658229753375053, "step": 1213 }, { "epoch": 0.15175, "grad_norm": 3.144266128540039, "grad_norm_var": 0.1466168566420831, "learning_rate": 0.0001, "loss": 1.2371, "loss/crossentropy": 2.5783543586730957, "loss/hidden": 0.9140625, "loss/logits": 0.1565355509519577, "loss/reg": 0.01664978824555874, "step": 1214 }, { "epoch": 0.151875, "grad_norm": 2.5145692825317383, "grad_norm_var": 0.15207652538272973, "learning_rate": 0.0001, "loss": 0.9714, "loss/crossentropy": 2.4898011684417725, "loss/hidden": 0.6796875, "loss/logits": 0.12530556321144104, "loss/reg": 0.01664099656045437, "step": 1215 }, { "epoch": 0.152, "grad_norm": 2.5222647190093994, "grad_norm_var": 0.16121854801189647, "learning_rate": 0.0001, "loss": 1.073, "loss/crossentropy": 2.417036533355713, "loss/hidden": 0.76171875, "loss/logits": 0.14500710368156433, "loss/reg": 0.01663181744515896, "step": 1216 }, { "epoch": 0.152125, "grad_norm": 2.7117919921875, "grad_norm_var": 0.16036668917481525, "learning_rate": 0.0001, "loss": 1.0801, "loss/crossentropy": 2.658942461013794, "loss/hidden": 0.76953125, "loss/logits": 0.14434798061847687, "loss/reg": 0.016622193157672882, "step": 1217 }, { "epoch": 0.15225, "grad_norm": 2.9744222164154053, "grad_norm_var": 0.16029316464901414, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.491722822189331, "loss/hidden": 0.84375, "loss/logits": 0.17787563800811768, "loss/reg": 0.016612496227025986, "step": 1218 }, { "epoch": 0.152375, "grad_norm": 3.3779635429382324, "grad_norm_var": 0.1306752736974545, "learning_rate": 0.0001, "loss": 1.0725, "loss/crossentropy": 2.4862918853759766, "loss/hidden": 0.7578125, "loss/logits": 0.14869090914726257, "loss/reg": 0.01660403423011303, "step": 1219 }, { "epoch": 0.1525, "grad_norm": 2.5530524253845215, "grad_norm_var": 0.07557910361394207, "learning_rate": 0.0001, "loss": 1.0191, "loss/crossentropy": 2.614441394805908, "loss/hidden": 0.72265625, "loss/logits": 0.13050062954425812, "loss/reg": 0.016595516353845596, "step": 1220 }, { "epoch": 0.152625, "grad_norm": 2.378697156906128, "grad_norm_var": 0.08433378351046841, "learning_rate": 0.0001, "loss": 1.0027, "loss/crossentropy": 2.529618501663208, "loss/hidden": 0.72265625, "loss/logits": 0.11422193050384521, "loss/reg": 0.016587061807513237, "step": 1221 }, { "epoch": 0.15275, "grad_norm": 3.0336711406707764, "grad_norm_var": 0.08911795415122749, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.5050387382507324, "loss/hidden": 0.75390625, "loss/logits": 0.15298862755298615, "loss/reg": 0.016578199341893196, "step": 1222 }, { "epoch": 0.152875, "grad_norm": 5.338172435760498, "grad_norm_var": 0.4936799710714484, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.6853761672973633, "loss/hidden": 0.80859375, "loss/logits": 0.13811737298965454, "loss/reg": 0.016569815576076508, "step": 1223 }, { "epoch": 0.153, "grad_norm": 2.962609052658081, "grad_norm_var": 0.47674628875148056, "learning_rate": 0.0001, "loss": 1.0689, "loss/crossentropy": 2.441126585006714, "loss/hidden": 0.765625, "loss/logits": 0.13769929111003876, "loss/reg": 0.016561318188905716, "step": 1224 }, { "epoch": 0.153125, "grad_norm": 2.7154529094696045, "grad_norm_var": 0.4790610818976868, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.4226393699645996, "loss/hidden": 0.796875, "loss/logits": 0.14224863052368164, "loss/reg": 0.016552859917283058, "step": 1225 }, { "epoch": 0.15325, "grad_norm": 3.163189172744751, "grad_norm_var": 0.4762029205706464, "learning_rate": 0.0001, "loss": 1.1356, "loss/crossentropy": 2.642491340637207, "loss/hidden": 0.81640625, "loss/logits": 0.1537851244211197, "loss/reg": 0.016544297337532043, "step": 1226 }, { "epoch": 0.153375, "grad_norm": 8.438536643981934, "grad_norm_var": 2.3256512762465564, "learning_rate": 0.0001, "loss": 1.4483, "loss/crossentropy": 2.159141778945923, "loss/hidden": 1.0703125, "loss/logits": 0.21258942782878876, "loss/reg": 0.016535377129912376, "step": 1227 }, { "epoch": 0.1535, "grad_norm": 16.965858459472656, "grad_norm_var": 13.784859138236738, "learning_rate": 0.0001, "loss": 3.1548, "loss/crossentropy": 2.990753173828125, "loss/hidden": 1.8203125, "loss/logits": 1.1692469120025635, "loss/reg": 0.016526464372873306, "step": 1228 }, { "epoch": 0.153625, "grad_norm": 3.789302110671997, "grad_norm_var": 13.709283357450609, "learning_rate": 0.0001, "loss": 1.0345, "loss/crossentropy": 2.7094292640686035, "loss/hidden": 0.7265625, "loss/logits": 0.1427900195121765, "loss/reg": 0.016517426818609238, "step": 1229 }, { "epoch": 0.15375, "grad_norm": 2.838264226913452, "grad_norm_var": 13.761738651197222, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.543126106262207, "loss/hidden": 0.7890625, "loss/logits": 0.1551959365606308, "loss/reg": 0.01650911010801792, "step": 1230 }, { "epoch": 0.153875, "grad_norm": 3.218651533126831, "grad_norm_var": 13.628173707948106, "learning_rate": 0.0001, "loss": 1.0906, "loss/crossentropy": 2.112255096435547, "loss/hidden": 0.78515625, "loss/logits": 0.1404722034931183, "loss/reg": 0.016500860452651978, "step": 1231 }, { "epoch": 0.154, "grad_norm": 4.796919345855713, "grad_norm_var": 13.40893956577282, "learning_rate": 0.0001, "loss": 1.2598, "loss/crossentropy": 2.6373000144958496, "loss/hidden": 0.93359375, "loss/logits": 0.16131797432899475, "loss/reg": 0.01649186760187149, "step": 1232 }, { "epoch": 0.154125, "grad_norm": 3.640801191329956, "grad_norm_var": 13.24713470324538, "learning_rate": 0.0001, "loss": 1.4181, "loss/crossentropy": 2.4901435375213623, "loss/hidden": 1.03125, "loss/logits": 0.2219938188791275, "loss/reg": 0.01648273505270481, "step": 1233 }, { "epoch": 0.15425, "grad_norm": 3.2018325328826904, "grad_norm_var": 13.203757643215535, "learning_rate": 0.0001, "loss": 1.1419, "loss/crossentropy": 2.504349708557129, "loss/hidden": 0.8046875, "loss/logits": 0.1725234091281891, "loss/reg": 0.016473697498440742, "step": 1234 }, { "epoch": 0.154375, "grad_norm": 2.982257127761841, "grad_norm_var": 13.274105522819255, "learning_rate": 0.0001, "loss": 1.0759, "loss/crossentropy": 2.3660614490509033, "loss/hidden": 0.76953125, "loss/logits": 0.14176063239574432, "loss/reg": 0.01646505668759346, "step": 1235 }, { "epoch": 0.1545, "grad_norm": 4.535284519195557, "grad_norm_var": 13.004824447407662, "learning_rate": 0.0001, "loss": 1.4385, "loss/crossentropy": 2.8622822761535645, "loss/hidden": 1.0, "loss/logits": 0.2738988697528839, "loss/reg": 0.016456691548228264, "step": 1236 }, { "epoch": 0.154625, "grad_norm": 3.0818183422088623, "grad_norm_var": 12.825136343225884, "learning_rate": 0.0001, "loss": 1.0988, "loss/crossentropy": 2.449317693710327, "loss/hidden": 0.79296875, "loss/logits": 0.14135484397411346, "loss/reg": 0.01644827052950859, "step": 1237 }, { "epoch": 0.15475, "grad_norm": 4.102293491363525, "grad_norm_var": 12.663514204467905, "learning_rate": 0.0001, "loss": 1.1842, "loss/crossentropy": 2.7557554244995117, "loss/hidden": 0.8515625, "loss/logits": 0.16828957200050354, "loss/reg": 0.01643945835530758, "step": 1238 }, { "epoch": 0.154875, "grad_norm": 4.042688369750977, "grad_norm_var": 12.664341312944845, "learning_rate": 0.0001, "loss": 1.2182, "loss/crossentropy": 2.4376473426818848, "loss/hidden": 0.8828125, "loss/logits": 0.17108118534088135, "loss/reg": 0.016430867835879326, "step": 1239 }, { "epoch": 0.155, "grad_norm": 3.330106735229492, "grad_norm_var": 12.589868576516837, "learning_rate": 0.0001, "loss": 1.1139, "loss/crossentropy": 2.705000638961792, "loss/hidden": 0.80078125, "loss/logits": 0.14893847703933716, "loss/reg": 0.01642204262316227, "step": 1240 }, { "epoch": 0.155125, "grad_norm": 2.7572247982025146, "grad_norm_var": 12.579048710159194, "learning_rate": 0.0001, "loss": 1.0319, "loss/crossentropy": 2.726616859436035, "loss/hidden": 0.73828125, "loss/logits": 0.12947078049182892, "loss/reg": 0.016413651406764984, "step": 1241 }, { "epoch": 0.15525, "grad_norm": 2.501053810119629, "grad_norm_var": 12.740389120966254, "learning_rate": 0.0001, "loss": 1.0246, "loss/crossentropy": 2.1374268531799316, "loss/hidden": 0.7421875, "loss/logits": 0.11834150552749634, "loss/reg": 0.016405310481786728, "step": 1242 }, { "epoch": 0.155375, "grad_norm": 3.1334445476531982, "grad_norm_var": 11.811754750464528, "learning_rate": 0.0001, "loss": 1.1323, "loss/crossentropy": 2.204963445663452, "loss/hidden": 0.828125, "loss/logits": 0.14021529257297516, "loss/reg": 0.01639643684029579, "step": 1243 }, { "epoch": 0.1555, "grad_norm": 2.888942003250122, "grad_norm_var": 0.43771643055907344, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.6868066787719727, "loss/hidden": 0.75, "loss/logits": 0.12971553206443787, "loss/reg": 0.016387417912483215, "step": 1244 }, { "epoch": 0.155625, "grad_norm": 3.695841073989868, "grad_norm_var": 0.43375446821376706, "learning_rate": 0.0001, "loss": 1.1949, "loss/crossentropy": 2.679568290710449, "loss/hidden": 0.8671875, "loss/logits": 0.16394542157649994, "loss/reg": 0.01637819968163967, "step": 1245 }, { "epoch": 0.15575, "grad_norm": 3.01936411857605, "grad_norm_var": 0.4217159331567084, "learning_rate": 0.0001, "loss": 1.2859, "loss/crossentropy": 2.3360393047332764, "loss/hidden": 0.921875, "loss/logits": 0.20038416981697083, "loss/reg": 0.01636892557144165, "step": 1246 }, { "epoch": 0.155875, "grad_norm": 2.5124776363372803, "grad_norm_var": 0.47306891797669076, "learning_rate": 0.0001, "loss": 0.9599, "loss/crossentropy": 2.558067560195923, "loss/hidden": 0.6796875, "loss/logits": 0.11665983498096466, "loss/reg": 0.016359377652406693, "step": 1247 }, { "epoch": 0.156, "grad_norm": 3.540679693222046, "grad_norm_var": 0.3358607220080972, "learning_rate": 0.0001, "loss": 1.1319, "loss/crossentropy": 2.760929584503174, "loss/hidden": 0.8203125, "loss/logits": 0.1480470895767212, "loss/reg": 0.01635100692510605, "step": 1248 }, { "epoch": 0.156125, "grad_norm": 3.4042372703552246, "grad_norm_var": 0.3289363389964431, "learning_rate": 0.0001, "loss": 1.1718, "loss/crossentropy": 2.771622896194458, "loss/hidden": 0.859375, "loss/logits": 0.14904874563217163, "loss/reg": 0.01634254865348339, "step": 1249 }, { "epoch": 0.15625, "grad_norm": 3.164729118347168, "grad_norm_var": 0.32948624287587125, "learning_rate": 0.0001, "loss": 1.2872, "loss/crossentropy": 2.3246641159057617, "loss/hidden": 0.93359375, "loss/logits": 0.19026914238929749, "loss/reg": 0.016334179788827896, "step": 1250 }, { "epoch": 0.156375, "grad_norm": 3.409289598464966, "grad_norm_var": 0.32317475604933343, "learning_rate": 0.0001, "loss": 1.0262, "loss/crossentropy": 2.498500347137451, "loss/hidden": 0.73828125, "loss/logits": 0.12468535453081131, "loss/reg": 0.01632508635520935, "step": 1251 }, { "epoch": 0.1565, "grad_norm": 7.274641990661621, "grad_norm_var": 1.2360715279843908, "learning_rate": 0.0001, "loss": 1.5887, "loss/crossentropy": 2.7439510822296143, "loss/hidden": 1.2734375, "loss/logits": 0.15212371945381165, "loss/reg": 0.016316639259457588, "step": 1252 }, { "epoch": 0.156625, "grad_norm": 3.9700396060943604, "grad_norm_var": 1.2368999449904527, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.9382898807525635, "loss/hidden": 0.99609375, "loss/logits": 0.15806305408477783, "loss/reg": 0.016308104619383812, "step": 1253 }, { "epoch": 0.15675, "grad_norm": 3.035047769546509, "grad_norm_var": 1.2290263478015266, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.486111879348755, "loss/hidden": 0.74609375, "loss/logits": 0.15766200423240662, "loss/reg": 0.016299735754728317, "step": 1254 }, { "epoch": 0.156875, "grad_norm": 4.703797340393066, "grad_norm_var": 1.30594374893626, "learning_rate": 0.0001, "loss": 1.1914, "loss/crossentropy": 2.6573047637939453, "loss/hidden": 0.87890625, "loss/logits": 0.14956605434417725, "loss/reg": 0.0162909384816885, "step": 1255 }, { "epoch": 0.157, "grad_norm": 2.9516775608062744, "grad_norm_var": 1.3245417395020618, "learning_rate": 0.0001, "loss": 1.1959, "loss/crossentropy": 2.3165104389190674, "loss/hidden": 0.87890625, "loss/logits": 0.1542087197303772, "loss/reg": 0.01628235913813114, "step": 1256 }, { "epoch": 0.157125, "grad_norm": 2.8121533393859863, "grad_norm_var": 1.3193075406315065, "learning_rate": 0.0001, "loss": 1.2046, "loss/crossentropy": 2.269421100616455, "loss/hidden": 0.8828125, "loss/logits": 0.15904302895069122, "loss/reg": 0.01627359353005886, "step": 1257 }, { "epoch": 0.15725, "grad_norm": 2.7715342044830322, "grad_norm_var": 1.287814713649868, "learning_rate": 0.0001, "loss": 1.0294, "loss/crossentropy": 2.6235570907592773, "loss/hidden": 0.71484375, "loss/logits": 0.15194407105445862, "loss/reg": 0.01626538671553135, "step": 1258 }, { "epoch": 0.157375, "grad_norm": 1.9729806184768677, "grad_norm_var": 1.4314826970209784, "learning_rate": 0.0001, "loss": 0.9952, "loss/crossentropy": 2.2795357704162598, "loss/hidden": 0.71875, "loss/logits": 0.11390725523233414, "loss/reg": 0.016256939619779587, "step": 1259 }, { "epoch": 0.1575, "grad_norm": 4.472729206085205, "grad_norm_var": 1.4707347924489687, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.3265631198883057, "loss/hidden": 1.1328125, "loss/logits": 0.22519487142562866, "loss/reg": 0.016248464584350586, "step": 1260 }, { "epoch": 0.157625, "grad_norm": 2.59954571723938, "grad_norm_var": 1.5237222016228462, "learning_rate": 0.0001, "loss": 0.9663, "loss/crossentropy": 2.6342809200286865, "loss/hidden": 0.671875, "loss/logits": 0.13200603425502777, "loss/reg": 0.01624012365937233, "step": 1261 }, { "epoch": 0.15775, "grad_norm": 3.7313549518585205, "grad_norm_var": 1.5120623570669227, "learning_rate": 0.0001, "loss": 1.1542, "loss/crossentropy": 2.5860178470611572, "loss/hidden": 0.83984375, "loss/logits": 0.15201479196548462, "loss/reg": 0.016231315210461617, "step": 1262 }, { "epoch": 0.157875, "grad_norm": 12.554633140563965, "grad_norm_var": 6.465262907364731, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.696977376937866, "loss/hidden": 0.796875, "loss/logits": 0.13646942377090454, "loss/reg": 0.016222581267356873, "step": 1263 }, { "epoch": 0.158, "grad_norm": 3.071765422821045, "grad_norm_var": 6.516980451118831, "learning_rate": 0.0001, "loss": 0.9936, "loss/crossentropy": 2.407457113265991, "loss/hidden": 0.71484375, "loss/logits": 0.1166088730096817, "loss/reg": 0.01621370017528534, "step": 1264 }, { "epoch": 0.158125, "grad_norm": 3.9861507415771484, "grad_norm_var": 6.482705701616392, "learning_rate": 0.0001, "loss": 1.2125, "loss/crossentropy": 2.4362969398498535, "loss/hidden": 0.89453125, "loss/logits": 0.15587326884269714, "loss/reg": 0.01620490849018097, "step": 1265 }, { "epoch": 0.15825, "grad_norm": 4.503188610076904, "grad_norm_var": 6.417924727941712, "learning_rate": 0.0001, "loss": 1.1854, "loss/crossentropy": 2.550220012664795, "loss/hidden": 0.875, "loss/logits": 0.14846490323543549, "loss/reg": 0.016196196898818016, "step": 1266 }, { "epoch": 0.158375, "grad_norm": 3.923295021057129, "grad_norm_var": 6.3775887710365, "learning_rate": 0.0001, "loss": 1.5051, "loss/crossentropy": 2.2857823371887207, "loss/hidden": 1.109375, "loss/logits": 0.23387819528579712, "loss/reg": 0.01618727669119835, "step": 1267 }, { "epoch": 0.1585, "grad_norm": 2.639031171798706, "grad_norm_var": 5.8640922918796825, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.541623115539551, "loss/hidden": 0.7421875, "loss/logits": 0.13770245015621185, "loss/reg": 0.01617906242609024, "step": 1268 }, { "epoch": 0.158625, "grad_norm": 3.2652931213378906, "grad_norm_var": 5.896181098711848, "learning_rate": 0.0001, "loss": 1.0331, "loss/crossentropy": 2.8000566959381104, "loss/hidden": 0.74609375, "loss/logits": 0.12530115246772766, "loss/reg": 0.01617092825472355, "step": 1269 }, { "epoch": 0.15875, "grad_norm": 7.7591872215271, "grad_norm_var": 6.722812290225987, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.975308895111084, "loss/hidden": 0.9453125, "loss/logits": 0.21848054230213165, "loss/reg": 0.016162917017936707, "step": 1270 }, { "epoch": 0.158875, "grad_norm": 20.061229705810547, "grad_norm_var": 22.42875378589538, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.5683257579803467, "loss/hidden": 1.1796875, "loss/logits": 0.21141059696674347, "loss/reg": 0.016154874116182327, "step": 1271 }, { "epoch": 0.159, "grad_norm": 3.1273183822631836, "grad_norm_var": 22.37821079380713, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.2700870037078857, "loss/hidden": 0.80859375, "loss/logits": 0.13093486428260803, "loss/reg": 0.01614651456475258, "step": 1272 }, { "epoch": 0.159125, "grad_norm": 5.0759968757629395, "grad_norm_var": 21.976791517132078, "learning_rate": 0.0001, "loss": 1.342, "loss/crossentropy": 2.440864324569702, "loss/hidden": 1.0234375, "loss/logits": 0.157205730676651, "loss/reg": 0.016138622537255287, "step": 1273 }, { "epoch": 0.15925, "grad_norm": 3.844823122024536, "grad_norm_var": 21.680554653297506, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.461087942123413, "loss/hidden": 0.8828125, "loss/logits": 0.11964771896600723, "loss/reg": 0.016129910945892334, "step": 1274 }, { "epoch": 0.159375, "grad_norm": 3.4437410831451416, "grad_norm_var": 21.141396790594833, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.2671289443969727, "loss/hidden": 0.82421875, "loss/logits": 0.12177357822656631, "loss/reg": 0.01612204499542713, "step": 1275 }, { "epoch": 0.1595, "grad_norm": 2.6474483013153076, "grad_norm_var": 21.600534383242742, "learning_rate": 0.0001, "loss": 1.0477, "loss/crossentropy": 2.425645112991333, "loss/hidden": 0.75390625, "loss/logits": 0.1326972246170044, "loss/reg": 0.016113854944705963, "step": 1276 }, { "epoch": 0.159625, "grad_norm": 3.2777504920959473, "grad_norm_var": 21.37698263032081, "learning_rate": 0.0001, "loss": 1.1394, "loss/crossentropy": 2.5559988021850586, "loss/hidden": 0.8125, "loss/logits": 0.16583070158958435, "loss/reg": 0.016105838119983673, "step": 1277 }, { "epoch": 0.15975, "grad_norm": 2.806304693222046, "grad_norm_var": 21.640224221543477, "learning_rate": 0.0001, "loss": 1.2464, "loss/crossentropy": 2.5195603370666504, "loss/hidden": 0.90625, "loss/logits": 0.17914444208145142, "loss/reg": 0.016097450628876686, "step": 1278 }, { "epoch": 0.159875, "grad_norm": 2.830876350402832, "grad_norm_var": 18.240248060375865, "learning_rate": 0.0001, "loss": 1.0662, "loss/crossentropy": 2.4745335578918457, "loss/hidden": 0.77734375, "loss/logits": 0.12796689569950104, "loss/reg": 0.016089415177702904, "step": 1279 }, { "epoch": 0.16, "grad_norm": 3.508394479751587, "grad_norm_var": 18.153502836014415, "learning_rate": 0.0001, "loss": 1.344, "loss/crossentropy": 2.2748706340789795, "loss/hidden": 0.9921875, "loss/logits": 0.19095824658870697, "loss/reg": 0.016081009060144424, "step": 1280 }, { "epoch": 0.160125, "grad_norm": 2.568115234375, "grad_norm_var": 18.431873650050928, "learning_rate": 0.0001, "loss": 1.1068, "loss/crossentropy": 2.393493413925171, "loss/hidden": 0.8046875, "loss/logits": 0.14136919379234314, "loss/reg": 0.016072595492005348, "step": 1281 }, { "epoch": 0.16025, "grad_norm": 2.587556838989258, "grad_norm_var": 18.712804471683103, "learning_rate": 0.0001, "loss": 1.0643, "loss/crossentropy": 2.619617462158203, "loss/hidden": 0.76171875, "loss/logits": 0.14193680882453918, "loss/reg": 0.016064899042248726, "step": 1282 }, { "epoch": 0.160375, "grad_norm": 4.345118522644043, "grad_norm_var": 18.686686687831273, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4879989624023438, "loss/hidden": 0.77734375, "loss/logits": 0.14906466007232666, "loss/reg": 0.016057275235652924, "step": 1283 }, { "epoch": 0.1605, "grad_norm": 3.326270818710327, "grad_norm_var": 18.535440191895614, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.540086507797241, "loss/hidden": 0.8828125, "loss/logits": 0.16315226256847382, "loss/reg": 0.01604924537241459, "step": 1284 }, { "epoch": 0.160625, "grad_norm": 12.10493278503418, "grad_norm_var": 21.781544615658156, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.0798144340515137, "loss/hidden": 1.140625, "loss/logits": 0.19571265578269958, "loss/reg": 0.016041060909628868, "step": 1285 }, { "epoch": 0.16075, "grad_norm": 3.7908778190612793, "grad_norm_var": 21.41548096635543, "learning_rate": 0.0001, "loss": 1.2948, "loss/crossentropy": 2.554609775543213, "loss/hidden": 0.95703125, "loss/logits": 0.17741422355175018, "loss/reg": 0.01603337749838829, "step": 1286 }, { "epoch": 0.160875, "grad_norm": 2.7631125450134277, "grad_norm_var": 5.285413244775635, "learning_rate": 0.0001, "loss": 1.1375, "loss/crossentropy": 2.4389936923980713, "loss/hidden": 0.8046875, "loss/logits": 0.17257535457611084, "loss/reg": 0.016025440767407417, "step": 1287 }, { "epoch": 0.161, "grad_norm": 3.0926973819732666, "grad_norm_var": 5.288953588764335, "learning_rate": 0.0001, "loss": 1.1024, "loss/crossentropy": 2.507418394088745, "loss/hidden": 0.8203125, "loss/logits": 0.12186664342880249, "loss/reg": 0.016017207875847816, "step": 1288 }, { "epoch": 0.161125, "grad_norm": 3.1981067657470703, "grad_norm_var": 5.208865380747667, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.4531667232513428, "loss/hidden": 0.79296875, "loss/logits": 0.13326312601566315, "loss/reg": 0.01600871980190277, "step": 1289 }, { "epoch": 0.16125, "grad_norm": 2.445035457611084, "grad_norm_var": 5.315218503488533, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.083653450012207, "loss/hidden": 0.77734375, "loss/logits": 0.1353735327720642, "loss/reg": 0.016000347211956978, "step": 1290 }, { "epoch": 0.161375, "grad_norm": 3.601950168609619, "grad_norm_var": 5.3119885145812225, "learning_rate": 0.0001, "loss": 1.3367, "loss/crossentropy": 2.5701797008514404, "loss/hidden": 0.9765625, "loss/logits": 0.20016901195049286, "loss/reg": 0.015991859138011932, "step": 1291 }, { "epoch": 0.1615, "grad_norm": 2.633711099624634, "grad_norm_var": 5.313893223941083, "learning_rate": 0.0001, "loss": 0.9594, "loss/crossentropy": 2.5569510459899902, "loss/hidden": 0.69140625, "loss/logits": 0.10816207528114319, "loss/reg": 0.015983374789357185, "step": 1292 }, { "epoch": 0.161625, "grad_norm": 3.0227303504943848, "grad_norm_var": 5.331637216482173, "learning_rate": 0.0001, "loss": 1.0746, "loss/crossentropy": 2.440446138381958, "loss/hidden": 0.7734375, "loss/logits": 0.1414303183555603, "loss/reg": 0.01597476750612259, "step": 1293 }, { "epoch": 0.16175, "grad_norm": 3.4858055114746094, "grad_norm_var": 5.282777369926362, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.63417649269104, "loss/hidden": 0.90234375, "loss/logits": 0.15903231501579285, "loss/reg": 0.015966549515724182, "step": 1294 }, { "epoch": 0.161875, "grad_norm": 2.918198823928833, "grad_norm_var": 5.273058122497643, "learning_rate": 0.0001, "loss": 1.097, "loss/crossentropy": 2.473909854888916, "loss/hidden": 0.796875, "loss/logits": 0.14054538309574127, "loss/reg": 0.015958301723003387, "step": 1295 }, { "epoch": 0.162, "grad_norm": 2.7234513759613037, "grad_norm_var": 5.332879789031237, "learning_rate": 0.0001, "loss": 1.279, "loss/crossentropy": 2.241210460662842, "loss/hidden": 0.9375, "loss/logits": 0.182043194770813, "loss/reg": 0.015949726104736328, "step": 1296 }, { "epoch": 0.162125, "grad_norm": 3.3515610694885254, "grad_norm_var": 5.256872590146133, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.6752541065216064, "loss/hidden": 0.83203125, "loss/logits": 0.15490111708641052, "loss/reg": 0.015941519290208817, "step": 1297 }, { "epoch": 0.16225, "grad_norm": 3.6638059616088867, "grad_norm_var": 5.167917555355179, "learning_rate": 0.0001, "loss": 1.1893, "loss/crossentropy": 2.2823452949523926, "loss/hidden": 0.86328125, "loss/logits": 0.16667145490646362, "loss/reg": 0.015933820977807045, "step": 1298 }, { "epoch": 0.162375, "grad_norm": 4.22912073135376, "grad_norm_var": 5.1600059777442855, "learning_rate": 0.0001, "loss": 1.1752, "loss/crossentropy": 2.6061580181121826, "loss/hidden": 0.82421875, "loss/logits": 0.1916988492012024, "loss/reg": 0.01592625491321087, "step": 1299 }, { "epoch": 0.1625, "grad_norm": 6.90070915222168, "grad_norm_var": 5.7461320078663105, "learning_rate": 0.0001, "loss": 1.5428, "loss/crossentropy": 1.8802989721298218, "loss/hidden": 1.1015625, "loss/logits": 0.2820611596107483, "loss/reg": 0.015918578952550888, "step": 1300 }, { "epoch": 0.162625, "grad_norm": 3.0886518955230713, "grad_norm_var": 1.077876623419837, "learning_rate": 0.0001, "loss": 1.2216, "loss/crossentropy": 2.5785911083221436, "loss/hidden": 0.8984375, "loss/logits": 0.16405634582042694, "loss/reg": 0.01591094397008419, "step": 1301 }, { "epoch": 0.16275, "grad_norm": 3.582451581954956, "grad_norm_var": 1.0706141462880387, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.2569782733917236, "loss/hidden": 0.92578125, "loss/logits": 0.18291030824184418, "loss/reg": 0.01590333878993988, "step": 1302 }, { "epoch": 0.162875, "grad_norm": 3.291792392730713, "grad_norm_var": 1.0418618914724893, "learning_rate": 0.0001, "loss": 1.1212, "loss/crossentropy": 2.5871994495391846, "loss/hidden": 0.80859375, "loss/logits": 0.15368321537971497, "loss/reg": 0.01589547097682953, "step": 1303 }, { "epoch": 0.163, "grad_norm": 12.274004936218262, "grad_norm_var": 5.870708246927601, "learning_rate": 0.0001, "loss": 1.5367, "loss/crossentropy": 2.2327094078063965, "loss/hidden": 1.1171875, "loss/logits": 0.2606501579284668, "loss/reg": 0.015888074412941933, "step": 1304 }, { "epoch": 0.163125, "grad_norm": 3.356614112854004, "grad_norm_var": 5.854788067612952, "learning_rate": 0.0001, "loss": 1.1269, "loss/crossentropy": 2.7387585639953613, "loss/hidden": 0.80078125, "loss/logits": 0.16729386150836945, "loss/reg": 0.015879716724157333, "step": 1305 }, { "epoch": 0.16325, "grad_norm": 2.274139404296875, "grad_norm_var": 5.892856228313144, "learning_rate": 0.0001, "loss": 1.013, "loss/crossentropy": 2.458984136581421, "loss/hidden": 0.73046875, "loss/logits": 0.12379375100135803, "loss/reg": 0.015871398150920868, "step": 1306 }, { "epoch": 0.163375, "grad_norm": 3.3769636154174805, "grad_norm_var": 5.908708209046858, "learning_rate": 0.0001, "loss": 1.096, "loss/crossentropy": 2.4599368572235107, "loss/hidden": 0.80078125, "loss/logits": 0.1365453004837036, "loss/reg": 0.015863511711359024, "step": 1307 }, { "epoch": 0.1635, "grad_norm": 5.5954132080078125, "grad_norm_var": 5.91311204762225, "learning_rate": 0.0001, "loss": 1.4203, "loss/crossentropy": 3.034935474395752, "loss/hidden": 1.03125, "loss/logits": 0.23048382997512817, "loss/reg": 0.015855222940444946, "step": 1308 }, { "epoch": 0.163625, "grad_norm": 2.820237636566162, "grad_norm_var": 5.947350905923982, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.4714202880859375, "loss/hidden": 0.84765625, "loss/logits": 0.16117262840270996, "loss/reg": 0.01584673300385475, "step": 1309 }, { "epoch": 0.16375, "grad_norm": 2.900300979614258, "grad_norm_var": 6.0232289618053185, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.634578227996826, "loss/hidden": 0.76171875, "loss/logits": 0.14015381038188934, "loss/reg": 0.015838829800486565, "step": 1310 }, { "epoch": 0.163875, "grad_norm": 3.875929594039917, "grad_norm_var": 5.923678794810188, "learning_rate": 0.0001, "loss": 1.1148, "loss/crossentropy": 2.8410253524780273, "loss/hidden": 0.8046875, "loss/logits": 0.15175345540046692, "loss/reg": 0.01583097316324711, "step": 1311 }, { "epoch": 0.164, "grad_norm": 3.126546621322632, "grad_norm_var": 5.854122320902001, "learning_rate": 0.0001, "loss": 1.1639, "loss/crossentropy": 2.5248920917510986, "loss/hidden": 0.85546875, "loss/logits": 0.15023675560951233, "loss/reg": 0.015823420137166977, "step": 1312 }, { "epoch": 0.164125, "grad_norm": 3.225511312484741, "grad_norm_var": 5.869908623309258, "learning_rate": 0.0001, "loss": 1.2341, "loss/crossentropy": 2.382220506668091, "loss/hidden": 0.9140625, "loss/logits": 0.16187481582164764, "loss/reg": 0.01581561006605625, "step": 1313 }, { "epoch": 0.16425, "grad_norm": 5.516601085662842, "grad_norm_var": 5.946099660897706, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.2973639965057373, "loss/hidden": 1.1328125, "loss/logits": 0.2187841385602951, "loss/reg": 0.015808099880814552, "step": 1314 }, { "epoch": 0.164375, "grad_norm": 3.7441492080688477, "grad_norm_var": 5.967949014164251, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.7098612785339355, "loss/hidden": 0.78125, "loss/logits": 0.15003027021884918, "loss/reg": 0.015800559893250465, "step": 1315 }, { "epoch": 0.1645, "grad_norm": 3.2912096977233887, "grad_norm_var": 5.5351073509179205, "learning_rate": 0.0001, "loss": 1.1089, "loss/crossentropy": 2.5598583221435547, "loss/hidden": 0.80078125, "loss/logits": 0.15022039413452148, "loss/reg": 0.015792248770594597, "step": 1316 }, { "epoch": 0.164625, "grad_norm": 2.697364091873169, "grad_norm_var": 5.596594138613236, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.364313840866089, "loss/hidden": 0.81640625, "loss/logits": 0.1559445559978485, "loss/reg": 0.015783967450261116, "step": 1317 }, { "epoch": 0.16475, "grad_norm": 3.0158493518829346, "grad_norm_var": 5.652685497198369, "learning_rate": 0.0001, "loss": 1.1872, "loss/crossentropy": 2.7219552993774414, "loss/hidden": 0.87109375, "loss/logits": 0.15835720300674438, "loss/reg": 0.015775663778185844, "step": 1318 }, { "epoch": 0.164875, "grad_norm": 307.651611328125, "grad_norm_var": 5765.62343534609, "learning_rate": 0.0001, "loss": 3.7364, "loss/crossentropy": 3.080000400543213, "loss/hidden": 3.09375, "loss/logits": 0.48493900895118713, "loss/reg": 0.015767192468047142, "step": 1319 }, { "epoch": 0.165, "grad_norm": 2.4901864528656006, "grad_norm_var": 5785.658820843435, "learning_rate": 0.0001, "loss": 1.1422, "loss/crossentropy": 2.167736053466797, "loss/hidden": 0.8359375, "loss/logits": 0.1487167328596115, "loss/reg": 0.015759125351905823, "step": 1320 }, { "epoch": 0.165125, "grad_norm": 2.829414129257202, "grad_norm_var": 5787.017269350288, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.35046648979187, "loss/hidden": 0.83984375, "loss/logits": 0.17911039292812347, "loss/reg": 0.01575067639350891, "step": 1321 }, { "epoch": 0.16525, "grad_norm": 3.047290325164795, "grad_norm_var": 5784.979716656826, "learning_rate": 0.0001, "loss": 1.1083, "loss/crossentropy": 2.5440683364868164, "loss/hidden": 0.79296875, "loss/logits": 0.15790146589279175, "loss/reg": 0.015742652118206024, "step": 1322 }, { "epoch": 0.165375, "grad_norm": 2.9854726791381836, "grad_norm_var": 5785.984900115946, "learning_rate": 0.0001, "loss": 1.0518, "loss/crossentropy": 2.0691964626312256, "loss/hidden": 0.7734375, "loss/logits": 0.12100932002067566, "loss/reg": 0.015734149143099785, "step": 1323 }, { "epoch": 0.1655, "grad_norm": 2.8492424488067627, "grad_norm_var": 5792.618796374745, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.045762538909912, "loss/hidden": 1.0390625, "loss/logits": 0.12468324601650238, "loss/reg": 0.01572607271373272, "step": 1324 }, { "epoch": 0.165625, "grad_norm": 3.2983663082122803, "grad_norm_var": 5791.394160827106, "learning_rate": 0.0001, "loss": 1.1712, "loss/crossentropy": 2.6199800968170166, "loss/hidden": 0.87109375, "loss/logits": 0.14290478825569153, "loss/reg": 0.01571841724216938, "step": 1325 }, { "epoch": 0.16575, "grad_norm": 3.7790138721466064, "grad_norm_var": 5789.171384194312, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.5326523780822754, "loss/hidden": 0.984375, "loss/logits": 0.23037347197532654, "loss/reg": 0.015710193663835526, "step": 1326 }, { "epoch": 0.165875, "grad_norm": 3.0544376373291016, "grad_norm_var": 5791.235862450414, "learning_rate": 0.0001, "loss": 1.3355, "loss/crossentropy": 2.679947853088379, "loss/hidden": 0.94140625, "loss/logits": 0.2371068298816681, "loss/reg": 0.015702618286013603, "step": 1327 }, { "epoch": 0.166, "grad_norm": 3.107823133468628, "grad_norm_var": 5791.283719365005, "learning_rate": 0.0001, "loss": 1.184, "loss/crossentropy": 2.564558506011963, "loss/hidden": 0.8515625, "loss/logits": 0.17545363306999207, "loss/reg": 0.01569523848593235, "step": 1328 }, { "epoch": 0.166125, "grad_norm": 3.589580774307251, "grad_norm_var": 5790.3667350596925, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.2362802028656006, "loss/hidden": 1.03125, "loss/logits": 0.2396482676267624, "loss/reg": 0.01568804495036602, "step": 1329 }, { "epoch": 0.16625, "grad_norm": 3.2111618518829346, "grad_norm_var": 5795.860842463791, "learning_rate": 0.0001, "loss": 1.0855, "loss/crossentropy": 2.5137100219726562, "loss/hidden": 0.79296875, "loss/logits": 0.1356830894947052, "loss/reg": 0.01568055897951126, "step": 1330 }, { "epoch": 0.166375, "grad_norm": 2.7476325035095215, "grad_norm_var": 5798.370483928043, "learning_rate": 0.0001, "loss": 1.2698, "loss/crossentropy": 2.456242322921753, "loss/hidden": 0.9453125, "loss/logits": 0.16777516901493073, "loss/reg": 0.01567363552749157, "step": 1331 }, { "epoch": 0.1665, "grad_norm": 2.6416473388671875, "grad_norm_var": 5800.026099397797, "learning_rate": 0.0001, "loss": 1.1194, "loss/crossentropy": 2.7138493061065674, "loss/hidden": 0.8125, "loss/logits": 0.15024441480636597, "loss/reg": 0.015665553510189056, "step": 1332 }, { "epoch": 0.166625, "grad_norm": 3.000983476638794, "grad_norm_var": 5799.247920072332, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.4222257137298584, "loss/hidden": 0.9609375, "loss/logits": 0.1821441352367401, "loss/reg": 0.015658436343073845, "step": 1333 }, { "epoch": 0.16675, "grad_norm": 4.0280961990356445, "grad_norm_var": 5796.738777158668, "learning_rate": 0.0001, "loss": 1.4751, "loss/crossentropy": 2.4873077869415283, "loss/hidden": 1.1015625, "loss/logits": 0.2170398086309433, "loss/reg": 0.015650250017642975, "step": 1334 }, { "epoch": 0.166875, "grad_norm": 3.3132269382476807, "grad_norm_var": 0.16804106807275332, "learning_rate": 0.0001, "loss": 1.0521, "loss/crossentropy": 2.3228578567504883, "loss/hidden": 0.7578125, "loss/logits": 0.13784348964691162, "loss/reg": 0.015642434358596802, "step": 1335 }, { "epoch": 0.167, "grad_norm": 2.9326908588409424, "grad_norm_var": 0.14292226941004174, "learning_rate": 0.0001, "loss": 1.0604, "loss/crossentropy": 2.6680498123168945, "loss/hidden": 0.7578125, "loss/logits": 0.1462496519088745, "loss/reg": 0.015634268522262573, "step": 1336 }, { "epoch": 0.167125, "grad_norm": 3.7127490043640137, "grad_norm_var": 0.1538134730442567, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.439518690109253, "loss/hidden": 0.8046875, "loss/logits": 0.14183643460273743, "loss/reg": 0.015626052394509315, "step": 1337 }, { "epoch": 0.16725, "grad_norm": 3.0613186359405518, "grad_norm_var": 0.15352851622272493, "learning_rate": 0.0001, "loss": 1.1624, "loss/crossentropy": 2.4576408863067627, "loss/hidden": 0.84375, "loss/logits": 0.1624375730752945, "loss/reg": 0.015617795288562775, "step": 1338 }, { "epoch": 0.167375, "grad_norm": 2.435100793838501, "grad_norm_var": 0.18872328446350153, "learning_rate": 0.0001, "loss": 1.1516, "loss/crossentropy": 2.6004011631011963, "loss/hidden": 0.84375, "loss/logits": 0.1517818719148636, "loss/reg": 0.015609413385391235, "step": 1339 }, { "epoch": 0.1675, "grad_norm": 2.6485419273376465, "grad_norm_var": 0.19989636027441596, "learning_rate": 0.0001, "loss": 1.0442, "loss/crossentropy": 2.560258626937866, "loss/hidden": 0.7578125, "loss/logits": 0.13042010366916656, "loss/reg": 0.015600843355059624, "step": 1340 }, { "epoch": 0.167625, "grad_norm": 2.89918851852417, "grad_norm_var": 0.20249881233273162, "learning_rate": 0.0001, "loss": 1.0632, "loss/crossentropy": 2.5941762924194336, "loss/hidden": 0.77734375, "loss/logits": 0.12995409965515137, "loss/reg": 0.015592261217534542, "step": 1341 }, { "epoch": 0.16775, "grad_norm": 2.8898985385894775, "grad_norm_var": 0.175583338922907, "learning_rate": 0.0001, "loss": 1.11, "loss/crossentropy": 2.4292283058166504, "loss/hidden": 0.8125, "loss/logits": 0.14161787927150726, "loss/reg": 0.015583738684654236, "step": 1342 }, { "epoch": 0.167875, "grad_norm": 2.9340474605560303, "grad_norm_var": 0.1768935876133876, "learning_rate": 0.0001, "loss": 1.0755, "loss/crossentropy": 2.2534961700439453, "loss/hidden": 0.77734375, "loss/logits": 0.14238594472408295, "loss/reg": 0.015575552359223366, "step": 1343 }, { "epoch": 0.168, "grad_norm": 2.4977829456329346, "grad_norm_var": 0.19724767622532605, "learning_rate": 0.0001, "loss": 1.1106, "loss/crossentropy": 2.5671305656433105, "loss/hidden": 0.796875, "loss/logits": 0.15805330872535706, "loss/reg": 0.015566708520054817, "step": 1344 }, { "epoch": 0.168125, "grad_norm": 3.79781174659729, "grad_norm_var": 0.2153835126984556, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.341862440109253, "loss/hidden": 0.90234375, "loss/logits": 0.15165603160858154, "loss/reg": 0.015558542683720589, "step": 1345 }, { "epoch": 0.16825, "grad_norm": 4.044692039489746, "grad_norm_var": 0.2770521554047564, "learning_rate": 0.0001, "loss": 1.2426, "loss/crossentropy": 2.053048849105835, "loss/hidden": 0.96484375, "loss/logits": 0.1222817674279213, "loss/reg": 0.015550050884485245, "step": 1346 }, { "epoch": 0.168375, "grad_norm": 2.795452117919922, "grad_norm_var": 0.27495421257927977, "learning_rate": 0.0001, "loss": 1.1152, "loss/crossentropy": 2.381235361099243, "loss/hidden": 0.796875, "loss/logits": 0.16290614008903503, "loss/reg": 0.015541622415184975, "step": 1347 }, { "epoch": 0.1685, "grad_norm": 2.5820398330688477, "grad_norm_var": 0.27883561860861783, "learning_rate": 0.0001, "loss": 0.9562, "loss/crossentropy": 2.6334848403930664, "loss/hidden": 0.6796875, "loss/logits": 0.12118068337440491, "loss/reg": 0.01553329173475504, "step": 1348 }, { "epoch": 0.168625, "grad_norm": 3.5735538005828857, "grad_norm_var": 0.29189209249198955, "learning_rate": 0.0001, "loss": 1.1534, "loss/crossentropy": 2.587275505065918, "loss/hidden": 0.8046875, "loss/logits": 0.1934923231601715, "loss/reg": 0.01552544254809618, "step": 1349 }, { "epoch": 0.16875, "grad_norm": 2.8559420108795166, "grad_norm_var": 0.23804927371853257, "learning_rate": 0.0001, "loss": 1.1208, "loss/crossentropy": 2.4297351837158203, "loss/hidden": 0.81640625, "loss/logits": 0.14923590421676636, "loss/reg": 0.01551780290901661, "step": 1350 }, { "epoch": 0.168875, "grad_norm": 2.5194828510284424, "grad_norm_var": 0.25071932648220735, "learning_rate": 0.0001, "loss": 1.0241, "loss/crossentropy": 2.4934616088867188, "loss/hidden": 0.74609375, "loss/logits": 0.12285730242729187, "loss/reg": 0.015510031953454018, "step": 1351 }, { "epoch": 0.169, "grad_norm": 2.714785575866699, "grad_norm_var": 0.2559699884583568, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.7447316646575928, "loss/hidden": 0.703125, "loss/logits": 0.13469095528125763, "loss/reg": 0.015501690097153187, "step": 1352 }, { "epoch": 0.169125, "grad_norm": 2.226409912109375, "grad_norm_var": 0.2523278002535238, "learning_rate": 0.0001, "loss": 1.0071, "loss/crossentropy": 2.3574349880218506, "loss/hidden": 0.72265625, "loss/logits": 0.1294814646244049, "loss/reg": 0.015494225546717644, "step": 1353 }, { "epoch": 0.16925, "grad_norm": 2.2927169799804688, "grad_norm_var": 0.27320470544451986, "learning_rate": 0.0001, "loss": 0.9263, "loss/crossentropy": 2.6081109046936035, "loss/hidden": 0.66015625, "loss/logits": 0.11125902831554413, "loss/reg": 0.015486053191125393, "step": 1354 }, { "epoch": 0.169375, "grad_norm": 2.4285078048706055, "grad_norm_var": 0.27357804892559606, "learning_rate": 0.0001, "loss": 1.0351, "loss/crossentropy": 2.654952049255371, "loss/hidden": 0.75, "loss/logits": 0.13033051788806915, "loss/reg": 0.015478008426725864, "step": 1355 }, { "epoch": 0.1695, "grad_norm": 2.5576510429382324, "grad_norm_var": 0.27661218725713915, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.525481939315796, "loss/hidden": 0.6953125, "loss/logits": 0.14441752433776855, "loss/reg": 0.015470432117581367, "step": 1356 }, { "epoch": 0.169625, "grad_norm": 3.3350884914398193, "grad_norm_var": 0.2913103816812319, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.649492025375366, "loss/hidden": 0.75, "loss/logits": 0.15016667544841766, "loss/reg": 0.015463395975530148, "step": 1357 }, { "epoch": 0.16975, "grad_norm": 39.05031967163086, "grad_norm_var": 82.07282531411327, "learning_rate": 0.0001, "loss": 1.0599, "loss/crossentropy": 2.447631359100342, "loss/hidden": 0.796875, "loss/logits": 0.1085047721862793, "loss/reg": 0.015455886721611023, "step": 1358 }, { "epoch": 0.169875, "grad_norm": 2.6993017196655273, "grad_norm_var": 82.14524851838627, "learning_rate": 0.0001, "loss": 0.9833, "loss/crossentropy": 2.4683852195739746, "loss/hidden": 0.70703125, "loss/logits": 0.12176641821861267, "loss/reg": 0.015447665005922318, "step": 1359 }, { "epoch": 0.17, "grad_norm": 2.8238816261291504, "grad_norm_var": 82.03774119861214, "learning_rate": 0.0001, "loss": 0.9911, "loss/crossentropy": 2.6632418632507324, "loss/hidden": 0.71875, "loss/logits": 0.11792933940887451, "loss/reg": 0.015439565293490887, "step": 1360 }, { "epoch": 0.170125, "grad_norm": 3.259418249130249, "grad_norm_var": 82.15246657395147, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.7344815731048584, "loss/hidden": 0.8046875, "loss/logits": 0.15935808420181274, "loss/reg": 0.015431756153702736, "step": 1361 }, { "epoch": 0.17025, "grad_norm": 3.7520382404327393, "grad_norm_var": 82.19938647618869, "learning_rate": 0.0001, "loss": 0.9883, "loss/crossentropy": 2.7563743591308594, "loss/hidden": 0.6953125, "loss/logits": 0.13872626423835754, "loss/reg": 0.015424099750816822, "step": 1362 }, { "epoch": 0.170375, "grad_norm": 2.766862154006958, "grad_norm_var": 82.20819070334429, "learning_rate": 0.0001, "loss": 0.9816, "loss/crossentropy": 2.5083751678466797, "loss/hidden": 0.703125, "loss/logits": 0.12434659898281097, "loss/reg": 0.01541648618876934, "step": 1363 }, { "epoch": 0.1705, "grad_norm": 4.0485734939575195, "grad_norm_var": 81.85223413984265, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.2484068870544434, "loss/hidden": 0.9296875, "loss/logits": 0.19513945281505585, "loss/reg": 0.015408649109303951, "step": 1364 }, { "epoch": 0.170625, "grad_norm": 3.177393913269043, "grad_norm_var": 81.94697865555716, "learning_rate": 0.0001, "loss": 1.0776, "loss/crossentropy": 2.5238068103790283, "loss/hidden": 0.796875, "loss/logits": 0.1267687976360321, "loss/reg": 0.01540052518248558, "step": 1365 }, { "epoch": 0.17075, "grad_norm": 8.407809257507324, "grad_norm_var": 82.17024249924071, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.6552045345306396, "loss/hidden": 1.0, "loss/logits": 0.2205539047718048, "loss/reg": 0.01539271418005228, "step": 1366 }, { "epoch": 0.170875, "grad_norm": 3.5715489387512207, "grad_norm_var": 81.82079857540536, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.3991222381591797, "loss/hidden": 0.88671875, "loss/logits": 0.1562199890613556, "loss/reg": 0.01538482028990984, "step": 1367 }, { "epoch": 0.171, "grad_norm": 3.431995153427124, "grad_norm_var": 81.57995561859133, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.475353479385376, "loss/hidden": 0.8515625, "loss/logits": 0.16577771306037903, "loss/reg": 0.015377094969153404, "step": 1368 }, { "epoch": 0.171125, "grad_norm": 2.35178279876709, "grad_norm_var": 81.52430399273271, "learning_rate": 0.0001, "loss": 0.9426, "loss/crossentropy": 2.6334617137908936, "loss/hidden": 0.67578125, "loss/logits": 0.113157257437706, "loss/reg": 0.015369528904557228, "step": 1369 }, { "epoch": 0.17125, "grad_norm": 2.277183771133423, "grad_norm_var": 81.53121470659494, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.4363012313842773, "loss/hidden": 0.796875, "loss/logits": 0.14004433155059814, "loss/reg": 0.015361781232059002, "step": 1370 }, { "epoch": 0.171375, "grad_norm": 2.6015970706939697, "grad_norm_var": 81.45940420807128, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.4151899814605713, "loss/hidden": 0.80859375, "loss/logits": 0.14207510650157928, "loss/reg": 0.015353736467659473, "step": 1371 }, { "epoch": 0.1715, "grad_norm": 2.7182395458221436, "grad_norm_var": 81.39518805728484, "learning_rate": 0.0001, "loss": 1.0125, "loss/crossentropy": 2.546245813369751, "loss/hidden": 0.734375, "loss/logits": 0.12465515732765198, "loss/reg": 0.015345688909292221, "step": 1372 }, { "epoch": 0.171625, "grad_norm": 2.4946951866149902, "grad_norm_var": 81.69783167090145, "learning_rate": 0.0001, "loss": 1.0365, "loss/crossentropy": 2.4872238636016846, "loss/hidden": 0.76171875, "loss/logits": 0.121379055082798, "loss/reg": 0.015337930992245674, "step": 1373 }, { "epoch": 0.17175, "grad_norm": 2.35099720954895, "grad_norm_var": 2.1436230027036345, "learning_rate": 0.0001, "loss": 1.0129, "loss/crossentropy": 2.5447275638580322, "loss/hidden": 0.73046875, "loss/logits": 0.12912125885486603, "loss/reg": 0.015329563058912754, "step": 1374 }, { "epoch": 0.171875, "grad_norm": 3.531275987625122, "grad_norm_var": 2.120711236961161, "learning_rate": 0.0001, "loss": 1.1698, "loss/crossentropy": 2.6748950481414795, "loss/hidden": 0.88671875, "loss/logits": 0.1298878937959671, "loss/reg": 0.015320966020226479, "step": 1375 }, { "epoch": 0.172, "grad_norm": 2.5129640102386475, "grad_norm_var": 2.1484737670750205, "learning_rate": 0.0001, "loss": 0.9549, "loss/crossentropy": 2.4481074810028076, "loss/hidden": 0.67578125, "loss/logits": 0.1260184943675995, "loss/reg": 0.015312742441892624, "step": 1376 }, { "epoch": 0.172125, "grad_norm": 2.2456185817718506, "grad_norm_var": 2.222034906196358, "learning_rate": 0.0001, "loss": 0.9583, "loss/crossentropy": 2.697751760482788, "loss/hidden": 0.6875, "loss/logits": 0.11780545860528946, "loss/reg": 0.01530410535633564, "step": 1377 }, { "epoch": 0.17225, "grad_norm": 3.371694564819336, "grad_norm_var": 2.206379139706632, "learning_rate": 0.0001, "loss": 1.0815, "loss/crossentropy": 2.5198638439178467, "loss/hidden": 0.77734375, "loss/logits": 0.1512106955051422, "loss/reg": 0.015296036377549171, "step": 1378 }, { "epoch": 0.172375, "grad_norm": 3.3469138145446777, "grad_norm_var": 2.190717504538292, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.500404119491577, "loss/hidden": 0.84375, "loss/logits": 0.10408136248588562, "loss/reg": 0.015288051217794418, "step": 1379 }, { "epoch": 0.1725, "grad_norm": 2.7227180004119873, "grad_norm_var": 2.1642779969536603, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.3428802490234375, "loss/hidden": 0.79296875, "loss/logits": 0.1386614292860031, "loss/reg": 0.015280190855264664, "step": 1380 }, { "epoch": 0.172625, "grad_norm": 2.9242119789123535, "grad_norm_var": 2.1688668977830954, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.668134927749634, "loss/hidden": 0.88671875, "loss/logits": 0.19956323504447937, "loss/reg": 0.015271900221705437, "step": 1381 }, { "epoch": 0.17275, "grad_norm": 2.492800712585449, "grad_norm_var": 0.23164549730800724, "learning_rate": 0.0001, "loss": 0.9934, "loss/crossentropy": 2.5607941150665283, "loss/hidden": 0.71484375, "loss/logits": 0.12592297792434692, "loss/reg": 0.01526356115937233, "step": 1382 }, { "epoch": 0.172875, "grad_norm": 3.9195237159729004, "grad_norm_var": 0.2745866186604149, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.569718837738037, "loss/hidden": 0.9453125, "loss/logits": 0.18045136332511902, "loss/reg": 0.015255567617714405, "step": 1383 }, { "epoch": 0.173, "grad_norm": 3.3682620525360107, "grad_norm_var": 0.2697324337180568, "learning_rate": 0.0001, "loss": 1.0247, "loss/crossentropy": 2.756709337234497, "loss/hidden": 0.7421875, "loss/logits": 0.1299922913312912, "loss/reg": 0.015247280709445477, "step": 1384 }, { "epoch": 0.173125, "grad_norm": 75.24736022949219, "grad_norm_var": 327.7621509720236, "learning_rate": 0.0001, "loss": 1.1005, "loss/crossentropy": 2.492774248123169, "loss/hidden": 0.828125, "loss/logits": 0.11999952048063278, "loss/reg": 0.015239723026752472, "step": 1385 }, { "epoch": 0.17325, "grad_norm": 2.639838218688965, "grad_norm_var": 327.5234904743987, "learning_rate": 0.0001, "loss": 1.123, "loss/crossentropy": 2.783590793609619, "loss/hidden": 0.81640625, "loss/logits": 0.15431414544582367, "loss/reg": 0.015231656841933727, "step": 1386 }, { "epoch": 0.173375, "grad_norm": 8.296279907226562, "grad_norm_var": 325.9027345524763, "learning_rate": 0.0001, "loss": 1.0279, "loss/crossentropy": 2.591702938079834, "loss/hidden": 0.75390625, "loss/logits": 0.12174837291240692, "loss/reg": 0.01522404421120882, "step": 1387 }, { "epoch": 0.1735, "grad_norm": 5.515318393707275, "grad_norm_var": 324.5108738623049, "learning_rate": 0.0001, "loss": 1.2457, "loss/crossentropy": 2.321329355239868, "loss/hidden": 0.9453125, "loss/logits": 0.14820685982704163, "loss/reg": 0.01521630771458149, "step": 1388 }, { "epoch": 0.173625, "grad_norm": 2.965728759765625, "grad_norm_var": 324.1829850455801, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.864595413208008, "loss/hidden": 0.76171875, "loss/logits": 0.14945918321609497, "loss/reg": 0.015208802185952663, "step": 1389 }, { "epoch": 0.17375, "grad_norm": 2.8366498947143555, "grad_norm_var": 323.8341522332258, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.634251594543457, "loss/hidden": 0.8671875, "loss/logits": 0.1587602198123932, "loss/reg": 0.015202116221189499, "step": 1390 }, { "epoch": 0.173875, "grad_norm": 3.016205310821533, "grad_norm_var": 324.1573581089945, "learning_rate": 0.0001, "loss": 1.0881, "loss/crossentropy": 2.1916353702545166, "loss/hidden": 0.79296875, "loss/logits": 0.14321959018707275, "loss/reg": 0.015194511041045189, "step": 1391 }, { "epoch": 0.174, "grad_norm": 34.42859649658203, "grad_norm_var": 364.624406562687, "learning_rate": 0.0001, "loss": 1.0421, "loss/crossentropy": 2.4397382736206055, "loss/hidden": 0.76953125, "loss/logits": 0.12064860761165619, "loss/reg": 0.015187704935669899, "step": 1392 }, { "epoch": 0.174125, "grad_norm": 3.928023099899292, "grad_norm_var": 363.0711295434095, "learning_rate": 0.0001, "loss": 1.2257, "loss/crossentropy": 2.1384785175323486, "loss/hidden": 0.921875, "loss/logits": 0.15198791027069092, "loss/reg": 0.015179669484496117, "step": 1393 }, { "epoch": 0.17425, "grad_norm": 5.9230194091796875, "grad_norm_var": 361.2014745642014, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.474677801132202, "loss/hidden": 0.9453125, "loss/logits": 0.14034368097782135, "loss/reg": 0.015171775594353676, "step": 1394 }, { "epoch": 0.174375, "grad_norm": 5.671228885650635, "grad_norm_var": 359.4081015077895, "learning_rate": 0.0001, "loss": 1.6994, "loss/crossentropy": 2.1997835636138916, "loss/hidden": 1.34375, "loss/logits": 0.20398974418640137, "loss/reg": 0.015164447948336601, "step": 1395 }, { "epoch": 0.1745, "grad_norm": 2.8046653270721436, "grad_norm_var": 359.32498119248385, "learning_rate": 0.0001, "loss": 1.2227, "loss/crossentropy": 2.2905519008636475, "loss/hidden": 0.92578125, "loss/logits": 0.14532050490379333, "loss/reg": 0.015156446024775505, "step": 1396 }, { "epoch": 0.174625, "grad_norm": 3.8924100399017334, "grad_norm_var": 358.42190384848453, "learning_rate": 0.0001, "loss": 1.2919, "loss/crossentropy": 2.5372776985168457, "loss/hidden": 0.89453125, "loss/logits": 0.2458799183368683, "loss/reg": 0.015149053186178207, "step": 1397 }, { "epoch": 0.17475, "grad_norm": 3.204949378967285, "grad_norm_var": 357.6995478125139, "learning_rate": 0.0001, "loss": 1.2475, "loss/crossentropy": 2.3744595050811768, "loss/hidden": 0.9296875, "loss/logits": 0.16641047596931458, "loss/reg": 0.015141867101192474, "step": 1398 }, { "epoch": 0.174875, "grad_norm": 2.5903260707855225, "grad_norm_var": 358.97241696361874, "learning_rate": 0.0001, "loss": 1.079, "loss/crossentropy": 2.613074541091919, "loss/hidden": 0.7890625, "loss/logits": 0.1385442167520523, "loss/reg": 0.015134819783270359, "step": 1399 }, { "epoch": 0.175, "grad_norm": 2.877967357635498, "grad_norm_var": 359.4468337869737, "learning_rate": 0.0001, "loss": 1.0731, "loss/crossentropy": 2.320415735244751, "loss/hidden": 0.79296875, "loss/logits": 0.12884414196014404, "loss/reg": 0.015128130093216896, "step": 1400 }, { "epoch": 0.175125, "grad_norm": 2.7650208473205566, "grad_norm_var": 60.75819602934475, "learning_rate": 0.0001, "loss": 1.2011, "loss/crossentropy": 2.242631673812866, "loss/hidden": 0.8828125, "loss/logits": 0.1670370101928711, "loss/reg": 0.015121539123356342, "step": 1401 }, { "epoch": 0.17525, "grad_norm": 3.245616912841797, "grad_norm_var": 60.52307577230558, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.2215981483459473, "loss/hidden": 0.9453125, "loss/logits": 0.16608265042304993, "loss/reg": 0.015114927664399147, "step": 1402 }, { "epoch": 0.175375, "grad_norm": 62.871517181396484, "grad_norm_var": 264.3128112734296, "learning_rate": 0.0001, "loss": 3.0131, "loss/crossentropy": 2.5125503540039062, "loss/hidden": 2.4375, "loss/logits": 0.42451030015945435, "loss/reg": 0.015108034946024418, "step": 1403 }, { "epoch": 0.1755, "grad_norm": 3.219738006591797, "grad_norm_var": 265.7955458129954, "learning_rate": 0.0001, "loss": 1.2222, "loss/crossentropy": 2.3459861278533936, "loss/hidden": 0.91015625, "loss/logits": 0.16100311279296875, "loss/reg": 0.015101352706551552, "step": 1404 }, { "epoch": 0.175625, "grad_norm": 2.7814700603485107, "grad_norm_var": 265.9493587458945, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.5293545722961426, "loss/hidden": 0.80078125, "loss/logits": 0.1387513130903244, "loss/reg": 0.01509346254169941, "step": 1405 }, { "epoch": 0.17575, "grad_norm": 2.6544785499572754, "grad_norm_var": 266.1042610002773, "learning_rate": 0.0001, "loss": 1.0535, "loss/crossentropy": 2.5097312927246094, "loss/hidden": 0.7734375, "loss/logits": 0.1291828155517578, "loss/reg": 0.015085602179169655, "step": 1406 }, { "epoch": 0.175875, "grad_norm": 3.1223692893981934, "grad_norm_var": 266.0186046129393, "learning_rate": 0.0001, "loss": 1.4849, "loss/crossentropy": 2.2004942893981934, "loss/hidden": 1.140625, "loss/logits": 0.1934625506401062, "loss/reg": 0.015078413300216198, "step": 1407 }, { "epoch": 0.176, "grad_norm": 4.406020164489746, "grad_norm_var": 221.05808913641562, "learning_rate": 0.0001, "loss": 1.2206, "loss/crossentropy": 2.239046096801758, "loss/hidden": 0.90625, "loss/logits": 0.1636815071105957, "loss/reg": 0.015070472843945026, "step": 1408 }, { "epoch": 0.176125, "grad_norm": 2.3968393802642822, "grad_norm_var": 221.88230400943462, "learning_rate": 0.0001, "loss": 1.0135, "loss/crossentropy": 2.5016496181488037, "loss/hidden": 0.73046875, "loss/logits": 0.13240215182304382, "loss/reg": 0.015062601305544376, "step": 1409 }, { "epoch": 0.17625, "grad_norm": 4.053096771240234, "grad_norm_var": 222.40718733745135, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.5696771144866943, "loss/hidden": 1.046875, "loss/logits": 0.19007228314876556, "loss/reg": 0.015055070631206036, "step": 1410 }, { "epoch": 0.176375, "grad_norm": 3.213609218597412, "grad_norm_var": 223.23151802105554, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.580714225769043, "loss/hidden": 0.81640625, "loss/logits": 0.13412834703922272, "loss/reg": 0.01504768431186676, "step": 1411 }, { "epoch": 0.1765, "grad_norm": 7.05836820602417, "grad_norm_var": 222.05031160271446, "learning_rate": 0.0001, "loss": 1.3444, "loss/crossentropy": 2.6128039360046387, "loss/hidden": 1.046875, "loss/logits": 0.14714105427265167, "loss/reg": 0.015039726160466671, "step": 1412 }, { "epoch": 0.176625, "grad_norm": 2.905400037765503, "grad_norm_var": 222.53952156242244, "learning_rate": 0.0001, "loss": 1.0393, "loss/crossentropy": 2.390307664871216, "loss/hidden": 0.75390625, "loss/logits": 0.13505280017852783, "loss/reg": 0.015031633898615837, "step": 1413 }, { "epoch": 0.17675, "grad_norm": 2.7498996257781982, "grad_norm_var": 222.7879046702847, "learning_rate": 0.0001, "loss": 1.1323, "loss/crossentropy": 2.4936277866363525, "loss/hidden": 0.828125, "loss/logits": 0.15391477942466736, "loss/reg": 0.015023862943053246, "step": 1414 }, { "epoch": 0.176875, "grad_norm": 3.2711455821990967, "grad_norm_var": 222.41140935738122, "learning_rate": 0.0001, "loss": 1.159, "loss/crossentropy": 2.586298704147339, "loss/hidden": 0.8515625, "loss/logits": 0.15724779665470123, "loss/reg": 0.015016157180070877, "step": 1415 }, { "epoch": 0.177, "grad_norm": 2.5050208568573, "grad_norm_var": 222.63002493426723, "learning_rate": 0.0001, "loss": 0.9875, "loss/crossentropy": 2.4593968391418457, "loss/hidden": 0.71484375, "loss/logits": 0.12256527692079544, "loss/reg": 0.015008063055574894, "step": 1416 }, { "epoch": 0.177125, "grad_norm": 5.0899248123168945, "grad_norm_var": 221.63143029624337, "learning_rate": 0.0001, "loss": 1.3226, "loss/crossentropy": 2.529581069946289, "loss/hidden": 0.9921875, "loss/logits": 0.18042267858982086, "loss/reg": 0.014999698847532272, "step": 1417 }, { "epoch": 0.17725, "grad_norm": 2.873887300491333, "grad_norm_var": 221.83712878589319, "learning_rate": 0.0001, "loss": 1.1234, "loss/crossentropy": 2.637040376663208, "loss/hidden": 0.81640625, "loss/logits": 0.1570313721895218, "loss/reg": 0.014991391450166702, "step": 1418 }, { "epoch": 0.177375, "grad_norm": 2.4938929080963135, "grad_norm_var": 1.4893372742809399, "learning_rate": 0.0001, "loss": 1.0818, "loss/crossentropy": 2.3521924018859863, "loss/hidden": 0.7890625, "loss/logits": 0.14290779829025269, "loss/reg": 0.01498338021337986, "step": 1419 }, { "epoch": 0.1775, "grad_norm": 7.30941104888916, "grad_norm_var": 2.4229140389199517, "learning_rate": 0.0001, "loss": 1.2235, "loss/crossentropy": 2.295588254928589, "loss/hidden": 0.9140625, "loss/logits": 0.15963850915431976, "loss/reg": 0.014975651167333126, "step": 1420 }, { "epoch": 0.177625, "grad_norm": 2.976419448852539, "grad_norm_var": 2.4019258000462114, "learning_rate": 0.0001, "loss": 1.1336, "loss/crossentropy": 2.398700714111328, "loss/hidden": 0.8125, "loss/logits": 0.17146307229995728, "loss/reg": 0.014968100003898144, "step": 1421 }, { "epoch": 0.17775, "grad_norm": 6.012148380279541, "grad_norm_var": 2.641842426821879, "learning_rate": 0.0001, "loss": 1.4281, "loss/crossentropy": 2.3960044384002686, "loss/hidden": 1.0625, "loss/logits": 0.21602004766464233, "loss/reg": 0.01496051624417305, "step": 1422 }, { "epoch": 0.177875, "grad_norm": 3.721160888671875, "grad_norm_var": 2.601979835113735, "learning_rate": 0.0001, "loss": 1.4417, "loss/crossentropy": 2.231612205505371, "loss/hidden": 1.078125, "loss/logits": 0.21401020884513855, "loss/reg": 0.01495243888348341, "step": 1423 }, { "epoch": 0.178, "grad_norm": 2.6747055053710938, "grad_norm_var": 2.68168930149086, "learning_rate": 0.0001, "loss": 1.1718, "loss/crossentropy": 2.2853996753692627, "loss/hidden": 0.87109375, "loss/logits": 0.15127936005592346, "loss/reg": 0.014944901689887047, "step": 1424 }, { "epoch": 0.178125, "grad_norm": 11.191994667053223, "grad_norm_var": 5.833885032277132, "learning_rate": 0.0001, "loss": 1.2679, "loss/crossentropy": 2.5116124153137207, "loss/hidden": 0.9296875, "loss/logits": 0.18879596889019012, "loss/reg": 0.014937575906515121, "step": 1425 }, { "epoch": 0.17825, "grad_norm": 3.66359543800354, "grad_norm_var": 5.860409413897694, "learning_rate": 0.0001, "loss": 1.1213, "loss/crossentropy": 2.5135722160339355, "loss/hidden": 0.82421875, "loss/logits": 0.1477985680103302, "loss/reg": 0.014929663389921188, "step": 1426 }, { "epoch": 0.178375, "grad_norm": 7.735219478607178, "grad_norm_var": 6.448943732227483, "learning_rate": 0.0001, "loss": 1.1287, "loss/crossentropy": 2.675562620162964, "loss/hidden": 0.81640625, "loss/logits": 0.16307833790779114, "loss/reg": 0.01492208894342184, "step": 1427 }, { "epoch": 0.1785, "grad_norm": 3.957063913345337, "grad_norm_var": 6.0498597570917205, "learning_rate": 0.0001, "loss": 1.247, "loss/crossentropy": 2.4751713275909424, "loss/hidden": 0.91015625, "loss/logits": 0.18771307170391083, "loss/reg": 0.014914041385054588, "step": 1428 }, { "epoch": 0.178625, "grad_norm": 4.195860862731934, "grad_norm_var": 5.888917428574246, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.572279930114746, "loss/hidden": 1.03125, "loss/logits": 0.23586444556713104, "loss/reg": 0.014906428754329681, "step": 1429 }, { "epoch": 0.17875, "grad_norm": 3.2409324645996094, "grad_norm_var": 5.787681963969308, "learning_rate": 0.0001, "loss": 1.0805, "loss/crossentropy": 2.401996374130249, "loss/hidden": 0.80078125, "loss/logits": 0.13070350885391235, "loss/reg": 0.014898893423378468, "step": 1430 }, { "epoch": 0.178875, "grad_norm": 3.5550079345703125, "grad_norm_var": 5.744049750040046, "learning_rate": 0.0001, "loss": 1.159, "loss/crossentropy": 2.4993062019348145, "loss/hidden": 0.8359375, "loss/logits": 0.1741950362920761, "loss/reg": 0.014890996739268303, "step": 1431 }, { "epoch": 0.179, "grad_norm": 2.403970718383789, "grad_norm_var": 5.772574341640287, "learning_rate": 0.0001, "loss": 0.9385, "loss/crossentropy": 2.338052749633789, "loss/hidden": 0.67578125, "loss/logits": 0.11390648782253265, "loss/reg": 0.014883420430123806, "step": 1432 }, { "epoch": 0.179125, "grad_norm": 3.128432512283325, "grad_norm_var": 5.876657514659041, "learning_rate": 0.0001, "loss": 1.1205, "loss/crossentropy": 2.323333740234375, "loss/hidden": 0.828125, "loss/logits": 0.1436302661895752, "loss/reg": 0.014875980094075203, "step": 1433 }, { "epoch": 0.17925, "grad_norm": 2.75893235206604, "grad_norm_var": 5.901577514262766, "learning_rate": 0.0001, "loss": 1.0911, "loss/crossentropy": 2.403975009918213, "loss/hidden": 0.796875, "loss/logits": 0.14556562900543213, "loss/reg": 0.014869259670376778, "step": 1434 }, { "epoch": 0.179375, "grad_norm": 4.4465861320495605, "grad_norm_var": 5.633549820228222, "learning_rate": 0.0001, "loss": 1.4989, "loss/crossentropy": 2.418936014175415, "loss/hidden": 1.109375, "loss/logits": 0.24089978635311127, "loss/reg": 0.014861512929201126, "step": 1435 }, { "epoch": 0.1795, "grad_norm": 2.976722478866577, "grad_norm_var": 5.218912579761936, "learning_rate": 0.0001, "loss": 1.1243, "loss/crossentropy": 2.423041582107544, "loss/hidden": 0.83203125, "loss/logits": 0.14372695982456207, "loss/reg": 0.014853725209832191, "step": 1436 }, { "epoch": 0.179625, "grad_norm": 2.8299055099487305, "grad_norm_var": 5.245913751427944, "learning_rate": 0.0001, "loss": 0.9575, "loss/crossentropy": 2.5777714252471924, "loss/hidden": 0.6796875, "loss/logits": 0.1293865293264389, "loss/reg": 0.014846443198621273, "step": 1437 }, { "epoch": 0.17975, "grad_norm": 3.8410592079162598, "grad_norm_var": 5.039317138416171, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.2890384197235107, "loss/hidden": 0.859375, "loss/logits": 0.15949448943138123, "loss/reg": 0.014838694594800472, "step": 1438 }, { "epoch": 0.179875, "grad_norm": 3.1295011043548584, "grad_norm_var": 5.094637447706395, "learning_rate": 0.0001, "loss": 1.0505, "loss/crossentropy": 2.4979283809661865, "loss/hidden": 0.77734375, "loss/logits": 0.12488513439893723, "loss/reg": 0.014831180684268475, "step": 1439 }, { "epoch": 0.18, "grad_norm": 3.2620816230773926, "grad_norm_var": 5.0039422612885565, "learning_rate": 0.0001, "loss": 1.0724, "loss/crossentropy": 2.5473530292510986, "loss/hidden": 0.8046875, "loss/logits": 0.1195274293422699, "loss/reg": 0.014823324047029018, "step": 1440 }, { "epoch": 0.180125, "grad_norm": 2.8635447025299072, "grad_norm_var": 1.5135115386307687, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.709258794784546, "loss/hidden": 0.76953125, "loss/logits": 0.13453327119350433, "loss/reg": 0.01481538638472557, "step": 1441 }, { "epoch": 0.18025, "grad_norm": 2.694793462753296, "grad_norm_var": 1.5670935881051355, "learning_rate": 0.0001, "loss": 1.042, "loss/crossentropy": 2.6498730182647705, "loss/hidden": 0.76171875, "loss/logits": 0.13220643997192383, "loss/reg": 0.01480773463845253, "step": 1442 }, { "epoch": 0.180375, "grad_norm": 3.662919282913208, "grad_norm_var": 0.33856051311708674, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.589240550994873, "loss/hidden": 0.86328125, "loss/logits": 0.18375417590141296, "loss/reg": 0.014799892902374268, "step": 1443 }, { "epoch": 0.1805, "grad_norm": 3.0655100345611572, "grad_norm_var": 0.3112265539838631, "learning_rate": 0.0001, "loss": 1.0221, "loss/crossentropy": 2.322566509246826, "loss/hidden": 0.7578125, "loss/logits": 0.11638704687356949, "loss/reg": 0.014792241156101227, "step": 1444 }, { "epoch": 0.180625, "grad_norm": 3.0010573863983154, "grad_norm_var": 0.2503215727538245, "learning_rate": 0.0001, "loss": 1.1143, "loss/crossentropy": 2.2516565322875977, "loss/hidden": 0.83203125, "loss/logits": 0.13441051542758942, "loss/reg": 0.014784677885472775, "step": 1445 }, { "epoch": 0.18075, "grad_norm": 2.6158626079559326, "grad_norm_var": 0.26956362632713443, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.476480007171631, "loss/hidden": 0.80078125, "loss/logits": 0.1648055911064148, "loss/reg": 0.014777293428778648, "step": 1446 }, { "epoch": 0.180875, "grad_norm": 2.593005895614624, "grad_norm_var": 0.2741393520658806, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.5278983116149902, "loss/hidden": 0.83203125, "loss/logits": 0.1651032269001007, "loss/reg": 0.014769522473216057, "step": 1447 }, { "epoch": 0.181, "grad_norm": 3.3676040172576904, "grad_norm_var": 0.24536603446710312, "learning_rate": 0.0001, "loss": 1.0625, "loss/crossentropy": 2.5347743034362793, "loss/hidden": 0.77734375, "loss/logits": 0.13756276667118073, "loss/reg": 0.01476323138922453, "step": 1448 }, { "epoch": 0.181125, "grad_norm": 2.7923572063446045, "grad_norm_var": 0.2529365869795574, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.110182046890259, "loss/hidden": 1.0234375, "loss/logits": 0.1933690905570984, "loss/reg": 0.014755993150174618, "step": 1449 }, { "epoch": 0.18125, "grad_norm": 70.97599792480469, "grad_norm_var": 287.8273579394089, "learning_rate": 0.0001, "loss": 1.1063, "loss/crossentropy": 2.6005754470825195, "loss/hidden": 0.8125, "loss/logits": 0.14632144570350647, "loss/reg": 0.014748108573257923, "step": 1450 }, { "epoch": 0.181375, "grad_norm": 5.7473530769348145, "grad_norm_var": 287.42393180966496, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.3792290687561035, "loss/hidden": 1.2265625, "loss/logits": 0.31258517503738403, "loss/reg": 0.014740433543920517, "step": 1451 }, { "epoch": 0.1815, "grad_norm": 3.4155495166778564, "grad_norm_var": 287.17343283264796, "learning_rate": 0.0001, "loss": 1.1977, "loss/crossentropy": 2.4722485542297363, "loss/hidden": 0.8828125, "loss/logits": 0.1675756871700287, "loss/reg": 0.014733717776834965, "step": 1452 }, { "epoch": 0.181625, "grad_norm": 2.5721304416656494, "grad_norm_var": 287.3377922083848, "learning_rate": 0.0001, "loss": 1.0864, "loss/crossentropy": 2.65114426612854, "loss/hidden": 0.79296875, "loss/logits": 0.1461598426103592, "loss/reg": 0.01472744531929493, "step": 1453 }, { "epoch": 0.18175, "grad_norm": 3.40112566947937, "grad_norm_var": 287.5630487447142, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.49849796295166, "loss/hidden": 1.0390625, "loss/logits": 0.19910715520381927, "loss/reg": 0.014719628728926182, "step": 1454 }, { "epoch": 0.181875, "grad_norm": 3.4560482501983643, "grad_norm_var": 287.381708208898, "learning_rate": 0.0001, "loss": 1.2084, "loss/crossentropy": 2.630333423614502, "loss/hidden": 0.90625, "loss/logits": 0.1550079882144928, "loss/reg": 0.014712607488036156, "step": 1455 }, { "epoch": 0.182, "grad_norm": 2.8808276653289795, "grad_norm_var": 287.60459257620465, "learning_rate": 0.0001, "loss": 1.1446, "loss/crossentropy": 2.7043960094451904, "loss/hidden": 0.8515625, "loss/logits": 0.14598789811134338, "loss/reg": 0.01470634713768959, "step": 1456 }, { "epoch": 0.182125, "grad_norm": 2.3799662590026855, "grad_norm_var": 287.91454947447465, "learning_rate": 0.0001, "loss": 1.0639, "loss/crossentropy": 2.4705262184143066, "loss/hidden": 0.77734375, "loss/logits": 0.13952219486236572, "loss/reg": 0.014699905179440975, "step": 1457 }, { "epoch": 0.18225, "grad_norm": 3.0028903484344482, "grad_norm_var": 287.7266240160942, "learning_rate": 0.0001, "loss": 1.1176, "loss/crossentropy": 2.4257349967956543, "loss/hidden": 0.8125, "loss/logits": 0.1581314206123352, "loss/reg": 0.014692124910652637, "step": 1458 }, { "epoch": 0.182375, "grad_norm": 2.5650155544281006, "grad_norm_var": 288.3538726561922, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.4188382625579834, "loss/hidden": 0.7890625, "loss/logits": 0.14031586050987244, "loss/reg": 0.014684327878057957, "step": 1459 }, { "epoch": 0.1825, "grad_norm": 3.0308585166931152, "grad_norm_var": 288.3738099925176, "learning_rate": 0.0001, "loss": 1.2435, "loss/crossentropy": 2.5064949989318848, "loss/hidden": 0.9296875, "loss/logits": 0.16703946888446808, "loss/reg": 0.014676612801849842, "step": 1460 }, { "epoch": 0.182625, "grad_norm": 3.903977394104004, "grad_norm_var": 287.89971053282926, "learning_rate": 0.0001, "loss": 1.0425, "loss/crossentropy": 2.5762782096862793, "loss/hidden": 0.75390625, "loss/logits": 0.14189936220645905, "loss/reg": 0.014669781550765038, "step": 1461 }, { "epoch": 0.18275, "grad_norm": 3.6422464847564697, "grad_norm_var": 287.3082663217944, "learning_rate": 0.0001, "loss": 1.0475, "loss/crossentropy": 2.6891908645629883, "loss/hidden": 0.7734375, "loss/logits": 0.12741170823574066, "loss/reg": 0.0146620599552989, "step": 1462 }, { "epoch": 0.182875, "grad_norm": 2.7266409397125244, "grad_norm_var": 287.22225368800906, "learning_rate": 0.0001, "loss": 1.1596, "loss/crossentropy": 2.4353747367858887, "loss/hidden": 0.84765625, "loss/logits": 0.16537800431251526, "loss/reg": 0.014655125327408314, "step": 1463 }, { "epoch": 0.183, "grad_norm": 8.853667259216309, "grad_norm_var": 286.08693801368264, "learning_rate": 0.0001, "loss": 1.2411, "loss/crossentropy": 2.9862468242645264, "loss/hidden": 0.90625, "loss/logits": 0.1883150041103363, "loss/reg": 0.014648628421127796, "step": 1464 }, { "epoch": 0.183125, "grad_norm": 3.3163163661956787, "grad_norm_var": 285.7518694340515, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.642627477645874, "loss/hidden": 0.8671875, "loss/logits": 0.15386328101158142, "loss/reg": 0.014640781097114086, "step": 1465 }, { "epoch": 0.18325, "grad_norm": 3.663252830505371, "grad_norm_var": 2.5336251924554736, "learning_rate": 0.0001, "loss": 1.2303, "loss/crossentropy": 2.353497266769409, "loss/hidden": 0.91015625, "loss/logits": 0.17375797033309937, "loss/reg": 0.014634395018219948, "step": 1466 }, { "epoch": 0.183375, "grad_norm": 3.4295380115509033, "grad_norm_var": 2.224270864584783, "learning_rate": 0.0001, "loss": 1.186, "loss/crossentropy": 2.5201594829559326, "loss/hidden": 0.8828125, "loss/logits": 0.1568823754787445, "loss/reg": 0.01462656632065773, "step": 1467 }, { "epoch": 0.1835, "grad_norm": 2.4592854976654053, "grad_norm_var": 2.294103952189973, "learning_rate": 0.0001, "loss": 1.0183, "loss/crossentropy": 2.5786993503570557, "loss/hidden": 0.74609375, "loss/logits": 0.1260080635547638, "loss/reg": 0.01461972575634718, "step": 1468 }, { "epoch": 0.183625, "grad_norm": 4.478452205657959, "grad_norm_var": 2.296768240317753, "learning_rate": 0.0001, "loss": 1.1604, "loss/crossentropy": 2.472398042678833, "loss/hidden": 0.87890625, "loss/logits": 0.13542431592941284, "loss/reg": 0.014611870981752872, "step": 1469 }, { "epoch": 0.18375, "grad_norm": 2.8013250827789307, "grad_norm_var": 2.3331091729009654, "learning_rate": 0.0001, "loss": 1.2052, "loss/crossentropy": 2.448435068130493, "loss/hidden": 0.8671875, "loss/logits": 0.19197094440460205, "loss/reg": 0.014604009687900543, "step": 1470 }, { "epoch": 0.183875, "grad_norm": 2.5477471351623535, "grad_norm_var": 2.394463361736054, "learning_rate": 0.0001, "loss": 1.0754, "loss/crossentropy": 2.4608027935028076, "loss/hidden": 0.796875, "loss/logits": 0.13251473009586334, "loss/reg": 0.014596136286854744, "step": 1471 }, { "epoch": 0.184, "grad_norm": 2.9321022033691406, "grad_norm_var": 2.3905305167023623, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.4788243770599365, "loss/hidden": 0.7421875, "loss/logits": 0.10894551128149033, "loss/reg": 0.014588245190680027, "step": 1472 }, { "epoch": 0.184125, "grad_norm": 3.2976138591766357, "grad_norm_var": 2.3081604420680737, "learning_rate": 0.0001, "loss": 1.0611, "loss/crossentropy": 2.440394878387451, "loss/hidden": 0.77734375, "loss/logits": 0.13798683881759644, "loss/reg": 0.014581209979951382, "step": 1473 }, { "epoch": 0.18425, "grad_norm": 2.3695755004882812, "grad_norm_var": 2.378640708203821, "learning_rate": 0.0001, "loss": 1.0597, "loss/crossentropy": 2.457941770553589, "loss/hidden": 0.77734375, "loss/logits": 0.1365838199853897, "loss/reg": 0.014573454856872559, "step": 1474 }, { "epoch": 0.184375, "grad_norm": 2.909522771835327, "grad_norm_var": 2.343060112343133, "learning_rate": 0.0001, "loss": 1.0217, "loss/crossentropy": 2.291848659515381, "loss/hidden": 0.76953125, "loss/logits": 0.10653392970561981, "loss/reg": 0.014565936289727688, "step": 1475 }, { "epoch": 0.1845, "grad_norm": 4.1714019775390625, "grad_norm_var": 2.349577274287818, "learning_rate": 0.0001, "loss": 1.2516, "loss/crossentropy": 2.4249794483184814, "loss/hidden": 0.9375, "loss/logits": 0.16853143274784088, "loss/reg": 0.0145582165569067, "step": 1476 }, { "epoch": 0.184625, "grad_norm": 2.30051326751709, "grad_norm_var": 2.4439813338223115, "learning_rate": 0.0001, "loss": 1.0722, "loss/crossentropy": 2.621089220046997, "loss/hidden": 0.7890625, "loss/logits": 0.13760125637054443, "loss/reg": 0.014550295658409595, "step": 1477 }, { "epoch": 0.18475, "grad_norm": 2.7384438514709473, "grad_norm_var": 2.4771341504323514, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.1965439319610596, "loss/hidden": 0.83203125, "loss/logits": 0.1342887282371521, "loss/reg": 0.014541985467076302, "step": 1478 }, { "epoch": 0.184875, "grad_norm": 37.360321044921875, "grad_norm_var": 74.16407744545192, "learning_rate": 0.0001, "loss": 0.9989, "loss/crossentropy": 2.9801506996154785, "loss/hidden": 0.75, "loss/logits": 0.10352309793233871, "loss/reg": 0.014533448964357376, "step": 1479 }, { "epoch": 0.185, "grad_norm": 3.2833774089813232, "grad_norm_var": 73.68816936181202, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.354642868041992, "loss/hidden": 1.0078125, "loss/logits": 0.19911861419677734, "loss/reg": 0.014524821192026138, "step": 1480 }, { "epoch": 0.185125, "grad_norm": 2.9734416007995605, "grad_norm_var": 73.78408654274459, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.5445892810821533, "loss/hidden": 0.84375, "loss/logits": 0.1597684621810913, "loss/reg": 0.014516538009047508, "step": 1481 }, { "epoch": 0.18525, "grad_norm": 3.2212953567504883, "grad_norm_var": 73.88875146417945, "learning_rate": 0.0001, "loss": 1.1972, "loss/crossentropy": 2.4549801349639893, "loss/hidden": 0.875, "loss/logits": 0.17715027928352356, "loss/reg": 0.014508497901260853, "step": 1482 }, { "epoch": 0.185375, "grad_norm": 3.029637098312378, "grad_norm_var": 73.9933942207774, "learning_rate": 0.0001, "loss": 1.0726, "loss/crossentropy": 2.6153388023376465, "loss/hidden": 0.796875, "loss/logits": 0.13069182634353638, "loss/reg": 0.014500816352665424, "step": 1483 }, { "epoch": 0.1855, "grad_norm": 3.7203376293182373, "grad_norm_var": 73.63538575655532, "learning_rate": 0.0001, "loss": 1.1901, "loss/crossentropy": 2.437302350997925, "loss/hidden": 0.8828125, "loss/logits": 0.16235879063606262, "loss/reg": 0.014492850750684738, "step": 1484 }, { "epoch": 0.185625, "grad_norm": 2.606203079223633, "grad_norm_var": 74.04917997908026, "learning_rate": 0.0001, "loss": 0.9748, "loss/crossentropy": 2.3537402153015137, "loss/hidden": 0.71484375, "loss/logits": 0.11513984948396683, "loss/reg": 0.014484620653092861, "step": 1485 }, { "epoch": 0.18575, "grad_norm": 3.775780200958252, "grad_norm_var": 73.80448419578045, "learning_rate": 0.0001, "loss": 1.1872, "loss/crossentropy": 2.6341981887817383, "loss/hidden": 0.890625, "loss/logits": 0.1517730951309204, "loss/reg": 0.014477075077593327, "step": 1486 }, { "epoch": 0.185875, "grad_norm": 3.1672568321228027, "grad_norm_var": 73.60919906004487, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.3641412258148193, "loss/hidden": 0.78515625, "loss/logits": 0.12350372225046158, "loss/reg": 0.014468939043581486, "step": 1487 }, { "epoch": 0.186, "grad_norm": 2.9723494052886963, "grad_norm_var": 73.59690980017169, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.4746789932250977, "loss/hidden": 0.81640625, "loss/logits": 0.1578400433063507, "loss/reg": 0.014461321756243706, "step": 1488 }, { "epoch": 0.186125, "grad_norm": 4.781861782073975, "grad_norm_var": 73.34949321986252, "learning_rate": 0.0001, "loss": 1.0247, "loss/crossentropy": 2.6176137924194336, "loss/hidden": 0.76171875, "loss/logits": 0.11844634264707565, "loss/reg": 0.014453292824327946, "step": 1489 }, { "epoch": 0.18625, "grad_norm": 3.498753786087036, "grad_norm_var": 72.98251711179161, "learning_rate": 0.0001, "loss": 1.0433, "loss/crossentropy": 2.575167417526245, "loss/hidden": 0.76171875, "loss/logits": 0.1371680647134781, "loss/reg": 0.014445292763411999, "step": 1490 }, { "epoch": 0.186375, "grad_norm": 3.3169898986816406, "grad_norm_var": 72.85721374014727, "learning_rate": 0.0001, "loss": 1.2038, "loss/crossentropy": 2.556975841522217, "loss/hidden": 0.8984375, "loss/logits": 0.1610018014907837, "loss/reg": 0.01443762518465519, "step": 1491 }, { "epoch": 0.1865, "grad_norm": 3.731511354446411, "grad_norm_var": 72.9432662884783, "learning_rate": 0.0001, "loss": 0.967, "loss/crossentropy": 2.7008392810821533, "loss/hidden": 0.6953125, "loss/logits": 0.12734538316726685, "loss/reg": 0.014430060982704163, "step": 1492 }, { "epoch": 0.186625, "grad_norm": 2.6620421409606934, "grad_norm_var": 72.80179282549148, "learning_rate": 0.0001, "loss": 1.0512, "loss/crossentropy": 2.7235186100006104, "loss/hidden": 0.78125, "loss/logits": 0.1257205456495285, "loss/reg": 0.014422285370528698, "step": 1493 }, { "epoch": 0.18675, "grad_norm": 3.3065989017486572, "grad_norm_var": 72.61826294021522, "learning_rate": 0.0001, "loss": 1.1709, "loss/crossentropy": 2.5472517013549805, "loss/hidden": 0.87109375, "loss/logits": 0.15567487478256226, "loss/reg": 0.014414286240935326, "step": 1494 }, { "epoch": 0.186875, "grad_norm": 3.0994396209716797, "grad_norm_var": 0.27048224897859136, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.8537662029266357, "loss/hidden": 0.84765625, "loss/logits": 0.1407473087310791, "loss/reg": 0.014406588859856129, "step": 1495 }, { "epoch": 0.187, "grad_norm": 2.676715612411499, "grad_norm_var": 0.29658286686653407, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.4631519317626953, "loss/hidden": 0.83203125, "loss/logits": 0.1335071474313736, "loss/reg": 0.01439889520406723, "step": 1496 }, { "epoch": 0.187125, "grad_norm": 3.475898265838623, "grad_norm_var": 0.29157201854104237, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.6677818298339844, "loss/hidden": 1.03125, "loss/logits": 0.1972627192735672, "loss/reg": 0.014391305856406689, "step": 1497 }, { "epoch": 0.18725, "grad_norm": 2.769845962524414, "grad_norm_var": 0.30996036390038334, "learning_rate": 0.0001, "loss": 1.0487, "loss/crossentropy": 2.2365801334381104, "loss/hidden": 0.77734375, "loss/logits": 0.12756550312042236, "loss/reg": 0.014383896254003048, "step": 1498 }, { "epoch": 0.187375, "grad_norm": 2.8853700160980225, "grad_norm_var": 0.31621077264406544, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.6836471557617188, "loss/hidden": 0.8359375, "loss/logits": 0.12754273414611816, "loss/reg": 0.014376661740243435, "step": 1499 }, { "epoch": 0.1875, "grad_norm": 2.84495210647583, "grad_norm_var": 0.31246808986064234, "learning_rate": 0.0001, "loss": 1.2259, "loss/crossentropy": 2.2098467350006104, "loss/hidden": 0.9140625, "loss/logits": 0.16809730231761932, "loss/reg": 0.014369050972163677, "step": 1500 }, { "epoch": 0.187625, "grad_norm": 6.990378379821777, "grad_norm_var": 1.1530969883337083, "learning_rate": 0.0001, "loss": 1.3325, "loss/crossentropy": 2.6205296516418457, "loss/hidden": 0.98828125, "loss/logits": 0.2006484866142273, "loss/reg": 0.014361603185534477, "step": 1501 }, { "epoch": 0.18775, "grad_norm": 3.3288161754608154, "grad_norm_var": 1.1489830243296237, "learning_rate": 0.0001, "loss": 1.2901, "loss/crossentropy": 2.6651721000671387, "loss/hidden": 0.97265625, "loss/logits": 0.17393462359905243, "loss/reg": 0.014354297891259193, "step": 1502 }, { "epoch": 0.187875, "grad_norm": 2.990226984024048, "grad_norm_var": 1.1580711389422116, "learning_rate": 0.0001, "loss": 1.0356, "loss/crossentropy": 2.6010048389434814, "loss/hidden": 0.7578125, "loss/logits": 0.13436806201934814, "loss/reg": 0.014346664771437645, "step": 1503 }, { "epoch": 0.188, "grad_norm": 2.692655086517334, "grad_norm_var": 1.1810803489356902, "learning_rate": 0.0001, "loss": 1.2967, "loss/crossentropy": 2.152822971343994, "loss/hidden": 0.97265625, "loss/logits": 0.1806175410747528, "loss/reg": 0.014339223504066467, "step": 1504 }, { "epoch": 0.188125, "grad_norm": 87.7072982788086, "grad_norm_var": 445.7985967242888, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.5603487491607666, "loss/hidden": 1.2890625, "loss/logits": 0.1428820788860321, "loss/reg": 0.014331568963825703, "step": 1505 }, { "epoch": 0.18825, "grad_norm": 2.2747116088867188, "grad_norm_var": 446.72864180402945, "learning_rate": 0.0001, "loss": 0.9904, "loss/crossentropy": 2.3886001110076904, "loss/hidden": 0.73046875, "loss/logits": 0.11668366193771362, "loss/reg": 0.014323906973004341, "step": 1506 }, { "epoch": 0.188375, "grad_norm": 2.267629623413086, "grad_norm_var": 447.52923211089256, "learning_rate": 0.0001, "loss": 1.0243, "loss/crossentropy": 2.423229932785034, "loss/hidden": 0.7578125, "loss/logits": 0.12335064262151718, "loss/reg": 0.014316298067569733, "step": 1507 }, { "epoch": 0.1885, "grad_norm": 3.617304801940918, "grad_norm_var": 447.6023780363864, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.423637866973877, "loss/hidden": 0.98046875, "loss/logits": 0.21964287757873535, "loss/reg": 0.014309005811810493, "step": 1508 }, { "epoch": 0.188625, "grad_norm": 4.585550308227539, "grad_norm_var": 446.3429466687175, "learning_rate": 0.0001, "loss": 1.3114, "loss/crossentropy": 2.488982677459717, "loss/hidden": 0.984375, "loss/logits": 0.18399442732334137, "loss/reg": 0.014301459304988384, "step": 1509 }, { "epoch": 0.18875, "grad_norm": 3.1016576290130615, "grad_norm_var": 446.4900686609001, "learning_rate": 0.0001, "loss": 1.0954, "loss/crossentropy": 2.5978121757507324, "loss/hidden": 0.8046875, "loss/logits": 0.1477963924407959, "loss/reg": 0.014294042252004147, "step": 1510 }, { "epoch": 0.188875, "grad_norm": 2.6194772720336914, "grad_norm_var": 446.85530854590877, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.6663565635681152, "loss/hidden": 0.8359375, "loss/logits": 0.15372540056705475, "loss/reg": 0.014286375604569912, "step": 1511 }, { "epoch": 0.189, "grad_norm": 7.802513122558594, "grad_norm_var": 444.48216865196844, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.6862032413482666, "loss/hidden": 1.09375, "loss/logits": 0.19720105826854706, "loss/reg": 0.014278349466621876, "step": 1512 }, { "epoch": 0.189125, "grad_norm": 3.310786247253418, "grad_norm_var": 444.6026705038085, "learning_rate": 0.0001, "loss": 1.3428, "loss/crossentropy": 2.1258671283721924, "loss/hidden": 1.0234375, "loss/logits": 0.17663809657096863, "loss/reg": 0.014270216226577759, "step": 1513 }, { "epoch": 0.18925, "grad_norm": 5.729246616363525, "grad_norm_var": 442.7462351862821, "learning_rate": 0.0001, "loss": 1.0873, "loss/crossentropy": 2.9488282203674316, "loss/hidden": 0.80078125, "loss/logits": 0.1438678652048111, "loss/reg": 0.01426264550536871, "step": 1514 }, { "epoch": 0.189375, "grad_norm": 3.574831247329712, "grad_norm_var": 442.2095373355806, "learning_rate": 0.0001, "loss": 1.0687, "loss/crossentropy": 2.5907766819000244, "loss/hidden": 0.79296875, "loss/logits": 0.13321569561958313, "loss/reg": 0.014254805631935596, "step": 1515 }, { "epoch": 0.1895, "grad_norm": 3.7363853454589844, "grad_norm_var": 441.5169453192193, "learning_rate": 0.0001, "loss": 1.1045, "loss/crossentropy": 2.552393913269043, "loss/hidden": 0.828125, "loss/logits": 0.1338808685541153, "loss/reg": 0.014247224666178226, "step": 1516 }, { "epoch": 0.189625, "grad_norm": 2.745471954345703, "grad_norm_var": 443.8629711327847, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.7987189292907715, "loss/hidden": 0.84375, "loss/logits": 0.15207618474960327, "loss/reg": 0.01423969492316246, "step": 1517 }, { "epoch": 0.18975, "grad_norm": 2.548691987991333, "grad_norm_var": 444.4784529377907, "learning_rate": 0.0001, "loss": 1.1242, "loss/crossentropy": 2.413057565689087, "loss/hidden": 0.828125, "loss/logits": 0.15380065143108368, "loss/reg": 0.014231679029762745, "step": 1518 }, { "epoch": 0.189875, "grad_norm": 3.604227304458618, "grad_norm_var": 444.0238071702247, "learning_rate": 0.0001, "loss": 1.3671, "loss/crossentropy": 2.107539415359497, "loss/hidden": 1.015625, "loss/logits": 0.20924213528633118, "loss/reg": 0.014223325997591019, "step": 1519 }, { "epoch": 0.19, "grad_norm": 2.764598846435547, "grad_norm_var": 443.96487541121735, "learning_rate": 0.0001, "loss": 0.9063, "loss/crossentropy": 2.645019054412842, "loss/hidden": 0.6484375, "loss/logits": 0.11573326587677002, "loss/reg": 0.014215711504220963, "step": 1520 }, { "epoch": 0.190125, "grad_norm": 3.417343854904175, "grad_norm_var": 2.0384518833410272, "learning_rate": 0.0001, "loss": 1.1809, "loss/crossentropy": 2.0993950366973877, "loss/hidden": 0.91015625, "loss/logits": 0.1286228746175766, "loss/reg": 0.014208083041012287, "step": 1521 }, { "epoch": 0.19025, "grad_norm": 2.5339038372039795, "grad_norm_var": 1.996633160561107, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.3847148418426514, "loss/hidden": 0.890625, "loss/logits": 0.15089885890483856, "loss/reg": 0.014200469478964806, "step": 1522 }, { "epoch": 0.190375, "grad_norm": 5.412919044494629, "grad_norm_var": 2.046751372081492, "learning_rate": 0.0001, "loss": 1.2804, "loss/crossentropy": 2.647505760192871, "loss/hidden": 1.0078125, "loss/logits": 0.1306157112121582, "loss/reg": 0.014192642644047737, "step": 1523 }, { "epoch": 0.1905, "grad_norm": 3.2071800231933594, "grad_norm_var": 2.068296485893218, "learning_rate": 0.0001, "loss": 1.0836, "loss/crossentropy": 2.7921669483184814, "loss/hidden": 0.7890625, "loss/logits": 0.1526501327753067, "loss/reg": 0.01418515294790268, "step": 1524 }, { "epoch": 0.190625, "grad_norm": 3.779355049133301, "grad_norm_var": 2.0237706183651416, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.626934289932251, "loss/hidden": 0.79296875, "loss/logits": 0.1460772454738617, "loss/reg": 0.014177562668919563, "step": 1525 }, { "epoch": 0.19075, "grad_norm": 2.429072618484497, "grad_norm_var": 2.109561386098513, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.4317626953125, "loss/hidden": 0.80859375, "loss/logits": 0.14040398597717285, "loss/reg": 0.014170153997838497, "step": 1526 }, { "epoch": 0.190875, "grad_norm": 9.4876070022583, "grad_norm_var": 4.06735639009191, "learning_rate": 0.0001, "loss": 1.4633, "loss/crossentropy": 2.2017152309417725, "loss/hidden": 1.15625, "loss/logits": 0.16541144251823425, "loss/reg": 0.014162125997245312, "step": 1527 }, { "epoch": 0.191, "grad_norm": 3.5674562454223633, "grad_norm_var": 3.1147103692906586, "learning_rate": 0.0001, "loss": 1.2965, "loss/crossentropy": 2.85056734085083, "loss/hidden": 0.98828125, "loss/logits": 0.1666402667760849, "loss/reg": 0.014154158532619476, "step": 1528 }, { "epoch": 0.191125, "grad_norm": 2.6774816513061523, "grad_norm_var": 3.1866235930450406, "learning_rate": 0.0001, "loss": 1.1206, "loss/crossentropy": 2.367767095565796, "loss/hidden": 0.83984375, "loss/logits": 0.13927701115608215, "loss/reg": 0.014145908877253532, "step": 1529 }, { "epoch": 0.19125, "grad_norm": 8.835766792297363, "grad_norm_var": 4.578113572841516, "learning_rate": 0.0001, "loss": 1.3634, "loss/crossentropy": 2.98734974861145, "loss/hidden": 1.0625, "loss/logits": 0.15954606235027313, "loss/reg": 0.014136978425085545, "step": 1530 }, { "epoch": 0.191375, "grad_norm": 2.618042469024658, "grad_norm_var": 4.692138147417478, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.5602381229400635, "loss/hidden": 0.86328125, "loss/logits": 0.15742892026901245, "loss/reg": 0.014127855189144611, "step": 1531 }, { "epoch": 0.1915, "grad_norm": 2.5733799934387207, "grad_norm_var": 4.811403170073528, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.511143684387207, "loss/hidden": 0.80078125, "loss/logits": 0.15818098187446594, "loss/reg": 0.014118584804236889, "step": 1532 }, { "epoch": 0.191625, "grad_norm": 3.53314208984375, "grad_norm_var": 4.730224432732357, "learning_rate": 0.0001, "loss": 1.1633, "loss/crossentropy": 2.384641647338867, "loss/hidden": 0.88671875, "loss/logits": 0.13548484444618225, "loss/reg": 0.014110967516899109, "step": 1533 }, { "epoch": 0.19175, "grad_norm": 2.8533082008361816, "grad_norm_var": 4.679641703787122, "learning_rate": 0.0001, "loss": 1.0351, "loss/crossentropy": 2.5344479084014893, "loss/hidden": 0.76171875, "loss/logits": 0.13237014412879944, "loss/reg": 0.014102863147854805, "step": 1534 }, { "epoch": 0.191875, "grad_norm": 3.1153576374053955, "grad_norm_var": 4.717503317774359, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.35892915725708, "loss/hidden": 0.82421875, "loss/logits": 0.1481958031654358, "loss/reg": 0.014095306396484375, "step": 1535 }, { "epoch": 0.192, "grad_norm": 2.1978485584259033, "grad_norm_var": 4.825294315312353, "learning_rate": 0.0001, "loss": 1.0504, "loss/crossentropy": 1.9747709035873413, "loss/hidden": 0.7890625, "loss/logits": 0.12044215202331543, "loss/reg": 0.014087379910051823, "step": 1536 }, { "epoch": 0.192125, "grad_norm": 3.004746913909912, "grad_norm_var": 4.861933406571506, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.6981475353240967, "loss/hidden": 0.8671875, "loss/logits": 0.1577630490064621, "loss/reg": 0.014079651795327663, "step": 1537 }, { "epoch": 0.19225, "grad_norm": 2.8707988262176514, "grad_norm_var": 4.809272805310865, "learning_rate": 0.0001, "loss": 1.0948, "loss/crossentropy": 2.7565841674804688, "loss/hidden": 0.80859375, "loss/logits": 0.1454852670431137, "loss/reg": 0.01407123077660799, "step": 1538 }, { "epoch": 0.192375, "grad_norm": 3.2437527179718018, "grad_norm_var": 4.661507493305176, "learning_rate": 0.0001, "loss": 1.268, "loss/crossentropy": 2.3115715980529785, "loss/hidden": 0.953125, "loss/logits": 0.1742616444826126, "loss/reg": 0.014062810689210892, "step": 1539 }, { "epoch": 0.1925, "grad_norm": 3.202810049057007, "grad_norm_var": 4.661824760391179, "learning_rate": 0.0001, "loss": 1.1887, "loss/crossentropy": 2.351571798324585, "loss/hidden": 0.89453125, "loss/logits": 0.15359902381896973, "loss/reg": 0.014054255560040474, "step": 1540 }, { "epoch": 0.192625, "grad_norm": 2.8404643535614014, "grad_norm_var": 4.713165856200402, "learning_rate": 0.0001, "loss": 1.0457, "loss/crossentropy": 2.3900203704833984, "loss/hidden": 0.7734375, "loss/logits": 0.13183440268039703, "loss/reg": 0.014045719988644123, "step": 1541 }, { "epoch": 0.19275, "grad_norm": 2.3016951084136963, "grad_norm_var": 4.735606807223379, "learning_rate": 0.0001, "loss": 1.1063, "loss/crossentropy": 2.413910150527954, "loss/hidden": 0.82421875, "loss/logits": 0.14167185127735138, "loss/reg": 0.014036578126251698, "step": 1542 }, { "epoch": 0.192875, "grad_norm": 2.4764444828033447, "grad_norm_var": 2.381355740482232, "learning_rate": 0.0001, "loss": 1.1498, "loss/crossentropy": 2.508134603500366, "loss/hidden": 0.85546875, "loss/logits": 0.15407797694206238, "loss/reg": 0.014029039070010185, "step": 1543 }, { "epoch": 0.193, "grad_norm": 3.122025966644287, "grad_norm_var": 2.374577491531939, "learning_rate": 0.0001, "loss": 1.0871, "loss/crossentropy": 2.517812490463257, "loss/hidden": 0.81640625, "loss/logits": 0.1304890513420105, "loss/reg": 0.014020491391420364, "step": 1544 }, { "epoch": 0.193125, "grad_norm": 3.5309674739837646, "grad_norm_var": 2.358743795236999, "learning_rate": 0.0001, "loss": 1.3379, "loss/crossentropy": 2.173165798187256, "loss/hidden": 1.015625, "loss/logits": 0.18210922181606293, "loss/reg": 0.014012150466442108, "step": 1545 }, { "epoch": 0.19325, "grad_norm": 2.8506641387939453, "grad_norm_var": 0.15605408960923697, "learning_rate": 0.0001, "loss": 1.0831, "loss/crossentropy": 2.6542272567749023, "loss/hidden": 0.796875, "loss/logits": 0.14616048336029053, "loss/reg": 0.014004606753587723, "step": 1546 }, { "epoch": 0.193375, "grad_norm": 2.6514549255371094, "grad_norm_var": 0.1548857183604106, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.267397403717041, "loss/hidden": 0.953125, "loss/logits": 0.16108238697052002, "loss/reg": 0.013996096327900887, "step": 1547 }, { "epoch": 0.1935, "grad_norm": 3.0364744663238525, "grad_norm_var": 0.1482419605375867, "learning_rate": 0.0001, "loss": 1.1003, "loss/crossentropy": 2.34012770652771, "loss/hidden": 0.81640625, "loss/logits": 0.14404839277267456, "loss/reg": 0.013987723737955093, "step": 1548 }, { "epoch": 0.193625, "grad_norm": 3.133777141571045, "grad_norm_var": 0.12593383250846937, "learning_rate": 0.0001, "loss": 1.0985, "loss/crossentropy": 2.618865966796875, "loss/hidden": 0.8203125, "loss/logits": 0.1383739411830902, "loss/reg": 0.013980153016746044, "step": 1549 }, { "epoch": 0.19375, "grad_norm": 3.1407878398895264, "grad_norm_var": 0.12923131391317852, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.688640832901001, "loss/hidden": 0.8046875, "loss/logits": 0.1566149890422821, "loss/reg": 0.013972623273730278, "step": 1550 }, { "epoch": 0.193875, "grad_norm": 3.174556016921997, "grad_norm_var": 0.1309922878345463, "learning_rate": 0.0001, "loss": 1.1717, "loss/crossentropy": 2.1879935264587402, "loss/hidden": 0.875, "loss/logits": 0.15705443918704987, "loss/reg": 0.013964612036943436, "step": 1551 }, { "epoch": 0.194, "grad_norm": 3.0509793758392334, "grad_norm_var": 0.09391514491316547, "learning_rate": 0.0001, "loss": 0.9586, "loss/crossentropy": 2.3660058975219727, "loss/hidden": 0.70703125, "loss/logits": 0.11203601211309433, "loss/reg": 0.013956940732896328, "step": 1552 }, { "epoch": 0.194125, "grad_norm": 3.3187499046325684, "grad_norm_var": 0.10123814801312937, "learning_rate": 0.0001, "loss": 1.1062, "loss/crossentropy": 2.394197463989258, "loss/hidden": 0.81640625, "loss/logits": 0.150349423289299, "loss/reg": 0.013949030078947544, "step": 1553 }, { "epoch": 0.19425, "grad_norm": 2.903859853744507, "grad_norm_var": 0.10075169250019347, "learning_rate": 0.0001, "loss": 1.0472, "loss/crossentropy": 2.6103944778442383, "loss/hidden": 0.7578125, "loss/logits": 0.1500079333782196, "loss/reg": 0.013941409066319466, "step": 1554 }, { "epoch": 0.194375, "grad_norm": 3.239140272140503, "grad_norm_var": 0.1006023266548631, "learning_rate": 0.0001, "loss": 1.1446, "loss/crossentropy": 2.4417059421539307, "loss/hidden": 0.859375, "loss/logits": 0.1459202617406845, "loss/reg": 0.01393379457294941, "step": 1555 }, { "epoch": 0.1945, "grad_norm": 2.5746965408325195, "grad_norm_var": 0.10814357204767854, "learning_rate": 0.0001, "loss": 1.0554, "loss/crossentropy": 2.532374620437622, "loss/hidden": 0.76953125, "loss/logits": 0.1466185450553894, "loss/reg": 0.013926350511610508, "step": 1556 }, { "epoch": 0.194625, "grad_norm": 3.628814697265625, "grad_norm_var": 0.13450941960769924, "learning_rate": 0.0001, "loss": 1.3249, "loss/crossentropy": 2.3864905834198, "loss/hidden": 1.0234375, "loss/logits": 0.16223978996276855, "loss/reg": 0.013918645679950714, "step": 1557 }, { "epoch": 0.19475, "grad_norm": 2.513291358947754, "grad_norm_var": 0.11736836954879663, "learning_rate": 0.0001, "loss": 1.0366, "loss/crossentropy": 2.7693920135498047, "loss/hidden": 0.765625, "loss/logits": 0.1318259835243225, "loss/reg": 0.013910802081227303, "step": 1558 }, { "epoch": 0.194875, "grad_norm": 3.2579643726348877, "grad_norm_var": 0.09872798985575174, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.501122236251831, "loss/hidden": 0.77734375, "loss/logits": 0.1263553649187088, "loss/reg": 0.013903032056987286, "step": 1559 }, { "epoch": 0.195, "grad_norm": 20.420337677001953, "grad_norm_var": 18.91951441564413, "learning_rate": 0.0001, "loss": 1.3618, "loss/crossentropy": 2.46478533744812, "loss/hidden": 1.046875, "loss/logits": 0.1759488880634308, "loss/reg": 0.013895487412810326, "step": 1560 }, { "epoch": 0.195125, "grad_norm": 2.696916341781616, "grad_norm_var": 19.032016931453608, "learning_rate": 0.0001, "loss": 1.174, "loss/crossentropy": 2.2702722549438477, "loss/hidden": 0.83203125, "loss/logits": 0.2030649483203888, "loss/reg": 0.013887940905988216, "step": 1561 }, { "epoch": 0.19525, "grad_norm": 4.7133378982543945, "grad_norm_var": 18.93870030552219, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 1.9574859142303467, "loss/hidden": 1.015625, "loss/logits": 0.144018292427063, "loss/reg": 0.013880369253456593, "step": 1562 }, { "epoch": 0.195375, "grad_norm": 2.689742088317871, "grad_norm_var": 18.930805267251564, "learning_rate": 0.0001, "loss": 1.0472, "loss/crossentropy": 2.5726308822631836, "loss/hidden": 0.76953125, "loss/logits": 0.1389167606830597, "loss/reg": 0.013872841373085976, "step": 1563 }, { "epoch": 0.1955, "grad_norm": 3.9613354206085205, "grad_norm_var": 18.838524358177413, "learning_rate": 0.0001, "loss": 1.0574, "loss/crossentropy": 2.2799181938171387, "loss/hidden": 0.77734375, "loss/logits": 0.14136341214179993, "loss/reg": 0.013865368440747261, "step": 1564 }, { "epoch": 0.195625, "grad_norm": 2.84377121925354, "grad_norm_var": 18.88795320188418, "learning_rate": 0.0001, "loss": 1.2585, "loss/crossentropy": 2.306931495666504, "loss/hidden": 0.9296875, "loss/logits": 0.19025813043117523, "loss/reg": 0.01385766826570034, "step": 1565 }, { "epoch": 0.19575, "grad_norm": 3.5409600734710693, "grad_norm_var": 18.83835057402114, "learning_rate": 0.0001, "loss": 1.0235, "loss/crossentropy": 2.611679792404175, "loss/hidden": 0.734375, "loss/logits": 0.15062148869037628, "loss/reg": 0.013849391601979733, "step": 1566 }, { "epoch": 0.195875, "grad_norm": 2.4071547985076904, "grad_norm_var": 18.988576179472293, "learning_rate": 0.0001, "loss": 0.9748, "loss/crossentropy": 2.4327828884124756, "loss/hidden": 0.7265625, "loss/logits": 0.10979950428009033, "loss/reg": 0.013841108419001102, "step": 1567 }, { "epoch": 0.196, "grad_norm": 2.5137085914611816, "grad_norm_var": 19.091440757813977, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.6318089962005615, "loss/hidden": 0.75, "loss/logits": 0.12240324914455414, "loss/reg": 0.013833708129823208, "step": 1568 }, { "epoch": 0.196125, "grad_norm": 3.096564769744873, "grad_norm_var": 19.120676935364486, "learning_rate": 0.0001, "loss": 1.0895, "loss/crossentropy": 2.374494791030884, "loss/hidden": 0.796875, "loss/logits": 0.15435993671417236, "loss/reg": 0.013826224021613598, "step": 1569 }, { "epoch": 0.19625, "grad_norm": 2.169402837753296, "grad_norm_var": 19.28010469927802, "learning_rate": 0.0001, "loss": 0.9576, "loss/crossentropy": 2.391505479812622, "loss/hidden": 0.6953125, "loss/logits": 0.12412133812904358, "loss/reg": 0.013818818144500256, "step": 1570 }, { "epoch": 0.196375, "grad_norm": 3.0437140464782715, "grad_norm_var": 19.30600940844608, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.501621723175049, "loss/hidden": 0.9140625, "loss/logits": 0.18683794140815735, "loss/reg": 0.013811350800096989, "step": 1571 }, { "epoch": 0.1965, "grad_norm": 2.5897915363311768, "grad_norm_var": 19.302894385600194, "learning_rate": 0.0001, "loss": 1.035, "loss/crossentropy": 2.4975814819335938, "loss/hidden": 0.7578125, "loss/logits": 0.1391228437423706, "loss/reg": 0.013803965412080288, "step": 1572 }, { "epoch": 0.196625, "grad_norm": 2.516031503677368, "grad_norm_var": 19.454711828122623, "learning_rate": 0.0001, "loss": 0.9646, "loss/crossentropy": 2.3849472999572754, "loss/hidden": 0.703125, "loss/logits": 0.12352485954761505, "loss/reg": 0.013796493411064148, "step": 1573 }, { "epoch": 0.19675, "grad_norm": 2.0846610069274902, "grad_norm_var": 19.55464017386997, "learning_rate": 0.0001, "loss": 1.0915, "loss/crossentropy": 2.333425998687744, "loss/hidden": 0.80078125, "loss/logits": 0.15285080671310425, "loss/reg": 0.013788769952952862, "step": 1574 }, { "epoch": 0.196875, "grad_norm": 2.8610305786132812, "grad_norm_var": 19.605563364937975, "learning_rate": 0.0001, "loss": 0.9948, "loss/crossentropy": 2.2638580799102783, "loss/hidden": 0.69140625, "loss/logits": 0.16560885310173035, "loss/reg": 0.013781173154711723, "step": 1575 }, { "epoch": 0.197, "grad_norm": 2.6086127758026123, "grad_norm_var": 0.45959099378937346, "learning_rate": 0.0001, "loss": 1.1686, "loss/crossentropy": 2.385111093521118, "loss/hidden": 0.87890625, "loss/logits": 0.15191304683685303, "loss/reg": 0.01377350464463234, "step": 1576 }, { "epoch": 0.197125, "grad_norm": 4.653759002685547, "grad_norm_var": 0.6469626890922541, "learning_rate": 0.0001, "loss": 1.3708, "loss/crossentropy": 2.556640625, "loss/hidden": 1.0390625, "loss/logits": 0.1940462738275528, "loss/reg": 0.013765547424554825, "step": 1577 }, { "epoch": 0.19725, "grad_norm": 3.648402690887451, "grad_norm_var": 0.4771692938880648, "learning_rate": 0.0001, "loss": 1.0714, "loss/crossentropy": 2.9089317321777344, "loss/hidden": 0.80078125, "loss/logits": 0.13300660252571106, "loss/reg": 0.013758075423538685, "step": 1578 }, { "epoch": 0.197375, "grad_norm": 3.6522624492645264, "grad_norm_var": 0.5014419172244267, "learning_rate": 0.0001, "loss": 1.1563, "loss/crossentropy": 2.547022581100464, "loss/hidden": 0.875, "loss/logits": 0.14384308457374573, "loss/reg": 0.013750174082815647, "step": 1579 }, { "epoch": 0.1975, "grad_norm": 3.541172981262207, "grad_norm_var": 0.4592891725510337, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.342548370361328, "loss/hidden": 1.015625, "loss/logits": 0.16299855709075928, "loss/reg": 0.013742712326347828, "step": 1580 }, { "epoch": 0.197625, "grad_norm": 3.876798391342163, "grad_norm_var": 0.506438619715117, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.066164970397949, "loss/hidden": 1.2265625, "loss/logits": 0.24627353250980377, "loss/reg": 0.01373511552810669, "step": 1581 }, { "epoch": 0.19775, "grad_norm": 2.5165457725524902, "grad_norm_var": 0.5050025113667628, "learning_rate": 0.0001, "loss": 0.9853, "loss/crossentropy": 2.4709391593933105, "loss/hidden": 0.73046875, "loss/logits": 0.1175711378455162, "loss/reg": 0.013726145029067993, "step": 1582 }, { "epoch": 0.197875, "grad_norm": 2.9556968212127686, "grad_norm_var": 0.48145601689114204, "learning_rate": 0.0001, "loss": 1.0748, "loss/crossentropy": 2.4822633266448975, "loss/hidden": 0.78515625, "loss/logits": 0.1524786353111267, "loss/reg": 0.013718714937567711, "step": 1583 }, { "epoch": 0.198, "grad_norm": 3.161695957183838, "grad_norm_var": 0.46391222848667796, "learning_rate": 0.0001, "loss": 1.0736, "loss/crossentropy": 2.565737724304199, "loss/hidden": 0.8046875, "loss/logits": 0.13177385926246643, "loss/reg": 0.013710074126720428, "step": 1584 }, { "epoch": 0.198125, "grad_norm": 7.7486958503723145, "grad_norm_var": 1.8386121671978246, "learning_rate": 0.0001, "loss": 1.5153, "loss/crossentropy": 2.1480207443237305, "loss/hidden": 1.1484375, "loss/logits": 0.22985613346099854, "loss/reg": 0.013702766969799995, "step": 1585 }, { "epoch": 0.19825, "grad_norm": 3.0685408115386963, "grad_norm_var": 1.7473924169074466, "learning_rate": 0.0001, "loss": 1.1056, "loss/crossentropy": 2.5632662773132324, "loss/hidden": 0.81640625, "loss/logits": 0.15228858590126038, "loss/reg": 0.013695274479687214, "step": 1586 }, { "epoch": 0.198375, "grad_norm": 4.356388568878174, "grad_norm_var": 1.7913349785216173, "learning_rate": 0.0001, "loss": 1.2621, "loss/crossentropy": 2.306581497192383, "loss/hidden": 0.92578125, "loss/logits": 0.19942906498908997, "loss/reg": 0.013687408529222012, "step": 1587 }, { "epoch": 0.1985, "grad_norm": 2.662060260772705, "grad_norm_var": 1.7829870936881813, "learning_rate": 0.0001, "loss": 1.2003, "loss/crossentropy": 2.5029749870300293, "loss/hidden": 0.8984375, "loss/logits": 0.16509464383125305, "loss/reg": 0.013679537922143936, "step": 1588 }, { "epoch": 0.198625, "grad_norm": 4.810679912567139, "grad_norm_var": 1.8127030143600467, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.612169027328491, "loss/hidden": 1.03125, "loss/logits": 0.1903519481420517, "loss/reg": 0.01367125939577818, "step": 1589 }, { "epoch": 0.19875, "grad_norm": 2.754514217376709, "grad_norm_var": 1.7020179846119938, "learning_rate": 0.0001, "loss": 1.0334, "loss/crossentropy": 2.3626105785369873, "loss/hidden": 0.75390625, "loss/logits": 0.1428566873073578, "loss/reg": 0.013663833029568195, "step": 1590 }, { "epoch": 0.198875, "grad_norm": 3.2916183471679688, "grad_norm_var": 1.6665986976673535, "learning_rate": 0.0001, "loss": 1.2137, "loss/crossentropy": 2.2759227752685547, "loss/hidden": 0.87109375, "loss/logits": 0.20607063174247742, "loss/reg": 0.013655973598361015, "step": 1591 }, { "epoch": 0.199, "grad_norm": 2.276350975036621, "grad_norm_var": 1.7221462363283602, "learning_rate": 0.0001, "loss": 0.9194, "loss/crossentropy": 2.429999351501465, "loss/hidden": 0.671875, "loss/logits": 0.11101213842630386, "loss/reg": 0.013647787272930145, "step": 1592 }, { "epoch": 0.199125, "grad_norm": 2.7854268550872803, "grad_norm_var": 1.6992207121709098, "learning_rate": 0.0001, "loss": 1.1081, "loss/crossentropy": 2.6592743396759033, "loss/hidden": 0.81640625, "loss/logits": 0.15529003739356995, "loss/reg": 0.013640275225043297, "step": 1593 }, { "epoch": 0.19925, "grad_norm": 2.5592005252838135, "grad_norm_var": 1.7618627623489904, "learning_rate": 0.0001, "loss": 1.1146, "loss/crossentropy": 2.27114200592041, "loss/hidden": 0.8515625, "loss/logits": 0.1267244815826416, "loss/reg": 0.013632478192448616, "step": 1594 }, { "epoch": 0.199375, "grad_norm": 3.1051416397094727, "grad_norm_var": 1.7695445919247692, "learning_rate": 0.0001, "loss": 1.2779, "loss/crossentropy": 2.096179246902466, "loss/hidden": 0.96875, "loss/logits": 0.1729447841644287, "loss/reg": 0.013624493032693863, "step": 1595 }, { "epoch": 0.1995, "grad_norm": 2.740203857421875, "grad_norm_var": 1.8017103679937707, "learning_rate": 0.0001, "loss": 0.9738, "loss/crossentropy": 2.5383737087249756, "loss/hidden": 0.7109375, "loss/logits": 0.12667007744312286, "loss/reg": 0.013617145828902721, "step": 1596 }, { "epoch": 0.199625, "grad_norm": 2.787034034729004, "grad_norm_var": 1.8091027588667723, "learning_rate": 0.0001, "loss": 1.0191, "loss/crossentropy": 2.452460527420044, "loss/hidden": 0.7578125, "loss/logits": 0.12520015239715576, "loss/reg": 0.013609787449240685, "step": 1597 }, { "epoch": 0.19975, "grad_norm": 2.521711587905884, "grad_norm_var": 1.8085312337868815, "learning_rate": 0.0001, "loss": 1.1116, "loss/crossentropy": 2.4604299068450928, "loss/hidden": 0.828125, "loss/logits": 0.14743748307228088, "loss/reg": 0.013602245599031448, "step": 1598 }, { "epoch": 0.199875, "grad_norm": 2.541367530822754, "grad_norm_var": 1.840991450339924, "learning_rate": 0.0001, "loss": 1.1753, "loss/crossentropy": 2.3595387935638428, "loss/hidden": 0.88671875, "loss/logits": 0.1526535153388977, "loss/reg": 0.013594499789178371, "step": 1599 }, { "epoch": 0.2, "grad_norm": 3.170604944229126, "grad_norm_var": 1.840804608226538, "learning_rate": 0.0001, "loss": 1.1393, "loss/crossentropy": 2.679455041885376, "loss/hidden": 0.83984375, "loss/logits": 0.16357922554016113, "loss/reg": 0.01358658354729414, "step": 1600 }, { "epoch": 0.200125, "grad_norm": 3.085852861404419, "grad_norm_var": 0.44862457908304804, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.183800220489502, "loss/hidden": 0.87109375, "loss/logits": 0.15514615178108215, "loss/reg": 0.013579203747212887, "step": 1601 }, { "epoch": 0.20025, "grad_norm": 2.978541851043701, "grad_norm_var": 0.44869585537156215, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.7346818447113037, "loss/hidden": 0.8515625, "loss/logits": 0.1722511649131775, "loss/reg": 0.01357145607471466, "step": 1602 }, { "epoch": 0.200375, "grad_norm": 2.433056116104126, "grad_norm_var": 0.3388972014701608, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.400810718536377, "loss/hidden": 0.78515625, "loss/logits": 0.13697531819343567, "loss/reg": 0.013563701882958412, "step": 1603 }, { "epoch": 0.2005, "grad_norm": 2.651841640472412, "grad_norm_var": 0.3392367186207063, "learning_rate": 0.0001, "loss": 1.0811, "loss/crossentropy": 2.490342617034912, "loss/hidden": 0.78515625, "loss/logits": 0.1603870689868927, "loss/reg": 0.013556394726037979, "step": 1604 }, { "epoch": 0.200625, "grad_norm": 2.8456075191497803, "grad_norm_var": 0.08148981985714249, "learning_rate": 0.0001, "loss": 1.1882, "loss/crossentropy": 2.4219372272491455, "loss/hidden": 0.87890625, "loss/logits": 0.17379163205623627, "loss/reg": 0.0135487737134099, "step": 1605 }, { "epoch": 0.20075, "grad_norm": 2.8339900970458984, "grad_norm_var": 0.08158268879521771, "learning_rate": 0.0001, "loss": 0.9949, "loss/crossentropy": 2.611527442932129, "loss/hidden": 0.734375, "loss/logits": 0.12512920796871185, "loss/reg": 0.013541026972234249, "step": 1606 }, { "epoch": 0.200875, "grad_norm": 3.8811702728271484, "grad_norm_var": 0.14289600365006525, "learning_rate": 0.0001, "loss": 1.0861, "loss/crossentropy": 2.5142064094543457, "loss/hidden": 0.8125, "loss/logits": 0.13831937313079834, "loss/reg": 0.013533033430576324, "step": 1607 }, { "epoch": 0.201, "grad_norm": 3.6074378490448, "grad_norm_var": 0.15629189387123668, "learning_rate": 0.0001, "loss": 1.0655, "loss/crossentropy": 2.4579451084136963, "loss/hidden": 0.79296875, "loss/logits": 0.13723407685756683, "loss/reg": 0.013525336049497128, "step": 1608 }, { "epoch": 0.201125, "grad_norm": 2.8290226459503174, "grad_norm_var": 0.1556981224441491, "learning_rate": 0.0001, "loss": 1.0043, "loss/crossentropy": 2.7764294147491455, "loss/hidden": 0.73828125, "loss/logits": 0.1308579295873642, "loss/reg": 0.01351715624332428, "step": 1609 }, { "epoch": 0.20125, "grad_norm": 2.59934401512146, "grad_norm_var": 0.15391725674414733, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.5147807598114014, "loss/hidden": 0.79296875, "loss/logits": 0.13549384474754333, "loss/reg": 0.013509852811694145, "step": 1610 }, { "epoch": 0.201375, "grad_norm": 2.7972164154052734, "grad_norm_var": 0.15196475783560098, "learning_rate": 0.0001, "loss": 1.0975, "loss/crossentropy": 2.507530689239502, "loss/hidden": 0.81640625, "loss/logits": 0.14603210985660553, "loss/reg": 0.013502140529453754, "step": 1611 }, { "epoch": 0.2015, "grad_norm": 3.2658374309539795, "grad_norm_var": 0.15845418736495442, "learning_rate": 0.0001, "loss": 1.2549, "loss/crossentropy": 2.164236307144165, "loss/hidden": 0.9453125, "loss/logits": 0.17467688024044037, "loss/reg": 0.01349355187267065, "step": 1612 }, { "epoch": 0.201625, "grad_norm": 2.7505931854248047, "grad_norm_var": 0.15921652951525, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.0924651622772217, "loss/hidden": 0.80078125, "loss/logits": 0.12380584329366684, "loss/reg": 0.013486127369105816, "step": 1613 }, { "epoch": 0.20175, "grad_norm": 3.203188180923462, "grad_norm_var": 0.15163660104867607, "learning_rate": 0.0001, "loss": 1.1208, "loss/crossentropy": 2.62121319770813, "loss/hidden": 0.84765625, "loss/logits": 0.1383514702320099, "loss/reg": 0.013478783890604973, "step": 1614 }, { "epoch": 0.201875, "grad_norm": 2.6951982975006104, "grad_norm_var": 0.1443821198786272, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.364288091659546, "loss/hidden": 1.109375, "loss/logits": 0.19316649436950684, "loss/reg": 0.013470477424561977, "step": 1615 }, { "epoch": 0.202, "grad_norm": 2.867556095123291, "grad_norm_var": 0.1422903014164942, "learning_rate": 0.0001, "loss": 1.0695, "loss/crossentropy": 2.412200689315796, "loss/hidden": 0.8046875, "loss/logits": 0.1301771104335785, "loss/reg": 0.01346264686435461, "step": 1616 }, { "epoch": 0.202125, "grad_norm": 3.35479736328125, "grad_norm_var": 0.15140141291244522, "learning_rate": 0.0001, "loss": 1.1988, "loss/crossentropy": 2.4715232849121094, "loss/hidden": 0.90234375, "loss/logits": 0.16192206740379333, "loss/reg": 0.013454729691147804, "step": 1617 }, { "epoch": 0.20225, "grad_norm": 22.905237197875977, "grad_norm_var": 24.978816029077244, "learning_rate": 0.0001, "loss": 1.579, "loss/crossentropy": 2.2932474613189697, "loss/hidden": 1.21875, "loss/logits": 0.22579804062843323, "loss/reg": 0.013446752913296223, "step": 1618 }, { "epoch": 0.202375, "grad_norm": 2.676539182662964, "grad_norm_var": 24.92450698201823, "learning_rate": 0.0001, "loss": 1.0158, "loss/crossentropy": 2.443284034729004, "loss/hidden": 0.75, "loss/logits": 0.13141301274299622, "loss/reg": 0.01343945600092411, "step": 1619 }, { "epoch": 0.2025, "grad_norm": 3.8005261421203613, "grad_norm_var": 24.764457157993366, "learning_rate": 0.0001, "loss": 1.2057, "loss/crossentropy": 2.7221662998199463, "loss/hidden": 0.91015625, "loss/logits": 0.1611778438091278, "loss/reg": 0.013432328589260578, "step": 1620 }, { "epoch": 0.202625, "grad_norm": 2.675564765930176, "grad_norm_var": 24.79939933153828, "learning_rate": 0.0001, "loss": 1.0176, "loss/crossentropy": 2.7226831912994385, "loss/hidden": 0.75390625, "loss/logits": 0.12948712706565857, "loss/reg": 0.013424508273601532, "step": 1621 }, { "epoch": 0.20275, "grad_norm": 5.532473564147949, "grad_norm_var": 24.728322365826084, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.501004219055176, "loss/hidden": 1.0546875, "loss/logits": 0.20409145951271057, "loss/reg": 0.013417219743132591, "step": 1622 }, { "epoch": 0.202875, "grad_norm": 2.9599437713623047, "grad_norm_var": 24.853088500483913, "learning_rate": 0.0001, "loss": 1.047, "loss/crossentropy": 2.583350419998169, "loss/hidden": 0.7890625, "loss/logits": 0.1238275021314621, "loss/reg": 0.013409611769020557, "step": 1623 }, { "epoch": 0.203, "grad_norm": 2.609393358230591, "grad_norm_var": 25.021814610211056, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.800400495529175, "loss/hidden": 0.74609375, "loss/logits": 0.13346146047115326, "loss/reg": 0.013402425684034824, "step": 1624 }, { "epoch": 0.203125, "grad_norm": 3.3347935676574707, "grad_norm_var": 24.93556049048077, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.306166648864746, "loss/hidden": 1.0, "loss/logits": 0.1822924166917801, "loss/reg": 0.013394915498793125, "step": 1625 }, { "epoch": 0.20325, "grad_norm": 2.487501859664917, "grad_norm_var": 24.9628476598421, "learning_rate": 0.0001, "loss": 1.163, "loss/crossentropy": 2.2325022220611572, "loss/hidden": 0.8828125, "loss/logits": 0.1463523954153061, "loss/reg": 0.013387140817940235, "step": 1626 }, { "epoch": 0.203375, "grad_norm": 2.961826801300049, "grad_norm_var": 24.930026653662896, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.5841550827026367, "loss/hidden": 0.85546875, "loss/logits": 0.2090195268392563, "loss/reg": 0.013379319570958614, "step": 1627 }, { "epoch": 0.2035, "grad_norm": 2.5991106033325195, "grad_norm_var": 25.056860448715973, "learning_rate": 0.0001, "loss": 1.0763, "loss/crossentropy": 2.617375135421753, "loss/hidden": 0.796875, "loss/logits": 0.1457059681415558, "loss/reg": 0.01337137445807457, "step": 1628 }, { "epoch": 0.203625, "grad_norm": 3.130793571472168, "grad_norm_var": 24.985404162175104, "learning_rate": 0.0001, "loss": 1.4514, "loss/crossentropy": 2.28818678855896, "loss/hidden": 1.09375, "loss/logits": 0.2240520417690277, "loss/reg": 0.013363334350287914, "step": 1629 }, { "epoch": 0.20375, "grad_norm": 3.3310132026672363, "grad_norm_var": 24.96667274126064, "learning_rate": 0.0001, "loss": 1.2145, "loss/crossentropy": 2.4561591148376465, "loss/hidden": 0.8828125, "loss/logits": 0.1981697976589203, "loss/reg": 0.013356066308915615, "step": 1630 }, { "epoch": 0.203875, "grad_norm": 2.9992592334747314, "grad_norm_var": 24.904546403803987, "learning_rate": 0.0001, "loss": 1.3327, "loss/crossentropy": 2.304450035095215, "loss/hidden": 1.03125, "loss/logits": 0.16797912120819092, "loss/reg": 0.013348528183996677, "step": 1631 }, { "epoch": 0.204, "grad_norm": 3.1939244270324707, "grad_norm_var": 24.844990519484032, "learning_rate": 0.0001, "loss": 1.0774, "loss/crossentropy": 2.4736955165863037, "loss/hidden": 0.7890625, "loss/logits": 0.15497320890426636, "loss/reg": 0.013341384008526802, "step": 1632 }, { "epoch": 0.204125, "grad_norm": 2.8169233798980713, "grad_norm_var": 24.938715041703063, "learning_rate": 0.0001, "loss": 0.8683, "loss/crossentropy": 2.676666736602783, "loss/hidden": 0.62109375, "loss/logits": 0.11391356587409973, "loss/reg": 0.013333736918866634, "step": 1633 }, { "epoch": 0.20425, "grad_norm": 3.69463849067688, "grad_norm_var": 0.5429387753821554, "learning_rate": 0.0001, "loss": 1.7185, "loss/crossentropy": 1.9171524047851562, "loss/hidden": 1.3125, "loss/logits": 0.27270910143852234, "loss/reg": 0.013325790874660015, "step": 1634 }, { "epoch": 0.204375, "grad_norm": 2.5067968368530273, "grad_norm_var": 0.5560268531523621, "learning_rate": 0.0001, "loss": 0.9318, "loss/crossentropy": 2.6572604179382324, "loss/hidden": 0.6875, "loss/logits": 0.11111121624708176, "loss/reg": 0.013317680917680264, "step": 1635 }, { "epoch": 0.2045, "grad_norm": 2.7445805072784424, "grad_norm_var": 0.5361896610226305, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.34808087348938, "loss/hidden": 0.8359375, "loss/logits": 0.16526776552200317, "loss/reg": 0.013309688307344913, "step": 1636 }, { "epoch": 0.204625, "grad_norm": 2.5069262981414795, "grad_norm_var": 0.5474804142573985, "learning_rate": 0.0001, "loss": 0.9285, "loss/crossentropy": 2.678297758102417, "loss/hidden": 0.6796875, "loss/logits": 0.11579211056232452, "loss/reg": 0.013302477076649666, "step": 1637 }, { "epoch": 0.20475, "grad_norm": 3.034487247467041, "grad_norm_var": 0.12334773017239489, "learning_rate": 0.0001, "loss": 1.1895, "loss/crossentropy": 2.0104122161865234, "loss/hidden": 0.90234375, "loss/logits": 0.15417572855949402, "loss/reg": 0.013295282609760761, "step": 1638 }, { "epoch": 0.204875, "grad_norm": 2.7470128536224365, "grad_norm_var": 0.12538795384078488, "learning_rate": 0.0001, "loss": 1.0896, "loss/crossentropy": 2.2556824684143066, "loss/hidden": 0.8203125, "loss/logits": 0.13637785613536835, "loss/reg": 0.013288102112710476, "step": 1639 }, { "epoch": 0.205, "grad_norm": 2.6958811283111572, "grad_norm_var": 0.12228878695745683, "learning_rate": 0.0001, "loss": 0.9741, "loss/crossentropy": 2.2803361415863037, "loss/hidden": 0.72265625, "loss/logits": 0.11862225830554962, "loss/reg": 0.013280958868563175, "step": 1640 }, { "epoch": 0.205125, "grad_norm": 2.923374652862549, "grad_norm_var": 0.11033848957714554, "learning_rate": 0.0001, "loss": 1.1012, "loss/crossentropy": 2.5998592376708984, "loss/hidden": 0.82421875, "loss/logits": 0.14425665140151978, "loss/reg": 0.013273877091705799, "step": 1641 }, { "epoch": 0.20525, "grad_norm": 16.231281280517578, "grad_norm_var": 11.163123044335308, "learning_rate": 0.0001, "loss": 1.573, "loss/crossentropy": 2.28498911857605, "loss/hidden": 1.1875, "loss/logits": 0.25285494327545166, "loss/reg": 0.013266587629914284, "step": 1642 }, { "epoch": 0.205375, "grad_norm": 2.8149280548095703, "grad_norm_var": 11.180053543888178, "learning_rate": 0.0001, "loss": 1.1024, "loss/crossentropy": 2.148362636566162, "loss/hidden": 0.8125, "loss/logits": 0.15731269121170044, "loss/reg": 0.013259139843285084, "step": 1643 }, { "epoch": 0.2055, "grad_norm": 3.9090774059295654, "grad_norm_var": 11.086604757622807, "learning_rate": 0.0001, "loss": 1.4303, "loss/crossentropy": 2.6009087562561035, "loss/hidden": 1.0703125, "loss/logits": 0.22749975323677063, "loss/reg": 0.01325154397636652, "step": 1644 }, { "epoch": 0.205625, "grad_norm": 2.7664973735809326, "grad_norm_var": 11.128864400454058, "learning_rate": 0.0001, "loss": 0.9888, "loss/crossentropy": 2.576356887817383, "loss/hidden": 0.734375, "loss/logits": 0.12194088101387024, "loss/reg": 0.013244031928479671, "step": 1645 }, { "epoch": 0.20575, "grad_norm": 2.7838664054870605, "grad_norm_var": 11.182320606560461, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.458827018737793, "loss/hidden": 0.80078125, "loss/logits": 0.12612512707710266, "loss/reg": 0.013236268423497677, "step": 1646 }, { "epoch": 0.205875, "grad_norm": 3.0026252269744873, "grad_norm_var": 11.181974019665732, "learning_rate": 0.0001, "loss": 1.0012, "loss/crossentropy": 2.8010799884796143, "loss/hidden": 0.73828125, "loss/logits": 0.1305844783782959, "loss/reg": 0.01322907768189907, "step": 1647 }, { "epoch": 0.206, "grad_norm": 3.544288158416748, "grad_norm_var": 11.162580503190616, "learning_rate": 0.0001, "loss": 1.2099, "loss/crossentropy": 2.5440893173217773, "loss/hidden": 0.8828125, "loss/logits": 0.19490712881088257, "loss/reg": 0.013221465051174164, "step": 1648 }, { "epoch": 0.206125, "grad_norm": 3.2531473636627197, "grad_norm_var": 11.117574070283142, "learning_rate": 0.0001, "loss": 1.3131, "loss/crossentropy": 2.267230272293091, "loss/hidden": 0.98828125, "loss/logits": 0.19262857735157013, "loss/reg": 0.013214114122092724, "step": 1649 }, { "epoch": 0.20625, "grad_norm": 2.65549635887146, "grad_norm_var": 11.202772982286575, "learning_rate": 0.0001, "loss": 1.1064, "loss/crossentropy": 2.623307228088379, "loss/hidden": 0.8125, "loss/logits": 0.16180500388145447, "loss/reg": 0.013206473551690578, "step": 1650 }, { "epoch": 0.206375, "grad_norm": 3.4816501140594482, "grad_norm_var": 11.09960005034767, "learning_rate": 0.0001, "loss": 1.1543, "loss/crossentropy": 2.644336462020874, "loss/hidden": 0.84375, "loss/logits": 0.17855095863342285, "loss/reg": 0.013198580592870712, "step": 1651 }, { "epoch": 0.2065, "grad_norm": 2.8406617641448975, "grad_norm_var": 11.086419925476456, "learning_rate": 0.0001, "loss": 0.9424, "loss/crossentropy": 2.5468101501464844, "loss/hidden": 0.69921875, "loss/logits": 0.11126389354467392, "loss/reg": 0.01318978238850832, "step": 1652 }, { "epoch": 0.206625, "grad_norm": 2.7195427417755127, "grad_norm_var": 11.051894988141786, "learning_rate": 0.0001, "loss": 1.2921, "loss/crossentropy": 2.5399060249328613, "loss/hidden": 0.96484375, "loss/logits": 0.19546636939048767, "loss/reg": 0.013182584196329117, "step": 1653 }, { "epoch": 0.20675, "grad_norm": 2.4805049896240234, "grad_norm_var": 11.13040761168501, "learning_rate": 0.0001, "loss": 0.9454, "loss/crossentropy": 2.6860992908477783, "loss/hidden": 0.69921875, "loss/logits": 0.11445553600788116, "loss/reg": 0.013174121268093586, "step": 1654 }, { "epoch": 0.206875, "grad_norm": 2.9009270668029785, "grad_norm_var": 11.110214998589447, "learning_rate": 0.0001, "loss": 1.1267, "loss/crossentropy": 2.6687192916870117, "loss/hidden": 0.8359375, "loss/logits": 0.15907853841781616, "loss/reg": 0.013165648095309734, "step": 1655 }, { "epoch": 0.207, "grad_norm": 5.767265796661377, "grad_norm_var": 11.242431274213367, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.3070762157440186, "loss/hidden": 1.0234375, "loss/logits": 0.17719680070877075, "loss/reg": 0.013158419169485569, "step": 1656 }, { "epoch": 0.207125, "grad_norm": 2.9326095581054688, "grad_norm_var": 11.241105151169359, "learning_rate": 0.0001, "loss": 1.2628, "loss/crossentropy": 2.429621934890747, "loss/hidden": 0.94921875, "loss/logits": 0.18210232257843018, "loss/reg": 0.013150298967957497, "step": 1657 }, { "epoch": 0.20725, "grad_norm": 3.11118483543396, "grad_norm_var": 0.6121424659486439, "learning_rate": 0.0001, "loss": 1.0707, "loss/crossentropy": 2.6597509384155273, "loss/hidden": 0.78125, "loss/logits": 0.15797501802444458, "loss/reg": 0.013143090531229973, "step": 1658 }, { "epoch": 0.207375, "grad_norm": 3.273897886276245, "grad_norm_var": 0.6026450391734357, "learning_rate": 0.0001, "loss": 1.5868, "loss/crossentropy": 2.4786179065704346, "loss/hidden": 1.1953125, "loss/logits": 0.2601345479488373, "loss/reg": 0.013135283254086971, "step": 1659 }, { "epoch": 0.2075, "grad_norm": 2.313868761062622, "grad_norm_var": 0.6138390088935656, "learning_rate": 0.0001, "loss": 1.0272, "loss/crossentropy": 2.646815538406372, "loss/hidden": 0.7734375, "loss/logits": 0.1224367693066597, "loss/reg": 0.013128082267940044, "step": 1660 }, { "epoch": 0.207625, "grad_norm": 3.9266319274902344, "grad_norm_var": 0.6441662093447725, "learning_rate": 0.0001, "loss": 1.3995, "loss/crossentropy": 2.480496644973755, "loss/hidden": 1.0859375, "loss/logits": 0.1823263317346573, "loss/reg": 0.013120830059051514, "step": 1661 }, { "epoch": 0.20775, "grad_norm": 4.4009881019592285, "grad_norm_var": 0.7207383535866235, "learning_rate": 0.0001, "loss": 1.5143, "loss/crossentropy": 2.490128993988037, "loss/hidden": 1.15625, "loss/logits": 0.22691452503204346, "loss/reg": 0.013114040717482567, "step": 1662 }, { "epoch": 0.207875, "grad_norm": 2.803945302963257, "grad_norm_var": 0.7307607399187871, "learning_rate": 0.0001, "loss": 1.1357, "loss/crossentropy": 2.6528525352478027, "loss/hidden": 0.84375, "loss/logits": 0.1609269231557846, "loss/reg": 0.013106826692819595, "step": 1663 }, { "epoch": 0.208, "grad_norm": 3.9250094890594482, "grad_norm_var": 0.753468894736533, "learning_rate": 0.0001, "loss": 1.5158, "loss/crossentropy": 2.8934035301208496, "loss/hidden": 1.1484375, "loss/logits": 0.23635676503181458, "loss/reg": 0.013099766336381435, "step": 1664 }, { "epoch": 0.208125, "grad_norm": 2.537315607070923, "grad_norm_var": 0.789891085089917, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.4187686443328857, "loss/hidden": 0.8515625, "loss/logits": 0.15466302633285522, "loss/reg": 0.013092422857880592, "step": 1665 }, { "epoch": 0.20825, "grad_norm": 2.553142547607422, "grad_norm_var": 0.7987201352076717, "learning_rate": 0.0001, "loss": 1.0203, "loss/crossentropy": 2.137171745300293, "loss/hidden": 0.76171875, "loss/logits": 0.12768462300300598, "loss/reg": 0.013086005114018917, "step": 1666 }, { "epoch": 0.208375, "grad_norm": 2.1522605419158936, "grad_norm_var": 0.8677726892771486, "learning_rate": 0.0001, "loss": 0.9736, "loss/crossentropy": 2.4442384243011475, "loss/hidden": 0.7109375, "loss/logits": 0.13187186419963837, "loss/reg": 0.013079563155770302, "step": 1667 }, { "epoch": 0.2085, "grad_norm": 2.8897452354431152, "grad_norm_var": 0.8658007433698373, "learning_rate": 0.0001, "loss": 0.9685, "loss/crossentropy": 2.4599223136901855, "loss/hidden": 0.7109375, "loss/logits": 0.12683042883872986, "loss/reg": 0.01307291816920042, "step": 1668 }, { "epoch": 0.208625, "grad_norm": 3.2847912311553955, "grad_norm_var": 0.8519672623501406, "learning_rate": 0.0001, "loss": 0.9332, "loss/crossentropy": 2.2193775177001953, "loss/hidden": 0.6796875, "loss/logits": 0.12289424240589142, "loss/reg": 0.013066742569208145, "step": 1669 }, { "epoch": 0.20875, "grad_norm": 3.1088175773620605, "grad_norm_var": 0.8160818976558214, "learning_rate": 0.0001, "loss": 1.2098, "loss/crossentropy": 2.402890920639038, "loss/hidden": 0.93359375, "loss/logits": 0.14563772082328796, "loss/reg": 0.013059607706964016, "step": 1670 }, { "epoch": 0.208875, "grad_norm": 3.5626771450042725, "grad_norm_var": 0.8133001054088592, "learning_rate": 0.0001, "loss": 1.1257, "loss/crossentropy": 2.4826173782348633, "loss/hidden": 0.8359375, "loss/logits": 0.15928024053573608, "loss/reg": 0.013052698224782944, "step": 1671 }, { "epoch": 0.209, "grad_norm": 3.8422510623931885, "grad_norm_var": 0.40753121137815923, "learning_rate": 0.0001, "loss": 1.0944, "loss/crossentropy": 2.2613954544067383, "loss/hidden": 0.81640625, "loss/logits": 0.14750930666923523, "loss/reg": 0.013046123087406158, "step": 1672 }, { "epoch": 0.209125, "grad_norm": 3.1732969284057617, "grad_norm_var": 0.40373591532357217, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.6913723945617676, "loss/hidden": 0.84765625, "loss/logits": 0.1699780523777008, "loss/reg": 0.01303982362151146, "step": 1673 }, { "epoch": 0.20925, "grad_norm": 3.5349199771881104, "grad_norm_var": 0.41114120511671504, "learning_rate": 0.0001, "loss": 1.1601, "loss/crossentropy": 2.636218547821045, "loss/hidden": 0.859375, "loss/logits": 0.1703943908214569, "loss/reg": 0.013032618910074234, "step": 1674 }, { "epoch": 0.209375, "grad_norm": 3.3266332149505615, "grad_norm_var": 0.4117979013874437, "learning_rate": 0.0001, "loss": 1.2931, "loss/crossentropy": 2.5062716007232666, "loss/hidden": 0.96875, "loss/logits": 0.1940596103668213, "loss/reg": 0.013026205822825432, "step": 1675 }, { "epoch": 0.2095, "grad_norm": 2.690976142883301, "grad_norm_var": 0.3757021597893411, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.6262998580932617, "loss/hidden": 0.80078125, "loss/logits": 0.14850963652133942, "loss/reg": 0.013019601814448833, "step": 1676 }, { "epoch": 0.209625, "grad_norm": 2.262477397918701, "grad_norm_var": 0.3946797642913109, "learning_rate": 0.0001, "loss": 1.0682, "loss/crossentropy": 2.522778272628784, "loss/hidden": 0.7890625, "loss/logits": 0.14901478588581085, "loss/reg": 0.013013252057135105, "step": 1677 }, { "epoch": 0.20975, "grad_norm": 4.060770034790039, "grad_norm_var": 0.3441717651006473, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.75931715965271, "loss/hidden": 0.83203125, "loss/logits": 0.1445423662662506, "loss/reg": 0.0130059365183115, "step": 1678 }, { "epoch": 0.209875, "grad_norm": 3.035757064819336, "grad_norm_var": 0.3381691610716331, "learning_rate": 0.0001, "loss": 1.2199, "loss/crossentropy": 2.2356534004211426, "loss/hidden": 0.9140625, "loss/logits": 0.1757938712835312, "loss/reg": 0.012999509461224079, "step": 1679 }, { "epoch": 0.21, "grad_norm": 3.4442572593688965, "grad_norm_var": 0.3010964780836121, "learning_rate": 0.0001, "loss": 1.3513, "loss/crossentropy": 2.327789545059204, "loss/hidden": 1.03125, "loss/logits": 0.19014956057071686, "loss/reg": 0.012993012554943562, "step": 1680 }, { "epoch": 0.210125, "grad_norm": 3.582969903945923, "grad_norm_var": 0.29220290919850694, "learning_rate": 0.0001, "loss": 1.1276, "loss/crossentropy": 2.5549631118774414, "loss/hidden": 0.80859375, "loss/logits": 0.18918761610984802, "loss/reg": 0.0129857761785388, "step": 1681 }, { "epoch": 0.21025, "grad_norm": 2.9530413150787354, "grad_norm_var": 0.27002111859183725, "learning_rate": 0.0001, "loss": 1.051, "loss/crossentropy": 2.4788119792938232, "loss/hidden": 0.77734375, "loss/logits": 0.14384132623672485, "loss/reg": 0.012978510931134224, "step": 1682 }, { "epoch": 0.210375, "grad_norm": 2.191362142562866, "grad_norm_var": 0.2647501539546122, "learning_rate": 0.0001, "loss": 1.0957, "loss/crossentropy": 2.532787561416626, "loss/hidden": 0.80859375, "loss/logits": 0.15740235149860382, "loss/reg": 0.012971782125532627, "step": 1683 }, { "epoch": 0.2105, "grad_norm": 5.248608112335205, "grad_norm_var": 0.5199526136507975, "learning_rate": 0.0001, "loss": 1.3987, "loss/crossentropy": 2.741248369216919, "loss/hidden": 1.0625, "loss/logits": 0.206559956073761, "loss/reg": 0.012965119443833828, "step": 1684 }, { "epoch": 0.210625, "grad_norm": 9.923733711242676, "grad_norm_var": 3.2333504676999985, "learning_rate": 0.0001, "loss": 1.2908, "loss/crossentropy": 2.4856460094451904, "loss/hidden": 1.0078125, "loss/logits": 0.1534312665462494, "loss/reg": 0.012957965023815632, "step": 1685 }, { "epoch": 0.21075, "grad_norm": 2.7674450874328613, "grad_norm_var": 3.2696547533182656, "learning_rate": 0.0001, "loss": 1.094, "loss/crossentropy": 2.563523054122925, "loss/hidden": 0.828125, "loss/logits": 0.13640597462654114, "loss/reg": 0.012951286509633064, "step": 1686 }, { "epoch": 0.210875, "grad_norm": 2.63311505317688, "grad_norm_var": 3.3437877784147436, "learning_rate": 0.0001, "loss": 1.0298, "loss/crossentropy": 2.604543685913086, "loss/hidden": 0.76171875, "loss/logits": 0.13861128687858582, "loss/reg": 0.012944560497999191, "step": 1687 }, { "epoch": 0.211, "grad_norm": 10.589808464050293, "grad_norm_var": 6.347074021055989, "learning_rate": 0.0001, "loss": 1.3073, "loss/crossentropy": 2.7445929050445557, "loss/hidden": 1.0078125, "loss/logits": 0.1701403260231018, "loss/reg": 0.01293735858052969, "step": 1688 }, { "epoch": 0.211125, "grad_norm": 2.85196590423584, "grad_norm_var": 6.392746951466915, "learning_rate": 0.0001, "loss": 1.0699, "loss/crossentropy": 2.5519180297851562, "loss/hidden": 0.8203125, "loss/logits": 0.12032581865787506, "loss/reg": 0.012930169701576233, "step": 1689 }, { "epoch": 0.21125, "grad_norm": 3.8103320598602295, "grad_norm_var": 6.377889547085541, "learning_rate": 0.0001, "loss": 1.1586, "loss/crossentropy": 2.479715585708618, "loss/hidden": 0.87890625, "loss/logits": 0.15049707889556885, "loss/reg": 0.012922849506139755, "step": 1690 }, { "epoch": 0.211375, "grad_norm": 2.617124080657959, "grad_norm_var": 6.481173027892926, "learning_rate": 0.0001, "loss": 1.0754, "loss/crossentropy": 2.60794734954834, "loss/hidden": 0.80078125, "loss/logits": 0.14545781910419464, "loss/reg": 0.012915358878672123, "step": 1691 }, { "epoch": 0.2115, "grad_norm": 3.9162254333496094, "grad_norm_var": 6.354372430431797, "learning_rate": 0.0001, "loss": 1.1995, "loss/crossentropy": 2.4997646808624268, "loss/hidden": 0.921875, "loss/logits": 0.1485539972782135, "loss/reg": 0.012908329255878925, "step": 1692 }, { "epoch": 0.211625, "grad_norm": 2.7217462062835693, "grad_norm_var": 6.2539271325365196, "learning_rate": 0.0001, "loss": 1.1512, "loss/crossentropy": 2.617048501968384, "loss/hidden": 0.79296875, "loss/logits": 0.22927230596542358, "loss/reg": 0.012900839559733868, "step": 1693 }, { "epoch": 0.21175, "grad_norm": 2.224349021911621, "grad_norm_var": 6.485761495009887, "learning_rate": 0.0001, "loss": 0.9654, "loss/crossentropy": 2.703951358795166, "loss/hidden": 0.72265625, "loss/logits": 0.11382674425840378, "loss/reg": 0.012893814593553543, "step": 1694 }, { "epoch": 0.211875, "grad_norm": 2.9117870330810547, "grad_norm_var": 6.503189101884489, "learning_rate": 0.0001, "loss": 1.2238, "loss/crossentropy": 2.5064597129821777, "loss/hidden": 0.8984375, "loss/logits": 0.1964527666568756, "loss/reg": 0.012886927463114262, "step": 1695 }, { "epoch": 0.212, "grad_norm": 2.943073034286499, "grad_norm_var": 6.557645425580934, "learning_rate": 0.0001, "loss": 1.0268, "loss/crossentropy": 2.681680679321289, "loss/hidden": 0.75, "loss/logits": 0.14798036217689514, "loss/reg": 0.012880067341029644, "step": 1696 }, { "epoch": 0.212125, "grad_norm": 2.8364195823669434, "grad_norm_var": 6.633285254118307, "learning_rate": 0.0001, "loss": 1.2396, "loss/crossentropy": 2.267443895339966, "loss/hidden": 0.94140625, "loss/logits": 0.16948428750038147, "loss/reg": 0.012873291037976742, "step": 1697 }, { "epoch": 0.21225, "grad_norm": 2.667727470397949, "grad_norm_var": 6.676156819217446, "learning_rate": 0.0001, "loss": 1.1879, "loss/crossentropy": 2.5819482803344727, "loss/hidden": 0.8671875, "loss/logits": 0.1920512467622757, "loss/reg": 0.012866680510342121, "step": 1698 }, { "epoch": 0.212375, "grad_norm": 3.2915964126586914, "grad_norm_var": 6.496990351425703, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.5162808895111084, "loss/hidden": 0.828125, "loss/logits": 0.13395392894744873, "loss/reg": 0.012859686277806759, "step": 1699 }, { "epoch": 0.2125, "grad_norm": 2.9825503826141357, "grad_norm_var": 6.439824510110865, "learning_rate": 0.0001, "loss": 1.2178, "loss/crossentropy": 2.2996246814727783, "loss/hidden": 0.9140625, "loss/logits": 0.1752283126115799, "loss/reg": 0.01285255141556263, "step": 1700 }, { "epoch": 0.212625, "grad_norm": 2.9510700702667236, "grad_norm_var": 3.836942936103522, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.3370251655578613, "loss/hidden": 1.015625, "loss/logits": 0.1895022988319397, "loss/reg": 0.01284568477421999, "step": 1701 }, { "epoch": 0.21275, "grad_norm": 2.5308520793914795, "grad_norm_var": 3.8610195504163127, "learning_rate": 0.0001, "loss": 1.2202, "loss/crossentropy": 2.513655185699463, "loss/hidden": 0.91015625, "loss/logits": 0.18170946836471558, "loss/reg": 0.01283828355371952, "step": 1702 }, { "epoch": 0.212875, "grad_norm": 4.751686096191406, "grad_norm_var": 3.923506474684037, "learning_rate": 0.0001, "loss": 1.2352, "loss/crossentropy": 2.173074245452881, "loss/hidden": 0.953125, "loss/logits": 0.15377816557884216, "loss/reg": 0.012831181287765503, "step": 1703 }, { "epoch": 0.213, "grad_norm": 3.511111259460449, "grad_norm_var": 0.3989999503182124, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.21486759185791, "loss/hidden": 0.9765625, "loss/logits": 0.17789113521575928, "loss/reg": 0.01282414235174656, "step": 1704 }, { "epoch": 0.213125, "grad_norm": 2.6096677780151367, "grad_norm_var": 0.4105200098953181, "learning_rate": 0.0001, "loss": 1.0013, "loss/crossentropy": 2.316843271255493, "loss/hidden": 0.74609375, "loss/logits": 0.12707841396331787, "loss/reg": 0.012817141599953175, "step": 1705 }, { "epoch": 0.21325, "grad_norm": 2.8105530738830566, "grad_norm_var": 0.375613954977198, "learning_rate": 0.0001, "loss": 1.0661, "loss/crossentropy": 2.369736909866333, "loss/hidden": 0.8046875, "loss/logits": 0.13333451747894287, "loss/reg": 0.012809785082936287, "step": 1706 }, { "epoch": 0.213375, "grad_norm": 3.1986374855041504, "grad_norm_var": 0.36571755056712973, "learning_rate": 0.0001, "loss": 1.057, "loss/crossentropy": 2.320181131362915, "loss/hidden": 0.8046875, "loss/logits": 0.12431928515434265, "loss/reg": 0.012802771292626858, "step": 1707 }, { "epoch": 0.2135, "grad_norm": 2.6183996200561523, "grad_norm_var": 0.32173357495411, "learning_rate": 0.0001, "loss": 1.0431, "loss/crossentropy": 2.4588027000427246, "loss/hidden": 0.77734375, "loss/logits": 0.13782772421836853, "loss/reg": 0.01279559638351202, "step": 1708 }, { "epoch": 0.213625, "grad_norm": 2.6844642162323, "grad_norm_var": 0.3230673077730707, "learning_rate": 0.0001, "loss": 1.12, "loss/crossentropy": 2.0865495204925537, "loss/hidden": 0.8359375, "loss/logits": 0.1561318039894104, "loss/reg": 0.012788429856300354, "step": 1709 }, { "epoch": 0.21375, "grad_norm": 15.15482234954834, "grad_norm_var": 9.486914195081253, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.5281543731689453, "loss/hidden": 0.82421875, "loss/logits": 0.14299717545509338, "loss/reg": 0.012780962511897087, "step": 1710 }, { "epoch": 0.213875, "grad_norm": 3.1808626651763916, "grad_norm_var": 9.460348003251534, "learning_rate": 0.0001, "loss": 1.1502, "loss/crossentropy": 2.383219003677368, "loss/hidden": 0.89453125, "loss/logits": 0.12796291708946228, "loss/reg": 0.012773919850587845, "step": 1711 }, { "epoch": 0.214, "grad_norm": 4.163309574127197, "grad_norm_var": 9.414766565786032, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.455665111541748, "loss/hidden": 1.03125, "loss/logits": 0.21129587292671204, "loss/reg": 0.012766973115503788, "step": 1712 }, { "epoch": 0.214125, "grad_norm": 2.4835665225982666, "grad_norm_var": 9.471244857981635, "learning_rate": 0.0001, "loss": 1.0306, "loss/crossentropy": 2.1449975967407227, "loss/hidden": 0.765625, "loss/logits": 0.13736189901828766, "loss/reg": 0.012759552337229252, "step": 1713 }, { "epoch": 0.21425, "grad_norm": 3.456458806991577, "grad_norm_var": 9.385853171996628, "learning_rate": 0.0001, "loss": 1.0432, "loss/crossentropy": 2.746030330657959, "loss/hidden": 0.78125, "loss/logits": 0.1344452202320099, "loss/reg": 0.01275256834924221, "step": 1714 }, { "epoch": 0.214375, "grad_norm": 2.5408084392547607, "grad_norm_var": 9.481860031377286, "learning_rate": 0.0001, "loss": 1.1865, "loss/crossentropy": 2.4518415927886963, "loss/hidden": 0.875, "loss/logits": 0.18404340744018555, "loss/reg": 0.01274561882019043, "step": 1715 }, { "epoch": 0.2145, "grad_norm": 2.634857654571533, "grad_norm_var": 9.529713299554377, "learning_rate": 0.0001, "loss": 1.0157, "loss/crossentropy": 2.7000675201416016, "loss/hidden": 0.7421875, "loss/logits": 0.14611287415027618, "loss/reg": 0.012738562189042568, "step": 1716 }, { "epoch": 0.214625, "grad_norm": 2.735212564468384, "grad_norm_var": 9.557923964972339, "learning_rate": 0.0001, "loss": 1.1027, "loss/crossentropy": 2.5324275493621826, "loss/hidden": 0.83203125, "loss/logits": 0.14335589110851288, "loss/reg": 0.012731647118926048, "step": 1717 }, { "epoch": 0.21475, "grad_norm": 2.8685081005096436, "grad_norm_var": 9.50716521368737, "learning_rate": 0.0001, "loss": 1.0448, "loss/crossentropy": 2.559062957763672, "loss/hidden": 0.77734375, "loss/logits": 0.14023086428642273, "loss/reg": 0.012724741362035275, "step": 1718 }, { "epoch": 0.214875, "grad_norm": 2.7883400917053223, "grad_norm_var": 9.508818411578824, "learning_rate": 0.0001, "loss": 1.1709, "loss/crossentropy": 2.7375028133392334, "loss/hidden": 0.86328125, "loss/logits": 0.18043699860572815, "loss/reg": 0.012717757374048233, "step": 1719 }, { "epoch": 0.215, "grad_norm": 4.499538421630859, "grad_norm_var": 9.54301307944679, "learning_rate": 0.0001, "loss": 1.394, "loss/crossentropy": 2.3297677040100098, "loss/hidden": 1.046875, "loss/logits": 0.2200247049331665, "loss/reg": 0.012710826471447945, "step": 1720 }, { "epoch": 0.215125, "grad_norm": 2.6797218322753906, "grad_norm_var": 9.53241861946436, "learning_rate": 0.0001, "loss": 1.2027, "loss/crossentropy": 2.094529628753662, "loss/hidden": 0.92578125, "loss/logits": 0.1498618870973587, "loss/reg": 0.012703881599009037, "step": 1721 }, { "epoch": 0.21525, "grad_norm": 2.300706148147583, "grad_norm_var": 9.614644455093018, "learning_rate": 0.0001, "loss": 1.056, "loss/crossentropy": 2.341108560562134, "loss/hidden": 0.78125, "loss/logits": 0.14775308966636658, "loss/reg": 0.012696735560894012, "step": 1722 }, { "epoch": 0.215375, "grad_norm": 2.584787607192993, "grad_norm_var": 9.683262071884398, "learning_rate": 0.0001, "loss": 1.2349, "loss/crossentropy": 2.511127471923828, "loss/hidden": 0.94140625, "loss/logits": 0.16660267114639282, "loss/reg": 0.012690168805420399, "step": 1723 }, { "epoch": 0.2155, "grad_norm": 2.7051329612731934, "grad_norm_var": 9.67109810339239, "learning_rate": 0.0001, "loss": 1.0733, "loss/crossentropy": 2.2387161254882812, "loss/hidden": 0.796875, "loss/logits": 0.14958390593528748, "loss/reg": 0.012683309614658356, "step": 1724 }, { "epoch": 0.215625, "grad_norm": 2.6227893829345703, "grad_norm_var": 9.679821099755788, "learning_rate": 0.0001, "loss": 1.0231, "loss/crossentropy": 2.2900543212890625, "loss/hidden": 0.76953125, "loss/logits": 0.1268245130777359, "loss/reg": 0.012676582671701908, "step": 1725 }, { "epoch": 0.21575, "grad_norm": 3.2791683673858643, "grad_norm_var": 0.3762032236898106, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.2019684314727783, "loss/hidden": 1.0078125, "loss/logits": 0.19199709594249725, "loss/reg": 0.01266949437558651, "step": 1726 }, { "epoch": 0.215875, "grad_norm": 3.5843939781188965, "grad_norm_var": 0.3977131857555946, "learning_rate": 0.0001, "loss": 1.1419, "loss/crossentropy": 2.474095106124878, "loss/hidden": 0.8515625, "loss/logits": 0.16370782256126404, "loss/reg": 0.012662646360695362, "step": 1727 }, { "epoch": 0.216, "grad_norm": 4.510242938995361, "grad_norm_var": 0.45925816137897757, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.829896926879883, "loss/hidden": 0.8046875, "loss/logits": 0.13811606168746948, "loss/reg": 0.012655883096158504, "step": 1728 }, { "epoch": 0.216125, "grad_norm": 3.1752772331237793, "grad_norm_var": 0.4399517134814609, "learning_rate": 0.0001, "loss": 1.2447, "loss/crossentropy": 2.6229610443115234, "loss/hidden": 0.92578125, "loss/logits": 0.19241932034492493, "loss/reg": 0.012648857198655605, "step": 1729 }, { "epoch": 0.21625, "grad_norm": 3.531876802444458, "grad_norm_var": 0.44429015337232514, "learning_rate": 0.0001, "loss": 1.2492, "loss/crossentropy": 2.466808795928955, "loss/hidden": 0.91015625, "loss/logits": 0.2125817835330963, "loss/reg": 0.01264151744544506, "step": 1730 }, { "epoch": 0.216375, "grad_norm": 7.509683609008789, "grad_norm_var": 1.6400556058435427, "learning_rate": 0.0001, "loss": 1.6584, "loss/crossentropy": 2.631517171859741, "loss/hidden": 1.2890625, "loss/logits": 0.24304136633872986, "loss/reg": 0.012634233571588993, "step": 1731 }, { "epoch": 0.2165, "grad_norm": 2.4990897178649902, "grad_norm_var": 1.6546175936024687, "learning_rate": 0.0001, "loss": 1.041, "loss/crossentropy": 2.5932819843292236, "loss/hidden": 0.77734375, "loss/logits": 0.13742858171463013, "loss/reg": 0.012627250514924526, "step": 1732 }, { "epoch": 0.216625, "grad_norm": 2.389944553375244, "grad_norm_var": 1.6911601234778078, "learning_rate": 0.0001, "loss": 0.9812, "loss/crossentropy": 2.318802833557129, "loss/hidden": 0.73046875, "loss/logits": 0.12450764328241348, "loss/reg": 0.012620464898645878, "step": 1733 }, { "epoch": 0.21675, "grad_norm": 2.6159985065460205, "grad_norm_var": 1.7112070581365304, "learning_rate": 0.0001, "loss": 1.141, "loss/crossentropy": 2.716378688812256, "loss/hidden": 0.84375, "loss/logits": 0.17106443643569946, "loss/reg": 0.012613578699529171, "step": 1734 }, { "epoch": 0.216875, "grad_norm": 3.3391213417053223, "grad_norm_var": 1.6904040902109745, "learning_rate": 0.0001, "loss": 1.1009, "loss/crossentropy": 2.4364097118377686, "loss/hidden": 0.828125, "loss/logits": 0.14670366048812866, "loss/reg": 0.012606512755155563, "step": 1735 }, { "epoch": 0.217, "grad_norm": 3.596266984939575, "grad_norm_var": 1.604664018069381, "learning_rate": 0.0001, "loss": 1.2259, "loss/crossentropy": 2.486934185028076, "loss/hidden": 0.94140625, "loss/logits": 0.15848477184772491, "loss/reg": 0.012599713169038296, "step": 1736 }, { "epoch": 0.217125, "grad_norm": 2.6140332221984863, "grad_norm_var": 1.6104343887098584, "learning_rate": 0.0001, "loss": 1.0859, "loss/crossentropy": 2.3189902305603027, "loss/hidden": 0.82421875, "loss/logits": 0.13571876287460327, "loss/reg": 0.012592630460858345, "step": 1737 }, { "epoch": 0.21725, "grad_norm": 2.87418532371521, "grad_norm_var": 1.554299756346355, "learning_rate": 0.0001, "loss": 1.2125, "loss/crossentropy": 2.095885992050171, "loss/hidden": 0.91796875, "loss/logits": 0.16862446069717407, "loss/reg": 0.012585851363837719, "step": 1738 }, { "epoch": 0.217375, "grad_norm": 4.191568851470947, "grad_norm_var": 1.5539712836144928, "learning_rate": 0.0001, "loss": 1.7198, "loss/crossentropy": 2.424238681793213, "loss/hidden": 1.3046875, "loss/logits": 0.2893022298812866, "loss/reg": 0.012578519992530346, "step": 1739 }, { "epoch": 0.2175, "grad_norm": 2.666806936264038, "grad_norm_var": 1.5578179682411237, "learning_rate": 0.0001, "loss": 1.1622, "loss/crossentropy": 2.2649426460266113, "loss/hidden": 0.875, "loss/logits": 0.16145509481430054, "loss/reg": 0.01257177721709013, "step": 1740 }, { "epoch": 0.217625, "grad_norm": 2.6806042194366455, "grad_norm_var": 1.5517463474106703, "learning_rate": 0.0001, "loss": 1.0853, "loss/crossentropy": 2.4488718509674072, "loss/hidden": 0.80078125, "loss/logits": 0.15884113311767578, "loss/reg": 0.012565212324261665, "step": 1741 }, { "epoch": 0.21775, "grad_norm": 3.704366683959961, "grad_norm_var": 1.5538631925025423, "learning_rate": 0.0001, "loss": 1.2354, "loss/crossentropy": 2.6138522624969482, "loss/hidden": 0.94140625, "loss/logits": 0.16840386390686035, "loss/reg": 0.012558743357658386, "step": 1742 }, { "epoch": 0.217875, "grad_norm": 3.2077255249023438, "grad_norm_var": 1.556870797982151, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.141016721725464, "loss/hidden": 1.0390625, "loss/logits": 0.15139830112457275, "loss/reg": 0.012552441097795963, "step": 1743 }, { "epoch": 0.218, "grad_norm": 2.951503038406372, "grad_norm_var": 1.4871620619995438, "learning_rate": 0.0001, "loss": 1.0989, "loss/crossentropy": 2.5490455627441406, "loss/hidden": 0.8203125, "loss/logits": 0.1530894637107849, "loss/reg": 0.012546169571578503, "step": 1744 }, { "epoch": 0.218125, "grad_norm": 3.119734525680542, "grad_norm_var": 1.488624773240887, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.4211771488189697, "loss/hidden": 0.81640625, "loss/logits": 0.14257250726222992, "loss/reg": 0.012539266608655453, "step": 1745 }, { "epoch": 0.21825, "grad_norm": 3.663892984390259, "grad_norm_var": 1.4930337180673443, "learning_rate": 0.0001, "loss": 1.0072, "loss/crossentropy": 2.431640148162842, "loss/hidden": 0.76953125, "loss/logits": 0.11231502890586853, "loss/reg": 0.012532477267086506, "step": 1746 }, { "epoch": 0.218375, "grad_norm": 2.77474045753479, "grad_norm_var": 0.26911648905078756, "learning_rate": 0.0001, "loss": 1.0839, "loss/crossentropy": 2.7117726802825928, "loss/hidden": 0.80859375, "loss/logits": 0.15006102621555328, "loss/reg": 0.012525715865194798, "step": 1747 }, { "epoch": 0.2185, "grad_norm": 2.6455776691436768, "grad_norm_var": 0.25958807313815746, "learning_rate": 0.0001, "loss": 1.0041, "loss/crossentropy": 2.7506678104400635, "loss/hidden": 0.74609375, "loss/logits": 0.1328078657388687, "loss/reg": 0.012519202195107937, "step": 1748 }, { "epoch": 0.218625, "grad_norm": 24.219947814941406, "grad_norm_var": 28.07975632569756, "learning_rate": 0.0001, "loss": 1.4577, "loss/crossentropy": 2.543936014175415, "loss/hidden": 1.203125, "loss/logits": 0.12944281101226807, "loss/reg": 0.01251268945634365, "step": 1749 }, { "epoch": 0.21875, "grad_norm": 37.90777587890625, "grad_norm_var": 97.39228721131126, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.2267796993255615, "loss/hidden": 0.8515625, "loss/logits": 0.1357845962047577, "loss/reg": 0.012506171129643917, "step": 1750 }, { "epoch": 0.218875, "grad_norm": 3.590038537979126, "grad_norm_var": 97.28596098453178, "learning_rate": 0.0001, "loss": 1.0614, "loss/crossentropy": 2.42260479927063, "loss/hidden": 0.8046875, "loss/logits": 0.1316903978586197, "loss/reg": 0.012499329634010792, "step": 1751 }, { "epoch": 0.219, "grad_norm": 2.845022439956665, "grad_norm_var": 97.6271689383777, "learning_rate": 0.0001, "loss": 1.092, "loss/crossentropy": 2.804971218109131, "loss/hidden": 0.82421875, "loss/logits": 0.14280739426612854, "loss/reg": 0.012492484413087368, "step": 1752 }, { "epoch": 0.219125, "grad_norm": 3.189939260482788, "grad_norm_var": 97.34154979157398, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.448812484741211, "loss/hidden": 0.875, "loss/logits": 0.14971600472927094, "loss/reg": 0.012485613115131855, "step": 1753 }, { "epoch": 0.21925, "grad_norm": 2.670297145843506, "grad_norm_var": 97.44651079060357, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.4728586673736572, "loss/hidden": 0.87890625, "loss/logits": 0.1877635270357132, "loss/reg": 0.012478794902563095, "step": 1754 }, { "epoch": 0.219375, "grad_norm": 3.3437023162841797, "grad_norm_var": 97.76674601970879, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.3619091510772705, "loss/hidden": 0.87890625, "loss/logits": 0.1601634919643402, "loss/reg": 0.012471921741962433, "step": 1755 }, { "epoch": 0.2195, "grad_norm": 3.5836129188537598, "grad_norm_var": 97.34167870831958, "learning_rate": 0.0001, "loss": 1.3367, "loss/crossentropy": 2.671821355819702, "loss/hidden": 1.03125, "loss/logits": 0.1808364987373352, "loss/reg": 0.012465192936360836, "step": 1756 }, { "epoch": 0.219625, "grad_norm": 2.6130692958831787, "grad_norm_var": 97.37753712214686, "learning_rate": 0.0001, "loss": 0.9876, "loss/crossentropy": 2.5102763175964355, "loss/hidden": 0.73828125, "loss/logits": 0.12473314255475998, "loss/reg": 0.01245811302214861, "step": 1757 }, { "epoch": 0.21975, "grad_norm": 3.395615339279175, "grad_norm_var": 97.50380796911385, "learning_rate": 0.0001, "loss": 1.2064, "loss/crossentropy": 2.485377073287964, "loss/hidden": 0.9140625, "loss/logits": 0.16786439716815948, "loss/reg": 0.01245111133903265, "step": 1758 }, { "epoch": 0.219875, "grad_norm": 4.825221061706543, "grad_norm_var": 96.93408061336578, "learning_rate": 0.0001, "loss": 1.0737, "loss/crossentropy": 2.5381832122802734, "loss/hidden": 0.8125, "loss/logits": 0.13672339916229248, "loss/reg": 0.012444333173334599, "step": 1759 }, { "epoch": 0.22, "grad_norm": 2.695854663848877, "grad_norm_var": 97.06623592058641, "learning_rate": 0.0001, "loss": 1.2057, "loss/crossentropy": 2.5472490787506104, "loss/hidden": 0.90625, "loss/logits": 0.17507830262184143, "loss/reg": 0.012437481433153152, "step": 1760 }, { "epoch": 0.220125, "grad_norm": 2.619779109954834, "grad_norm_var": 97.32003810205347, "learning_rate": 0.0001, "loss": 1.0803, "loss/crossentropy": 2.5513741970062256, "loss/hidden": 0.81640625, "loss/logits": 0.13957199454307556, "loss/reg": 0.012430677190423012, "step": 1761 }, { "epoch": 0.22025, "grad_norm": 2.6490063667297363, "grad_norm_var": 97.79004434756365, "learning_rate": 0.0001, "loss": 1.1991, "loss/crossentropy": 2.408766031265259, "loss/hidden": 0.921875, "loss/logits": 0.15302732586860657, "loss/reg": 0.012424097396433353, "step": 1762 }, { "epoch": 0.220375, "grad_norm": 3.600940465927124, "grad_norm_var": 97.41152871154476, "learning_rate": 0.0001, "loss": 1.0448, "loss/crossentropy": 2.567258358001709, "loss/hidden": 0.79296875, "loss/logits": 0.12761420011520386, "loss/reg": 0.012417309917509556, "step": 1763 }, { "epoch": 0.2205, "grad_norm": 2.783167600631714, "grad_norm_var": 97.33925474643891, "learning_rate": 0.0001, "loss": 1.2647, "loss/crossentropy": 2.4639530181884766, "loss/hidden": 0.98828125, "loss/logits": 0.15236002206802368, "loss/reg": 0.012410394847393036, "step": 1764 }, { "epoch": 0.220625, "grad_norm": 2.8482487201690674, "grad_norm_var": 75.84316673308427, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.614351511001587, "loss/hidden": 0.84765625, "loss/logits": 0.14418500661849976, "loss/reg": 0.012403246946632862, "step": 1765 }, { "epoch": 0.22075, "grad_norm": 4.04698371887207, "grad_norm_var": 0.388008374816809, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.6768503189086914, "loss/hidden": 1.0, "loss/logits": 0.17873653769493103, "loss/reg": 0.012396049685776234, "step": 1766 }, { "epoch": 0.220875, "grad_norm": 16.276508331298828, "grad_norm_var": 11.096302421984861, "learning_rate": 0.0001, "loss": 1.1526, "loss/crossentropy": 2.6857051849365234, "loss/hidden": 0.87890625, "loss/logits": 0.14977185428142548, "loss/reg": 0.01238931342959404, "step": 1767 }, { "epoch": 0.221, "grad_norm": 3.4267630577087402, "grad_norm_var": 11.027930664929489, "learning_rate": 0.0001, "loss": 1.0696, "loss/crossentropy": 2.5231096744537354, "loss/hidden": 0.80078125, "loss/logits": 0.1450020670890808, "loss/reg": 0.01238264236599207, "step": 1768 }, { "epoch": 0.221125, "grad_norm": 2.5508995056152344, "grad_norm_var": 11.12550393762463, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.5338897705078125, "loss/hidden": 0.8125, "loss/logits": 0.1652265191078186, "loss/reg": 0.012375940568745136, "step": 1769 }, { "epoch": 0.22125, "grad_norm": 2.664597511291504, "grad_norm_var": 11.126513136887851, "learning_rate": 0.0001, "loss": 1.0946, "loss/crossentropy": 2.4849798679351807, "loss/hidden": 0.80859375, "loss/logits": 0.16228261590003967, "loss/reg": 0.012369180098176003, "step": 1770 }, { "epoch": 0.221375, "grad_norm": 4.765702247619629, "grad_norm_var": 11.129360295504242, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.440256357192993, "loss/hidden": 0.85546875, "loss/logits": 0.14826975762844086, "loss/reg": 0.01236215140670538, "step": 1771 }, { "epoch": 0.2215, "grad_norm": 3.133784294128418, "grad_norm_var": 11.172026082855703, "learning_rate": 0.0001, "loss": 1.0617, "loss/crossentropy": 2.7009589672088623, "loss/hidden": 0.7890625, "loss/logits": 0.14906959235668182, "loss/reg": 0.012355109676718712, "step": 1772 }, { "epoch": 0.221625, "grad_norm": 3.329200506210327, "grad_norm_var": 11.066300955859553, "learning_rate": 0.0001, "loss": 1.064, "loss/crossentropy": 2.5220742225646973, "loss/hidden": 0.8125, "loss/logits": 0.12804913520812988, "loss/reg": 0.01234830915927887, "step": 1773 }, { "epoch": 0.22175, "grad_norm": 4.502491474151611, "grad_norm_var": 11.038805635564632, "learning_rate": 0.0001, "loss": 1.1216, "loss/crossentropy": 2.4427030086517334, "loss/hidden": 0.8515625, "loss/logits": 0.14662238955497742, "loss/reg": 0.012341534718871117, "step": 1774 }, { "epoch": 0.221875, "grad_norm": 4.261502265930176, "grad_norm_var": 11.009414759143056, "learning_rate": 0.0001, "loss": 1.2303, "loss/crossentropy": 2.463496208190918, "loss/hidden": 0.921875, "loss/logits": 0.18511801958084106, "loss/reg": 0.012334545142948627, "step": 1775 }, { "epoch": 0.222, "grad_norm": 2.7611560821533203, "grad_norm_var": 10.997153332448207, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.643156051635742, "loss/hidden": 0.80859375, "loss/logits": 0.17114627361297607, "loss/reg": 0.012327339500188828, "step": 1776 }, { "epoch": 0.222125, "grad_norm": 2.4437193870544434, "grad_norm_var": 11.034748998650539, "learning_rate": 0.0001, "loss": 1.0003, "loss/crossentropy": 2.910154342651367, "loss/hidden": 0.73828125, "loss/logits": 0.13876628875732422, "loss/reg": 0.012320267036557198, "step": 1777 }, { "epoch": 0.22225, "grad_norm": 4.437688827514648, "grad_norm_var": 10.88203350793716, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.2305355072021484, "loss/hidden": 1.0, "loss/logits": 0.1865069717168808, "loss/reg": 0.012313010171055794, "step": 1778 }, { "epoch": 0.222375, "grad_norm": 3.549940586090088, "grad_norm_var": 10.886538839719051, "learning_rate": 0.0001, "loss": 1.2606, "loss/crossentropy": 2.4338810443878174, "loss/hidden": 0.9609375, "loss/logits": 0.1766429841518402, "loss/reg": 0.012306897900998592, "step": 1779 }, { "epoch": 0.2225, "grad_norm": 3.347196578979492, "grad_norm_var": 10.797133407006523, "learning_rate": 0.0001, "loss": 1.1304, "loss/crossentropy": 2.835353374481201, "loss/hidden": 0.84765625, "loss/logits": 0.1597260981798172, "loss/reg": 0.012300064787268639, "step": 1780 }, { "epoch": 0.222625, "grad_norm": 3.352013111114502, "grad_norm_var": 10.717386787566932, "learning_rate": 0.0001, "loss": 0.9526, "loss/crossentropy": 2.4358811378479004, "loss/hidden": 0.7265625, "loss/logits": 0.10308530926704407, "loss/reg": 0.012293456122279167, "step": 1781 }, { "epoch": 0.22275, "grad_norm": 3.733184576034546, "grad_norm_var": 10.73425846404735, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.7393431663513184, "loss/hidden": 0.796875, "loss/logits": 0.139676034450531, "loss/reg": 0.012287287972867489, "step": 1782 }, { "epoch": 0.222875, "grad_norm": 2.7327675819396973, "grad_norm_var": 0.5414954532598804, "learning_rate": 0.0001, "loss": 0.9985, "loss/crossentropy": 2.4664580821990967, "loss/hidden": 0.75, "loss/logits": 0.1256864368915558, "loss/reg": 0.012280437164008617, "step": 1783 }, { "epoch": 0.223, "grad_norm": 2.7125771045684814, "grad_norm_var": 0.5743527285802199, "learning_rate": 0.0001, "loss": 1.0003, "loss/crossentropy": 2.3793299198150635, "loss/hidden": 0.75, "loss/logits": 0.12756191194057465, "loss/reg": 0.012273788452148438, "step": 1784 }, { "epoch": 0.223125, "grad_norm": 4.580681800842285, "grad_norm_var": 0.6041116655885861, "learning_rate": 0.0001, "loss": 1.5202, "loss/crossentropy": 1.8176430463790894, "loss/hidden": 1.171875, "loss/logits": 0.22560694813728333, "loss/reg": 0.012266921810805798, "step": 1785 }, { "epoch": 0.22325, "grad_norm": 3.310520887374878, "grad_norm_var": 0.5565812947330983, "learning_rate": 0.0001, "loss": 1.0981, "loss/crossentropy": 2.3666412830352783, "loss/hidden": 0.8359375, "loss/logits": 0.13953973352909088, "loss/reg": 0.012260017916560173, "step": 1786 }, { "epoch": 0.223375, "grad_norm": 3.3059277534484863, "grad_norm_var": 0.45501991794078456, "learning_rate": 0.0001, "loss": 1.198, "loss/crossentropy": 2.3159430027008057, "loss/hidden": 0.90625, "loss/logits": 0.16920451819896698, "loss/reg": 0.012253600172698498, "step": 1787 }, { "epoch": 0.2235, "grad_norm": 2.687586784362793, "grad_norm_var": 0.4873702987343137, "learning_rate": 0.0001, "loss": 1.0853, "loss/crossentropy": 2.2798428535461426, "loss/hidden": 0.8359375, "loss/logits": 0.12685424089431763, "loss/reg": 0.012246805243194103, "step": 1788 }, { "epoch": 0.223625, "grad_norm": 2.5434353351593018, "grad_norm_var": 0.5376211993019903, "learning_rate": 0.0001, "loss": 0.9544, "loss/crossentropy": 2.6347262859344482, "loss/hidden": 0.703125, "loss/logits": 0.12888537347316742, "loss/reg": 0.012239991687238216, "step": 1789 }, { "epoch": 0.22375, "grad_norm": 2.1676807403564453, "grad_norm_var": 0.5324380567161892, "learning_rate": 0.0001, "loss": 1.0395, "loss/crossentropy": 2.4155385494232178, "loss/hidden": 0.78125, "loss/logits": 0.13590013980865479, "loss/reg": 0.012233071029186249, "step": 1790 }, { "epoch": 0.223875, "grad_norm": 2.7618322372436523, "grad_norm_var": 0.4698401846206546, "learning_rate": 0.0001, "loss": 0.9624, "loss/crossentropy": 2.396468162536621, "loss/hidden": 0.7109375, "loss/logits": 0.12916944921016693, "loss/reg": 0.012226960621774197, "step": 1791 }, { "epoch": 0.224, "grad_norm": 2.605301856994629, "grad_norm_var": 0.4794749872916848, "learning_rate": 0.0001, "loss": 1.0749, "loss/crossentropy": 2.55946946144104, "loss/hidden": 0.80859375, "loss/logits": 0.1441141664981842, "loss/reg": 0.012221286073327065, "step": 1792 }, { "epoch": 0.224125, "grad_norm": 2.7405450344085693, "grad_norm_var": 0.4573457631061404, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.6217191219329834, "loss/hidden": 0.91015625, "loss/logits": 0.22033345699310303, "loss/reg": 0.012214362621307373, "step": 1793 }, { "epoch": 0.22425, "grad_norm": 4.141777515411377, "grad_norm_var": 0.412429371225325, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 1.6800086498260498, "loss/hidden": 1.109375, "loss/logits": 0.14288680255413055, "loss/reg": 0.012208379805088043, "step": 1794 }, { "epoch": 0.224375, "grad_norm": 3.810920476913452, "grad_norm_var": 0.43087940783878576, "learning_rate": 0.0001, "loss": 1.0222, "loss/crossentropy": 2.799727201461792, "loss/hidden": 0.7734375, "loss/logits": 0.12675243616104126, "loss/reg": 0.012202396057546139, "step": 1795 }, { "epoch": 0.2245, "grad_norm": 5.86505126953125, "grad_norm_var": 0.8904950250011618, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.3400473594665527, "loss/hidden": 1.109375, "loss/logits": 0.16792380809783936, "loss/reg": 0.012196486815810204, "step": 1796 }, { "epoch": 0.224625, "grad_norm": 4.985130310058594, "grad_norm_var": 1.0650859328511084, "learning_rate": 0.0001, "loss": 1.2963, "loss/crossentropy": 2.8352878093719482, "loss/hidden": 0.9609375, "loss/logits": 0.21343833208084106, "loss/reg": 0.012190500274300575, "step": 1797 }, { "epoch": 0.22475, "grad_norm": 3.3390769958496094, "grad_norm_var": 1.0582211532143861, "learning_rate": 0.0001, "loss": 1.2406, "loss/crossentropy": 2.443668842315674, "loss/hidden": 0.94921875, "loss/logits": 0.16958631575107574, "loss/reg": 0.012183710001409054, "step": 1798 }, { "epoch": 0.224875, "grad_norm": 6.64145040512085, "grad_norm_var": 1.6689068782404564, "learning_rate": 0.0001, "loss": 1.0831, "loss/crossentropy": 2.9244539737701416, "loss/hidden": 0.81640625, "loss/logits": 0.1449125111103058, "loss/reg": 0.012176886200904846, "step": 1799 }, { "epoch": 0.225, "grad_norm": 5.070358753204346, "grad_norm_var": 1.7255938342974626, "learning_rate": 0.0001, "loss": 1.6577, "loss/crossentropy": 2.39843487739563, "loss/hidden": 1.296875, "loss/logits": 0.23911422491073608, "loss/reg": 0.012170514091849327, "step": 1800 }, { "epoch": 0.225125, "grad_norm": 4.553746223449707, "grad_norm_var": 1.7227809488467594, "learning_rate": 0.0001, "loss": 1.2558, "loss/crossentropy": 2.6131913661956787, "loss/hidden": 0.953125, "loss/logits": 0.181059330701828, "loss/reg": 0.012163708917796612, "step": 1801 }, { "epoch": 0.22525, "grad_norm": 4.960707187652588, "grad_norm_var": 1.7889862701755266, "learning_rate": 0.0001, "loss": 1.1394, "loss/crossentropy": 2.8065645694732666, "loss/hidden": 0.87890625, "loss/logits": 0.13889148831367493, "loss/reg": 0.012157038785517216, "step": 1802 }, { "epoch": 0.225375, "grad_norm": 3.1211495399475098, "grad_norm_var": 1.8054184757113145, "learning_rate": 0.0001, "loss": 1.1695, "loss/crossentropy": 2.45322585105896, "loss/hidden": 0.8828125, "loss/logits": 0.16519255936145782, "loss/reg": 0.012150485999882221, "step": 1803 }, { "epoch": 0.2255, "grad_norm": 3.1585381031036377, "grad_norm_var": 1.74473550652442, "learning_rate": 0.0001, "loss": 1.1105, "loss/crossentropy": 2.432692050933838, "loss/hidden": 0.85546875, "loss/logits": 0.1336173415184021, "loss/reg": 0.012143667787313461, "step": 1804 }, { "epoch": 0.225625, "grad_norm": 3.253127098083496, "grad_norm_var": 1.647454221879684, "learning_rate": 0.0001, "loss": 1.1977, "loss/crossentropy": 2.515928268432617, "loss/hidden": 0.9140625, "loss/logits": 0.1622786521911621, "loss/reg": 0.012136753648519516, "step": 1805 }, { "epoch": 0.22575, "grad_norm": 2.5755836963653564, "grad_norm_var": 1.560998409452076, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.3887364864349365, "loss/hidden": 0.8125, "loss/logits": 0.14443060755729675, "loss/reg": 0.01213008351624012, "step": 1806 }, { "epoch": 0.225875, "grad_norm": 3.9604837894439697, "grad_norm_var": 1.4570643895132678, "learning_rate": 0.0001, "loss": 1.3124, "loss/crossentropy": 2.5280768871307373, "loss/hidden": 0.96484375, "loss/logits": 0.2262798249721527, "loss/reg": 0.012123593129217625, "step": 1807 }, { "epoch": 0.226, "grad_norm": 3.0762956142425537, "grad_norm_var": 1.380270170821843, "learning_rate": 0.0001, "loss": 1.0253, "loss/crossentropy": 2.928957939147949, "loss/hidden": 0.76953125, "loss/logits": 0.13464616239070892, "loss/reg": 0.012116815894842148, "step": 1808 }, { "epoch": 0.226125, "grad_norm": 3.482069730758667, "grad_norm_var": 1.2823655143187342, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.523679733276367, "loss/hidden": 0.99609375, "loss/logits": 0.18347124755382538, "loss/reg": 0.012109990231692791, "step": 1809 }, { "epoch": 0.22625, "grad_norm": 2.7978577613830566, "grad_norm_var": 1.3921909123204605, "learning_rate": 0.0001, "loss": 1.1764, "loss/crossentropy": 2.5396063327789307, "loss/hidden": 0.8984375, "loss/logits": 0.15695957839488983, "loss/reg": 0.012103389948606491, "step": 1810 }, { "epoch": 0.226375, "grad_norm": 3.3859915733337402, "grad_norm_var": 1.4164960881741988, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.421473264694214, "loss/hidden": 0.890625, "loss/logits": 0.1710580289363861, "loss/reg": 0.01209702342748642, "step": 1811 }, { "epoch": 0.2265, "grad_norm": 2.7233433723449707, "grad_norm_var": 1.2580651775254919, "learning_rate": 0.0001, "loss": 1.1054, "loss/crossentropy": 2.460507392883301, "loss/hidden": 0.8359375, "loss/logits": 0.14855897426605225, "loss/reg": 0.012090392410755157, "step": 1812 }, { "epoch": 0.226625, "grad_norm": 2.803961992263794, "grad_norm_var": 1.2159247798178134, "learning_rate": 0.0001, "loss": 1.0841, "loss/crossentropy": 2.4394664764404297, "loss/hidden": 0.83203125, "loss/logits": 0.13119429349899292, "loss/reg": 0.012084128335118294, "step": 1813 }, { "epoch": 0.22675, "grad_norm": 2.345188856124878, "grad_norm_var": 1.323038348759911, "learning_rate": 0.0001, "loss": 1.0238, "loss/crossentropy": 2.4309263229370117, "loss/hidden": 0.76953125, "loss/logits": 0.13351188600063324, "loss/reg": 0.01207789871841669, "step": 1814 }, { "epoch": 0.226875, "grad_norm": 2.409219741821289, "grad_norm_var": 0.7371698535500113, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6133697032928467, "loss/hidden": 0.82421875, "loss/logits": 0.14128842949867249, "loss/reg": 0.012071667239069939, "step": 1815 }, { "epoch": 0.227, "grad_norm": 2.506357192993164, "grad_norm_var": 0.5615762297781826, "learning_rate": 0.0001, "loss": 1.1755, "loss/crossentropy": 2.237166166305542, "loss/hidden": 0.875, "loss/logits": 0.17987734079360962, "loss/reg": 0.01206506323069334, "step": 1816 }, { "epoch": 0.227125, "grad_norm": 3.0118813514709473, "grad_norm_var": 0.43074473519190304, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.306960344314575, "loss/hidden": 0.97265625, "loss/logits": 0.18007943034172058, "loss/reg": 0.012058539316058159, "step": 1817 }, { "epoch": 0.22725, "grad_norm": 2.3795273303985596, "grad_norm_var": 0.20616682699356942, "learning_rate": 0.0001, "loss": 1.1529, "loss/crossentropy": 2.451083183288574, "loss/hidden": 0.8828125, "loss/logits": 0.1495814025402069, "loss/reg": 0.012052006088197231, "step": 1818 }, { "epoch": 0.227375, "grad_norm": 2.332099676132202, "grad_norm_var": 0.22569619304637148, "learning_rate": 0.0001, "loss": 1.0095, "loss/crossentropy": 2.538863182067871, "loss/hidden": 0.765625, "loss/logits": 0.1233830600976944, "loss/reg": 0.012045500800013542, "step": 1819 }, { "epoch": 0.2275, "grad_norm": 2.9021058082580566, "grad_norm_var": 0.22054224463718033, "learning_rate": 0.0001, "loss": 1.4194, "loss/crossentropy": 2.32419753074646, "loss/hidden": 1.1171875, "loss/logits": 0.18185210227966309, "loss/reg": 0.012038588523864746, "step": 1820 }, { "epoch": 0.227625, "grad_norm": 2.590693235397339, "grad_norm_var": 0.21426742260109843, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.387709856033325, "loss/hidden": 0.82421875, "loss/logits": 0.14921152591705322, "loss/reg": 0.012031761929392815, "step": 1821 }, { "epoch": 0.22775, "grad_norm": 3.300173044204712, "grad_norm_var": 0.22248606839011675, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.198507308959961, "loss/hidden": 0.921875, "loss/logits": 0.2317131608724594, "loss/reg": 0.012025104835629463, "step": 1822 }, { "epoch": 0.227875, "grad_norm": 2.4631481170654297, "grad_norm_var": 0.14599126890724012, "learning_rate": 0.0001, "loss": 1.0501, "loss/crossentropy": 2.370026111602783, "loss/hidden": 0.7890625, "loss/logits": 0.14089564979076385, "loss/reg": 0.012018561363220215, "step": 1823 }, { "epoch": 0.228, "grad_norm": 3.0455853939056396, "grad_norm_var": 0.14484462892754285, "learning_rate": 0.0001, "loss": 1.1923, "loss/crossentropy": 2.4721148014068604, "loss/hidden": 0.890625, "loss/logits": 0.18152591586112976, "loss/reg": 0.01201186515390873, "step": 1824 }, { "epoch": 0.228125, "grad_norm": 3.3358211517333984, "grad_norm_var": 0.13249022272456326, "learning_rate": 0.0001, "loss": 1.1468, "loss/crossentropy": 2.6852481365203857, "loss/hidden": 0.86328125, "loss/logits": 0.16346678137779236, "loss/reg": 0.012004739604890347, "step": 1825 }, { "epoch": 0.22825, "grad_norm": 2.6543102264404297, "grad_norm_var": 0.13326040062104477, "learning_rate": 0.0001, "loss": 1.034, "loss/crossentropy": 2.3123793601989746, "loss/hidden": 0.7734375, "loss/logits": 0.14057926833629608, "loss/reg": 0.011998065747320652, "step": 1826 }, { "epoch": 0.228375, "grad_norm": 2.4901700019836426, "grad_norm_var": 0.10886572110197955, "learning_rate": 0.0001, "loss": 1.0816, "loss/crossentropy": 2.3769681453704834, "loss/hidden": 0.80859375, "loss/logits": 0.1531093716621399, "loss/reg": 0.011991064064204693, "step": 1827 }, { "epoch": 0.2285, "grad_norm": 2.7322981357574463, "grad_norm_var": 0.10889162038143449, "learning_rate": 0.0001, "loss": 1.2769, "loss/crossentropy": 2.432474136352539, "loss/hidden": 0.96484375, "loss/logits": 0.19221791625022888, "loss/reg": 0.011984668672084808, "step": 1828 }, { "epoch": 0.228625, "grad_norm": 3.040367603302002, "grad_norm_var": 0.11545954489478329, "learning_rate": 0.0001, "loss": 1.1185, "loss/crossentropy": 2.5954103469848633, "loss/hidden": 0.828125, "loss/logits": 0.17058409750461578, "loss/reg": 0.011977394111454487, "step": 1829 }, { "epoch": 0.22875, "grad_norm": 4.54348611831665, "grad_norm_var": 0.30728487463357607, "learning_rate": 0.0001, "loss": 1.4038, "loss/crossentropy": 2.3670899868011475, "loss/hidden": 1.1171875, "loss/logits": 0.16688917577266693, "loss/reg": 0.011970371939241886, "step": 1830 }, { "epoch": 0.228875, "grad_norm": 2.4918017387390137, "grad_norm_var": 0.3027632602969258, "learning_rate": 0.0001, "loss": 1.1499, "loss/crossentropy": 2.4433906078338623, "loss/hidden": 0.859375, "loss/logits": 0.17090028524398804, "loss/reg": 0.011963790282607079, "step": 1831 }, { "epoch": 0.229, "grad_norm": 2.9950368404388428, "grad_norm_var": 0.2944027102760967, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.791016101837158, "loss/hidden": 0.828125, "loss/logits": 0.16274631023406982, "loss/reg": 0.011956454254686832, "step": 1832 }, { "epoch": 0.229125, "grad_norm": 3.249807596206665, "grad_norm_var": 0.3016714416861295, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.459493637084961, "loss/hidden": 0.890625, "loss/logits": 0.1580270528793335, "loss/reg": 0.011949477717280388, "step": 1833 }, { "epoch": 0.22925, "grad_norm": 2.811523199081421, "grad_norm_var": 0.2828291293008836, "learning_rate": 0.0001, "loss": 1.1852, "loss/crossentropy": 2.3925209045410156, "loss/hidden": 0.90234375, "loss/logits": 0.16343240439891815, "loss/reg": 0.011942173354327679, "step": 1834 }, { "epoch": 0.229375, "grad_norm": 2.8339016437530518, "grad_norm_var": 0.25815168646451997, "learning_rate": 0.0001, "loss": 1.2253, "loss/crossentropy": 2.3892555236816406, "loss/hidden": 0.93359375, "loss/logits": 0.17232704162597656, "loss/reg": 0.011935535818338394, "step": 1835 }, { "epoch": 0.2295, "grad_norm": 3.5241827964782715, "grad_norm_var": 0.2769127015292412, "learning_rate": 0.0001, "loss": 1.0834, "loss/crossentropy": 2.4555206298828125, "loss/hidden": 0.80859375, "loss/logits": 0.15549984574317932, "loss/reg": 0.011928863823413849, "step": 1836 }, { "epoch": 0.229625, "grad_norm": 3.1511709690093994, "grad_norm_var": 0.26548067421464716, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.634005069732666, "loss/hidden": 0.91015625, "loss/logits": 0.14858701825141907, "loss/reg": 0.011921834200620651, "step": 1837 }, { "epoch": 0.22975, "grad_norm": 3.1957132816314697, "grad_norm_var": 0.2625588163447295, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.364680051803589, "loss/hidden": 1.0703125, "loss/logits": 0.22706648707389832, "loss/reg": 0.01191527210175991, "step": 1838 }, { "epoch": 0.229875, "grad_norm": 3.4130618572235107, "grad_norm_var": 0.24654008934847388, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.80603289604187, "loss/hidden": 0.84765625, "loss/logits": 0.1374989151954651, "loss/reg": 0.011908802203834057, "step": 1839 }, { "epoch": 0.23, "grad_norm": 10.125365257263184, "grad_norm_var": 3.3332932374288853, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.7263705730438232, "loss/hidden": 0.96875, "loss/logits": 0.1943938136100769, "loss/reg": 0.011902069672942162, "step": 1840 }, { "epoch": 0.230125, "grad_norm": 3.0267136096954346, "grad_norm_var": 3.3475461515590967, "learning_rate": 0.0001, "loss": 1.4492, "loss/crossentropy": 2.527991771697998, "loss/hidden": 1.125, "loss/logits": 0.20524314045906067, "loss/reg": 0.01189569290727377, "step": 1841 }, { "epoch": 0.23025, "grad_norm": 3.0950570106506348, "grad_norm_var": 3.308964844199806, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.9109840393066406, "loss/hidden": 0.87890625, "loss/logits": 0.16966286301612854, "loss/reg": 0.011889121495187283, "step": 1842 }, { "epoch": 0.230375, "grad_norm": 2.6885743141174316, "grad_norm_var": 3.283521301133828, "learning_rate": 0.0001, "loss": 0.9502, "loss/crossentropy": 2.463144540786743, "loss/hidden": 0.7265625, "loss/logits": 0.10477050393819809, "loss/reg": 0.011882682330906391, "step": 1843 }, { "epoch": 0.2305, "grad_norm": 7.153027534484863, "grad_norm_var": 4.018621504186891, "learning_rate": 0.0001, "loss": 1.5744, "loss/crossentropy": 2.408703088760376, "loss/hidden": 1.1796875, "loss/logits": 0.2759320139884949, "loss/reg": 0.01187664270401001, "step": 1844 }, { "epoch": 0.230625, "grad_norm": 2.739917516708374, "grad_norm_var": 4.056043276959989, "learning_rate": 0.0001, "loss": 1.1633, "loss/crossentropy": 2.417595863342285, "loss/hidden": 0.87109375, "loss/logits": 0.17353396117687225, "loss/reg": 0.011870414949953556, "step": 1845 }, { "epoch": 0.23075, "grad_norm": 3.544043779373169, "grad_norm_var": 4.021382457840771, "learning_rate": 0.0001, "loss": 1.256, "loss/crossentropy": 2.5209176540374756, "loss/hidden": 0.96484375, "loss/logits": 0.1725134551525116, "loss/reg": 0.0118643743917346, "step": 1846 }, { "epoch": 0.230875, "grad_norm": 2.5454280376434326, "grad_norm_var": 4.012548475227705, "learning_rate": 0.0001, "loss": 1.125, "loss/crossentropy": 2.788163185119629, "loss/hidden": 0.84765625, "loss/logits": 0.15880800783634186, "loss/reg": 0.011857859790325165, "step": 1847 }, { "epoch": 0.231, "grad_norm": 2.663525104522705, "grad_norm_var": 4.053043390213641, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.458189010620117, "loss/hidden": 0.83203125, "loss/logits": 0.14984336495399475, "loss/reg": 0.011851528659462929, "step": 1848 }, { "epoch": 0.231125, "grad_norm": 3.5077850818634033, "grad_norm_var": 4.0405115731206624, "learning_rate": 0.0001, "loss": 1.0857, "loss/crossentropy": 2.5606067180633545, "loss/hidden": 0.8125, "loss/logits": 0.15474531054496765, "loss/reg": 0.011845485307276249, "step": 1849 }, { "epoch": 0.23125, "grad_norm": 3.274979829788208, "grad_norm_var": 3.995870290819255, "learning_rate": 0.0001, "loss": 1.0438, "loss/crossentropy": 2.5572433471679688, "loss/hidden": 0.796875, "loss/logits": 0.12853728234767914, "loss/reg": 0.011839361861348152, "step": 1850 }, { "epoch": 0.231375, "grad_norm": 2.723388433456421, "grad_norm_var": 4.010576716926485, "learning_rate": 0.0001, "loss": 1.0091, "loss/crossentropy": 2.6289405822753906, "loss/hidden": 0.7578125, "loss/logits": 0.1329689919948578, "loss/reg": 0.011833377182483673, "step": 1851 }, { "epoch": 0.2315, "grad_norm": 5.253420829772949, "grad_norm_var": 4.140042975539734, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.522901773452759, "loss/hidden": 0.984375, "loss/logits": 0.19709725677967072, "loss/reg": 0.011827518232166767, "step": 1852 }, { "epoch": 0.231625, "grad_norm": 2.8319897651672363, "grad_norm_var": 4.177483717989236, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.2863683700561523, "loss/hidden": 1.078125, "loss/logits": 0.17204830050468445, "loss/reg": 0.011821010150015354, "step": 1853 }, { "epoch": 0.23175, "grad_norm": 3.758786201477051, "grad_norm_var": 4.14732397532542, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.6834845542907715, "loss/hidden": 0.81640625, "loss/logits": 0.1564285159111023, "loss/reg": 0.011815285310149193, "step": 1854 }, { "epoch": 0.231875, "grad_norm": 2.970064163208008, "grad_norm_var": 4.18814826964047, "learning_rate": 0.0001, "loss": 1.1222, "loss/crossentropy": 2.350801944732666, "loss/hidden": 0.85546875, "loss/logits": 0.14867839217185974, "loss/reg": 0.011809652671217918, "step": 1855 }, { "epoch": 0.232, "grad_norm": 2.991593599319458, "grad_norm_var": 1.4178378002755372, "learning_rate": 0.0001, "loss": 1.0011, "loss/crossentropy": 2.438445568084717, "loss/hidden": 0.7734375, "loss/logits": 0.10962004959583282, "loss/reg": 0.011803114786744118, "step": 1856 }, { "epoch": 0.232125, "grad_norm": 2.449436664581299, "grad_norm_var": 1.4691695267047424, "learning_rate": 0.0001, "loss": 0.9734, "loss/crossentropy": 2.3218026161193848, "loss/hidden": 0.7265625, "loss/logits": 0.12885171175003052, "loss/reg": 0.011797088198363781, "step": 1857 }, { "epoch": 0.23225, "grad_norm": 3.538789987564087, "grad_norm_var": 1.4642067121892361, "learning_rate": 0.0001, "loss": 1.2534, "loss/crossentropy": 2.3883466720581055, "loss/hidden": 0.95703125, "loss/logits": 0.17844754457473755, "loss/reg": 0.01179055217653513, "step": 1858 }, { "epoch": 0.232375, "grad_norm": 2.6442153453826904, "grad_norm_var": 1.4686242200827846, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.5905346870422363, "loss/hidden": 0.890625, "loss/logits": 0.1626666784286499, "loss/reg": 0.011784548871219158, "step": 1859 }, { "epoch": 0.2325, "grad_norm": 2.63577938079834, "grad_norm_var": 0.4906894012779368, "learning_rate": 0.0001, "loss": 0.9256, "loss/crossentropy": 2.1434152126312256, "loss/hidden": 0.703125, "loss/logits": 0.10471002757549286, "loss/reg": 0.011777986772358418, "step": 1860 }, { "epoch": 0.232625, "grad_norm": 2.578632354736328, "grad_norm_var": 0.5006945948105681, "learning_rate": 0.0001, "loss": 1.0467, "loss/crossentropy": 2.7150838375091553, "loss/hidden": 0.796875, "loss/logits": 0.13211926817893982, "loss/reg": 0.011771964840590954, "step": 1861 }, { "epoch": 0.23275, "grad_norm": 3.5398929119110107, "grad_norm_var": 0.5004607034463172, "learning_rate": 0.0001, "loss": 1.2583, "loss/crossentropy": 2.7141544818878174, "loss/hidden": 0.94140625, "loss/logits": 0.19919423758983612, "loss/reg": 0.01176582369953394, "step": 1862 }, { "epoch": 0.232875, "grad_norm": 3.1188647747039795, "grad_norm_var": 0.4771405434982313, "learning_rate": 0.0001, "loss": 1.0285, "loss/crossentropy": 2.2451930046081543, "loss/hidden": 0.7890625, "loss/logits": 0.12186941504478455, "loss/reg": 0.011759834364056587, "step": 1863 }, { "epoch": 0.233, "grad_norm": 3.0794904232025146, "grad_norm_var": 0.4606925715881855, "learning_rate": 0.0001, "loss": 1.0381, "loss/crossentropy": 2.338972330093384, "loss/hidden": 0.7890625, "loss/logits": 0.1315452754497528, "loss/reg": 0.011753806844353676, "step": 1864 }, { "epoch": 0.233125, "grad_norm": 3.019987106323242, "grad_norm_var": 0.45431474823360435, "learning_rate": 0.0001, "loss": 1.0534, "loss/crossentropy": 2.5802109241485596, "loss/hidden": 0.80078125, "loss/logits": 0.1351454257965088, "loss/reg": 0.011747847311198711, "step": 1865 }, { "epoch": 0.23325, "grad_norm": 3.0198545455932617, "grad_norm_var": 0.45415120043857066, "learning_rate": 0.0001, "loss": 1.3085, "loss/crossentropy": 2.2313005924224854, "loss/hidden": 0.96875, "loss/logits": 0.22236411273479462, "loss/reg": 0.01174143236130476, "step": 1866 }, { "epoch": 0.233375, "grad_norm": 2.4949746131896973, "grad_norm_var": 0.46993664201424584, "learning_rate": 0.0001, "loss": 1.0675, "loss/crossentropy": 2.530468225479126, "loss/hidden": 0.8046875, "loss/logits": 0.14545810222625732, "loss/reg": 0.01173495315015316, "step": 1867 }, { "epoch": 0.2335, "grad_norm": 3.2556769847869873, "grad_norm_var": 0.15119857978987525, "learning_rate": 0.0001, "loss": 1.2375, "loss/crossentropy": 2.5190608501434326, "loss/hidden": 0.9296875, "loss/logits": 0.1905001997947693, "loss/reg": 0.011728293262422085, "step": 1868 }, { "epoch": 0.233625, "grad_norm": 5.818284034729004, "grad_norm_var": 0.6434646637531138, "learning_rate": 0.0001, "loss": 1.7541, "loss/crossentropy": 2.4520504474639893, "loss/hidden": 1.328125, "loss/logits": 0.30873966217041016, "loss/reg": 0.011721369810402393, "step": 1869 }, { "epoch": 0.23375, "grad_norm": 18.115583419799805, "grad_norm_var": 14.629645381532592, "learning_rate": 0.0001, "loss": 1.626, "loss/crossentropy": 2.765774965286255, "loss/hidden": 1.265625, "loss/logits": 0.24319738149642944, "loss/reg": 0.011714870110154152, "step": 1870 }, { "epoch": 0.233875, "grad_norm": 4.950761795043945, "grad_norm_var": 14.58186333788439, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.3341286182403564, "loss/hidden": 1.0859375, "loss/logits": 0.17949844896793365, "loss/reg": 0.011708397418260574, "step": 1871 }, { "epoch": 0.234, "grad_norm": 3.149343729019165, "grad_norm_var": 14.557933702264693, "learning_rate": 0.0001, "loss": 1.2519, "loss/crossentropy": 2.2633235454559326, "loss/hidden": 0.98046875, "loss/logits": 0.1543847918510437, "loss/reg": 0.011701937764883041, "step": 1872 }, { "epoch": 0.234125, "grad_norm": 3.2733466625213623, "grad_norm_var": 14.4066140044379, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.3169283866882324, "loss/hidden": 0.75, "loss/logits": 0.12840038537979126, "loss/reg": 0.011695554479956627, "step": 1873 }, { "epoch": 0.23425, "grad_norm": 2.9650564193725586, "grad_norm_var": 14.482709435196355, "learning_rate": 0.0001, "loss": 1.0411, "loss/crossentropy": 2.499809741973877, "loss/hidden": 0.78125, "loss/logits": 0.14294317364692688, "loss/reg": 0.011689374223351479, "step": 1874 }, { "epoch": 0.234375, "grad_norm": 3.3756163120269775, "grad_norm_var": 14.361621179597174, "learning_rate": 0.0001, "loss": 1.2119, "loss/crossentropy": 2.484644889831543, "loss/hidden": 0.92578125, "loss/logits": 0.16932502388954163, "loss/reg": 0.011682825163006783, "step": 1875 }, { "epoch": 0.2345, "grad_norm": 4.519603729248047, "grad_norm_var": 14.171825990122963, "learning_rate": 0.0001, "loss": 1.2405, "loss/crossentropy": 2.1961562633514404, "loss/hidden": 0.95703125, "loss/logits": 0.1666729897260666, "loss/reg": 0.01167618203908205, "step": 1876 }, { "epoch": 0.234625, "grad_norm": 7.144575595855713, "grad_norm_var": 14.370738345950354, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 3.1038763523101807, "loss/hidden": 1.1171875, "loss/logits": 0.28672921657562256, "loss/reg": 0.01166961807757616, "step": 1877 }, { "epoch": 0.23475, "grad_norm": 3.3410837650299072, "grad_norm_var": 14.403365735245997, "learning_rate": 0.0001, "loss": 1.0188, "loss/crossentropy": 2.6396772861480713, "loss/hidden": 0.76953125, "loss/logits": 0.1326141357421875, "loss/reg": 0.011662835255265236, "step": 1878 }, { "epoch": 0.234875, "grad_norm": 3.955864429473877, "grad_norm_var": 14.274587966883617, "learning_rate": 0.0001, "loss": 1.2218, "loss/crossentropy": 2.5625252723693848, "loss/hidden": 0.9453125, "loss/logits": 0.1599578559398651, "loss/reg": 0.011656397022306919, "step": 1879 }, { "epoch": 0.235, "grad_norm": 3.702489137649536, "grad_norm_var": 14.162786868506032, "learning_rate": 0.0001, "loss": 1.1474, "loss/crossentropy": 2.570443868637085, "loss/hidden": 0.8828125, "loss/logits": 0.14810335636138916, "loss/reg": 0.011648759245872498, "step": 1880 }, { "epoch": 0.235125, "grad_norm": 3.006218194961548, "grad_norm_var": 14.165986485307208, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.175870895385742, "loss/hidden": 0.91015625, "loss/logits": 0.1679525077342987, "loss/reg": 0.01164237316697836, "step": 1881 }, { "epoch": 0.23525, "grad_norm": 3.0096826553344727, "grad_norm_var": 14.168346952953607, "learning_rate": 0.0001, "loss": 1.0846, "loss/crossentropy": 2.7216742038726807, "loss/hidden": 0.8203125, "loss/logits": 0.14791440963745117, "loss/reg": 0.01163501013070345, "step": 1882 }, { "epoch": 0.235375, "grad_norm": 3.3620381355285645, "grad_norm_var": 13.954069607905243, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.6900413036346436, "loss/hidden": 1.0234375, "loss/logits": 0.19929756224155426, "loss/reg": 0.01162752602249384, "step": 1883 }, { "epoch": 0.2355, "grad_norm": 2.5984811782836914, "grad_norm_var": 14.117182111852275, "learning_rate": 0.0001, "loss": 1.0653, "loss/crossentropy": 2.392869472503662, "loss/hidden": 0.8046875, "loss/logits": 0.14441151916980743, "loss/reg": 0.011621098034083843, "step": 1884 }, { "epoch": 0.235625, "grad_norm": 3.761037588119507, "grad_norm_var": 14.093606633107498, "learning_rate": 0.0001, "loss": 1.0553, "loss/crossentropy": 2.8546535968780518, "loss/hidden": 0.77734375, "loss/logits": 0.16185110807418823, "loss/reg": 0.011613850481808186, "step": 1885 }, { "epoch": 0.23575, "grad_norm": 5.239340782165527, "grad_norm_var": 1.319651559267754, "learning_rate": 0.0001, "loss": 1.0582, "loss/crossentropy": 2.560152769088745, "loss/hidden": 0.80078125, "loss/logits": 0.14137983322143555, "loss/reg": 0.011606672778725624, "step": 1886 }, { "epoch": 0.235875, "grad_norm": 2.783051013946533, "grad_norm_var": 1.2907520410126019, "learning_rate": 0.0001, "loss": 1.0424, "loss/crossentropy": 2.37328839302063, "loss/hidden": 0.79296875, "loss/logits": 0.13344892859458923, "loss/reg": 0.011599456891417503, "step": 1887 }, { "epoch": 0.236, "grad_norm": 3.554786443710327, "grad_norm_var": 1.2713025846844552, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.494732141494751, "loss/hidden": 0.85546875, "loss/logits": 0.1918359100818634, "loss/reg": 0.011593029834330082, "step": 1888 }, { "epoch": 0.236125, "grad_norm": 3.321483612060547, "grad_norm_var": 1.2685516790340434, "learning_rate": 0.0001, "loss": 1.2471, "loss/crossentropy": 2.4095706939697266, "loss/hidden": 0.9765625, "loss/logits": 0.15465524792671204, "loss/reg": 0.011586642824113369, "step": 1889 }, { "epoch": 0.23625, "grad_norm": 2.9616763591766357, "grad_norm_var": 1.2688960186311133, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.239219903945923, "loss/hidden": 1.140625, "loss/logits": 0.1887097954750061, "loss/reg": 0.011579960584640503, "step": 1890 }, { "epoch": 0.236375, "grad_norm": 3.4970099925994873, "grad_norm_var": 1.2641245233408538, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.5096256732940674, "loss/hidden": 0.87109375, "loss/logits": 0.15174484252929688, "loss/reg": 0.011572892777621746, "step": 1891 }, { "epoch": 0.2365, "grad_norm": 2.6197335720062256, "grad_norm_var": 1.2909410184264962, "learning_rate": 0.0001, "loss": 1.0721, "loss/crossentropy": 2.704296112060547, "loss/hidden": 0.8125, "loss/logits": 0.14390236139297485, "loss/reg": 0.01156578678637743, "step": 1892 }, { "epoch": 0.236625, "grad_norm": 5.325662136077881, "grad_norm_var": 0.6420011074287828, "learning_rate": 0.0001, "loss": 1.5009, "loss/crossentropy": 2.394350528717041, "loss/hidden": 1.2109375, "loss/logits": 0.1743205487728119, "loss/reg": 0.011559294536709785, "step": 1893 }, { "epoch": 0.23675, "grad_norm": 3.4110569953918457, "grad_norm_var": 0.6408013583584702, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.3642685413360596, "loss/hidden": 1.0, "loss/logits": 0.18962880969047546, "loss/reg": 0.011552795767784119, "step": 1894 }, { "epoch": 0.236875, "grad_norm": 4.094106674194336, "grad_norm_var": 0.6502721450147265, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.5077645778656006, "loss/hidden": 1.0859375, "loss/logits": 0.18015095591545105, "loss/reg": 0.01154585275799036, "step": 1895 }, { "epoch": 0.237, "grad_norm": 3.3601646423339844, "grad_norm_var": 0.6490610636632368, "learning_rate": 0.0001, "loss": 1.1372, "loss/crossentropy": 2.4785971641540527, "loss/hidden": 0.8671875, "loss/logits": 0.15463027358055115, "loss/reg": 0.011538945138454437, "step": 1896 }, { "epoch": 0.237125, "grad_norm": 2.699127197265625, "grad_norm_var": 0.6749314875548348, "learning_rate": 0.0001, "loss": 1.0438, "loss/crossentropy": 2.562880754470825, "loss/hidden": 0.80078125, "loss/logits": 0.12774690985679626, "loss/reg": 0.011531817726790905, "step": 1897 }, { "epoch": 0.23725, "grad_norm": 3.408806324005127, "grad_norm_var": 0.6601303555653694, "learning_rate": 0.0001, "loss": 1.2514, "loss/crossentropy": 2.7521603107452393, "loss/hidden": 0.95703125, "loss/logits": 0.17915445566177368, "loss/reg": 0.01152490172535181, "step": 1898 }, { "epoch": 0.237375, "grad_norm": 2.782313346862793, "grad_norm_var": 0.6917876208904978, "learning_rate": 0.0001, "loss": 1.2561, "loss/crossentropy": 2.768892765045166, "loss/hidden": 0.96484375, "loss/logits": 0.17611053586006165, "loss/reg": 0.011517412029206753, "step": 1899 }, { "epoch": 0.2375, "grad_norm": 2.913452625274658, "grad_norm_var": 0.6616557378995289, "learning_rate": 0.0001, "loss": 1.1729, "loss/crossentropy": 2.365873336791992, "loss/hidden": 0.90234375, "loss/logits": 0.15543469786643982, "loss/reg": 0.011509752832353115, "step": 1900 }, { "epoch": 0.237625, "grad_norm": 2.3335838317871094, "grad_norm_var": 0.736146354285043, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.6142027378082275, "loss/hidden": 0.75390625, "loss/logits": 0.12545104324817657, "loss/reg": 0.011503461748361588, "step": 1901 }, { "epoch": 0.23775, "grad_norm": 3.5080623626708984, "grad_norm_var": 0.4975255652152091, "learning_rate": 0.0001, "loss": 1.204, "loss/crossentropy": 2.880540609359741, "loss/hidden": 0.9140625, "loss/logits": 0.17492809891700745, "loss/reg": 0.01149703934788704, "step": 1902 }, { "epoch": 0.237875, "grad_norm": 2.856538772583008, "grad_norm_var": 0.49293619178501635, "learning_rate": 0.0001, "loss": 1.1438, "loss/crossentropy": 2.411937713623047, "loss/hidden": 0.875, "loss/logits": 0.153866246342659, "loss/reg": 0.01148985605686903, "step": 1903 }, { "epoch": 0.238, "grad_norm": 2.922060012817383, "grad_norm_var": 0.49565918281477345, "learning_rate": 0.0001, "loss": 1.0083, "loss/crossentropy": 2.714773416519165, "loss/hidden": 0.76171875, "loss/logits": 0.1317380964756012, "loss/reg": 0.011483217589557171, "step": 1904 }, { "epoch": 0.238125, "grad_norm": 3.1696555614471436, "grad_norm_var": 0.49567159607848527, "learning_rate": 0.0001, "loss": 1.2106, "loss/crossentropy": 2.5127952098846436, "loss/hidden": 0.95703125, "loss/logits": 0.1388104110956192, "loss/reg": 0.011476212181150913, "step": 1905 }, { "epoch": 0.23825, "grad_norm": 2.5425992012023926, "grad_norm_var": 0.5222804369498891, "learning_rate": 0.0001, "loss": 1.0714, "loss/crossentropy": 2.6032824516296387, "loss/hidden": 0.80859375, "loss/logits": 0.14816129207611084, "loss/reg": 0.011469176970422268, "step": 1906 }, { "epoch": 0.238375, "grad_norm": 2.748033285140991, "grad_norm_var": 0.5292028458853089, "learning_rate": 0.0001, "loss": 1.1389, "loss/crossentropy": 2.697582721710205, "loss/hidden": 0.859375, "loss/logits": 0.16493722796440125, "loss/reg": 0.011462134309113026, "step": 1907 }, { "epoch": 0.2385, "grad_norm": 4.973997592926025, "grad_norm_var": 0.7033744522376982, "learning_rate": 0.0001, "loss": 1.1519, "loss/crossentropy": 2.451634168624878, "loss/hidden": 0.8828125, "loss/logits": 0.15450610220432281, "loss/reg": 0.011455683968961239, "step": 1908 }, { "epoch": 0.238625, "grad_norm": 2.706803321838379, "grad_norm_var": 0.43014165554197537, "learning_rate": 0.0001, "loss": 1.0876, "loss/crossentropy": 2.553027629852295, "loss/hidden": 0.8203125, "loss/logits": 0.15277239680290222, "loss/reg": 0.011448659934103489, "step": 1909 }, { "epoch": 0.23875, "grad_norm": 2.650634288787842, "grad_norm_var": 0.44000573292169826, "learning_rate": 0.0001, "loss": 1.2477, "loss/crossentropy": 2.413283586502075, "loss/hidden": 0.94921875, "loss/logits": 0.1840301901102066, "loss/reg": 0.011441958136856556, "step": 1910 }, { "epoch": 0.238875, "grad_norm": 3.073920965194702, "grad_norm_var": 0.37042588009566063, "learning_rate": 0.0001, "loss": 1.24, "loss/crossentropy": 2.647343158721924, "loss/hidden": 0.92578125, "loss/logits": 0.19982083141803741, "loss/reg": 0.011434913612902164, "step": 1911 }, { "epoch": 0.239, "grad_norm": 4.049685478210449, "grad_norm_var": 0.42951946606552244, "learning_rate": 0.0001, "loss": 1.3637, "loss/crossentropy": 2.2610855102539062, "loss/hidden": 1.046875, "loss/logits": 0.20253784954547882, "loss/reg": 0.01142791099846363, "step": 1912 }, { "epoch": 0.239125, "grad_norm": 2.926657199859619, "grad_norm_var": 0.42108803087631347, "learning_rate": 0.0001, "loss": 1.2487, "loss/crossentropy": 2.795015335083008, "loss/hidden": 0.9609375, "loss/logits": 0.17352721095085144, "loss/reg": 0.011420897208154202, "step": 1913 }, { "epoch": 0.23925, "grad_norm": 2.98282790184021, "grad_norm_var": 0.4147719819065732, "learning_rate": 0.0001, "loss": 1.1135, "loss/crossentropy": 2.5007472038269043, "loss/hidden": 0.8359375, "loss/logits": 0.1634053736925125, "loss/reg": 0.011413791216909885, "step": 1914 }, { "epoch": 0.239375, "grad_norm": 2.743699073791504, "grad_norm_var": 0.4163530495107797, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.5021727085113525, "loss/hidden": 0.875, "loss/logits": 0.14406922459602356, "loss/reg": 0.011406795121729374, "step": 1915 }, { "epoch": 0.2395, "grad_norm": 2.8699593544006348, "grad_norm_var": 0.41737266552279284, "learning_rate": 0.0001, "loss": 1.1846, "loss/crossentropy": 2.409477949142456, "loss/hidden": 0.91015625, "loss/logits": 0.1604822278022766, "loss/reg": 0.011399611830711365, "step": 1916 }, { "epoch": 0.239625, "grad_norm": 2.5404951572418213, "grad_norm_var": 0.3998377204796666, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.4783318042755127, "loss/hidden": 0.8203125, "loss/logits": 0.1551009714603424, "loss/reg": 0.011393279768526554, "step": 1917 }, { "epoch": 0.23975, "grad_norm": 2.9057133197784424, "grad_norm_var": 0.38806304932068847, "learning_rate": 0.0001, "loss": 1.055, "loss/crossentropy": 2.5464046001434326, "loss/hidden": 0.796875, "loss/logits": 0.1442166268825531, "loss/reg": 0.011386997997760773, "step": 1918 }, { "epoch": 0.239875, "grad_norm": 3.536914587020874, "grad_norm_var": 0.4002199957694375, "learning_rate": 0.0001, "loss": 1.388, "loss/crossentropy": 2.1978702545166016, "loss/hidden": 1.0625, "loss/logits": 0.21171514689922333, "loss/reg": 0.011380769312381744, "step": 1919 }, { "epoch": 0.24, "grad_norm": 2.766721725463867, "grad_norm_var": 0.4050817388615691, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.6648097038269043, "loss/hidden": 0.859375, "loss/logits": 0.1593654751777649, "loss/reg": 0.011374091729521751, "step": 1920 }, { "epoch": 0.240125, "grad_norm": 3.2392373085021973, "grad_norm_var": 0.4062692871011743, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 2.6759071350097656, "loss/hidden": 0.89453125, "loss/logits": 0.19964686036109924, "loss/reg": 0.01136783231049776, "step": 1921 }, { "epoch": 0.24025, "grad_norm": 3.552058696746826, "grad_norm_var": 0.39781198223739217, "learning_rate": 0.0001, "loss": 1.2685, "loss/crossentropy": 2.4821066856384277, "loss/hidden": 0.984375, "loss/logits": 0.170503631234169, "loss/reg": 0.011361614800989628, "step": 1922 }, { "epoch": 0.240375, "grad_norm": 3.387380361557007, "grad_norm_var": 0.38980030472856225, "learning_rate": 0.0001, "loss": 1.1798, "loss/crossentropy": 2.5588643550872803, "loss/hidden": 0.91015625, "loss/logits": 0.1560502052307129, "loss/reg": 0.011355074122548103, "step": 1923 }, { "epoch": 0.2405, "grad_norm": 2.437252998352051, "grad_norm_var": 0.18576844254103358, "learning_rate": 0.0001, "loss": 1.2037, "loss/crossentropy": 2.2251975536346436, "loss/hidden": 0.9453125, "loss/logits": 0.1448952853679657, "loss/reg": 0.011348758824169636, "step": 1924 }, { "epoch": 0.240625, "grad_norm": 2.870936393737793, "grad_norm_var": 0.18052971078875166, "learning_rate": 0.0001, "loss": 1.0809, "loss/crossentropy": 2.538954973220825, "loss/hidden": 0.8203125, "loss/logits": 0.1471145749092102, "loss/reg": 0.011342531070113182, "step": 1925 }, { "epoch": 0.24075, "grad_norm": 2.4371042251586914, "grad_norm_var": 0.19427645895656553, "learning_rate": 0.0001, "loss": 1.0965, "loss/crossentropy": 2.3479714393615723, "loss/hidden": 0.828125, "loss/logits": 0.15495869517326355, "loss/reg": 0.01133667305111885, "step": 1926 }, { "epoch": 0.240875, "grad_norm": 2.7412068843841553, "grad_norm_var": 0.19880465575710388, "learning_rate": 0.0001, "loss": 1.0415, "loss/crossentropy": 2.402764081954956, "loss/hidden": 0.79296875, "loss/logits": 0.13523459434509277, "loss/reg": 0.01133043598383665, "step": 1927 }, { "epoch": 0.241, "grad_norm": 2.7866973876953125, "grad_norm_var": 0.12160759981975294, "learning_rate": 0.0001, "loss": 1.3153, "loss/crossentropy": 2.327364683151245, "loss/hidden": 0.9921875, "loss/logits": 0.20987921953201294, "loss/reg": 0.011324429884552956, "step": 1928 }, { "epoch": 0.241125, "grad_norm": 2.642634153366089, "grad_norm_var": 0.1264088206080712, "learning_rate": 0.0001, "loss": 1.1514, "loss/crossentropy": 2.319239616394043, "loss/hidden": 0.8828125, "loss/logits": 0.15535815060138702, "loss/reg": 0.011318519711494446, "step": 1929 }, { "epoch": 0.24125, "grad_norm": 2.6170005798339844, "grad_norm_var": 0.13085757964727993, "learning_rate": 0.0001, "loss": 1.1138, "loss/crossentropy": 2.5678954124450684, "loss/hidden": 0.84375, "loss/logits": 0.15691521763801575, "loss/reg": 0.011312729679048061, "step": 1930 }, { "epoch": 0.241375, "grad_norm": 3.271930456161499, "grad_norm_var": 0.13871901991367938, "learning_rate": 0.0001, "loss": 1.0944, "loss/crossentropy": 2.6121537685394287, "loss/hidden": 0.82421875, "loss/logits": 0.15707923471927643, "loss/reg": 0.01130701508373022, "step": 1931 }, { "epoch": 0.2415, "grad_norm": 4.04671573638916, "grad_norm_var": 0.21855977270830043, "learning_rate": 0.0001, "loss": 1.0816, "loss/crossentropy": 2.6852362155914307, "loss/hidden": 0.82421875, "loss/logits": 0.1443198323249817, "loss/reg": 0.011301231570541859, "step": 1932 }, { "epoch": 0.241625, "grad_norm": 2.44777250289917, "grad_norm_var": 0.22460799214468258, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.544693946838379, "loss/hidden": 0.87890625, "loss/logits": 0.1627037525177002, "loss/reg": 0.011295530013740063, "step": 1933 }, { "epoch": 0.24175, "grad_norm": 3.0974671840667725, "grad_norm_var": 0.22499515882713012, "learning_rate": 0.0001, "loss": 1.2202, "loss/crossentropy": 2.6100382804870605, "loss/hidden": 0.9296875, "loss/logits": 0.1776452660560608, "loss/reg": 0.011289400048553944, "step": 1934 }, { "epoch": 0.241875, "grad_norm": 2.6123459339141846, "grad_norm_var": 0.21130123911614854, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.5938217639923096, "loss/hidden": 0.83203125, "loss/logits": 0.14210623502731323, "loss/reg": 0.011283115483820438, "step": 1935 }, { "epoch": 0.242, "grad_norm": 2.2436466217041016, "grad_norm_var": 0.24011386438526852, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.5183980464935303, "loss/hidden": 0.7734375, "loss/logits": 0.12614327669143677, "loss/reg": 0.011277486570179462, "step": 1936 }, { "epoch": 0.242125, "grad_norm": 5.555136680603027, "grad_norm_var": 0.6794719115699573, "learning_rate": 0.0001, "loss": 1.1126, "loss/crossentropy": 2.543240547180176, "loss/hidden": 0.8671875, "loss/logits": 0.13274727761745453, "loss/reg": 0.011271213181316853, "step": 1937 }, { "epoch": 0.24225, "grad_norm": 2.610562562942505, "grad_norm_var": 0.6714344269587417, "learning_rate": 0.0001, "loss": 0.8577, "loss/crossentropy": 2.4972715377807617, "loss/hidden": 0.6484375, "loss/logits": 0.09660603851079941, "loss/reg": 0.01126556284725666, "step": 1938 }, { "epoch": 0.242375, "grad_norm": 3.4906604290008545, "grad_norm_var": 0.6776027391572443, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.576024293899536, "loss/hidden": 0.91796875, "loss/logits": 0.18251577019691467, "loss/reg": 0.01125932577997446, "step": 1939 }, { "epoch": 0.2425, "grad_norm": 2.422001361846924, "grad_norm_var": 0.6787500956269599, "learning_rate": 0.0001, "loss": 0.9879, "loss/crossentropy": 2.3907670974731445, "loss/hidden": 0.74609375, "loss/logits": 0.1292981505393982, "loss/reg": 0.011253675445914268, "step": 1940 }, { "epoch": 0.242625, "grad_norm": 3.6441028118133545, "grad_norm_var": 0.7034908497749977, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.268623113632202, "loss/hidden": 0.921875, "loss/logits": 0.18494915962219238, "loss/reg": 0.011247408576309681, "step": 1941 }, { "epoch": 0.24275, "grad_norm": 3.2040059566497803, "grad_norm_var": 0.6784287892697456, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.369084596633911, "loss/hidden": 1.0234375, "loss/logits": 0.21159708499908447, "loss/reg": 0.011241290718317032, "step": 1942 }, { "epoch": 0.242875, "grad_norm": 3.0895087718963623, "grad_norm_var": 0.6698306293757182, "learning_rate": 0.0001, "loss": 1.4156, "loss/crossentropy": 2.0874648094177246, "loss/hidden": 1.09375, "loss/logits": 0.20947730541229248, "loss/reg": 0.011235379613935947, "step": 1943 }, { "epoch": 0.243, "grad_norm": 2.4965028762817383, "grad_norm_var": 0.687657011627713, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.210761547088623, "loss/hidden": 0.8515625, "loss/logits": 0.14598006010055542, "loss/reg": 0.011229581199586391, "step": 1944 }, { "epoch": 0.243125, "grad_norm": 2.9449803829193115, "grad_norm_var": 0.6752047525480802, "learning_rate": 0.0001, "loss": 1.1668, "loss/crossentropy": 2.4518120288848877, "loss/hidden": 0.8671875, "loss/logits": 0.18732941150665283, "loss/reg": 0.011223935522139072, "step": 1945 }, { "epoch": 0.24325, "grad_norm": 3.066868305206299, "grad_norm_var": 0.6581535524958798, "learning_rate": 0.0001, "loss": 1.29, "loss/crossentropy": 2.822751522064209, "loss/hidden": 0.96484375, "loss/logits": 0.21297214925289154, "loss/reg": 0.01121827308088541, "step": 1946 }, { "epoch": 0.243375, "grad_norm": 2.5983641147613525, "grad_norm_var": 0.6746843795057048, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.397672414779663, "loss/hidden": 0.890625, "loss/logits": 0.156642884016037, "loss/reg": 0.01121209841221571, "step": 1947 }, { "epoch": 0.2435, "grad_norm": 2.9095447063446045, "grad_norm_var": 0.6116848502456292, "learning_rate": 0.0001, "loss": 1.0915, "loss/crossentropy": 2.527217149734497, "loss/hidden": 0.81640625, "loss/logits": 0.16299590468406677, "loss/reg": 0.01120635587722063, "step": 1948 }, { "epoch": 0.243625, "grad_norm": 5.4240403175354, "grad_norm_var": 0.9354258383958637, "learning_rate": 0.0001, "loss": 1.4684, "loss/crossentropy": 2.551121473312378, "loss/hidden": 1.1015625, "loss/logits": 0.25484058260917664, "loss/reg": 0.011200698092579842, "step": 1949 }, { "epoch": 0.24375, "grad_norm": 3.1100833415985107, "grad_norm_var": 0.9352412595018517, "learning_rate": 0.0001, "loss": 1.2134, "loss/crossentropy": 2.401557683944702, "loss/hidden": 0.9140625, "loss/logits": 0.18738284707069397, "loss/reg": 0.011194508522748947, "step": 1950 }, { "epoch": 0.243875, "grad_norm": 3.456061363220215, "grad_norm_var": 0.9120604979018725, "learning_rate": 0.0001, "loss": 1.1817, "loss/crossentropy": 2.688356637954712, "loss/hidden": 0.875, "loss/logits": 0.19481763243675232, "loss/reg": 0.011188787408173084, "step": 1951 }, { "epoch": 0.244, "grad_norm": 3.2770729064941406, "grad_norm_var": 0.8378516417593855, "learning_rate": 0.0001, "loss": 1.4059, "loss/crossentropy": 2.156545877456665, "loss/hidden": 1.09375, "loss/logits": 0.2003558874130249, "loss/reg": 0.01118260808289051, "step": 1952 }, { "epoch": 0.244125, "grad_norm": 3.4741783142089844, "grad_norm_var": 0.4914500706308291, "learning_rate": 0.0001, "loss": 1.259, "loss/crossentropy": 2.633277177810669, "loss/hidden": 0.95703125, "loss/logits": 0.19024121761322021, "loss/reg": 0.011176753789186478, "step": 1953 }, { "epoch": 0.24425, "grad_norm": 3.2239580154418945, "grad_norm_var": 0.46666341661654465, "learning_rate": 0.0001, "loss": 1.1213, "loss/crossentropy": 2.5291545391082764, "loss/hidden": 0.8671875, "loss/logits": 0.14238449931144714, "loss/reg": 0.011171228252351284, "step": 1954 }, { "epoch": 0.244375, "grad_norm": 2.7568724155426025, "grad_norm_var": 0.4757426809575984, "learning_rate": 0.0001, "loss": 1.0901, "loss/crossentropy": 2.6687417030334473, "loss/hidden": 0.82421875, "loss/logits": 0.15422269701957703, "loss/reg": 0.01116592064499855, "step": 1955 }, { "epoch": 0.2445, "grad_norm": 3.1235272884368896, "grad_norm_var": 0.4343252933982605, "learning_rate": 0.0001, "loss": 1.2777, "loss/crossentropy": 2.0929863452911377, "loss/hidden": 1.0078125, "loss/logits": 0.15828515589237213, "loss/reg": 0.011160776950418949, "step": 1956 }, { "epoch": 0.244625, "grad_norm": 2.8191616535186768, "grad_norm_var": 0.43213291318467645, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.562255620956421, "loss/hidden": 0.87109375, "loss/logits": 0.1858426034450531, "loss/reg": 0.011155403219163418, "step": 1957 }, { "epoch": 0.24475, "grad_norm": 7.025396823883057, "grad_norm_var": 1.35403696610659, "learning_rate": 0.0001, "loss": 1.313, "loss/crossentropy": 2.0798208713531494, "loss/hidden": 1.0546875, "loss/logits": 0.1467897593975067, "loss/reg": 0.01114996150135994, "step": 1958 }, { "epoch": 0.244875, "grad_norm": 3.6586244106292725, "grad_norm_var": 1.348840874134669, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.80499267578125, "loss/hidden": 0.79296875, "loss/logits": 0.14840048551559448, "loss/reg": 0.011143793351948261, "step": 1959 }, { "epoch": 0.245, "grad_norm": 3.549360752105713, "grad_norm_var": 1.2828200422537663, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.612431764602661, "loss/hidden": 0.89453125, "loss/logits": 0.16759154200553894, "loss/reg": 0.01113795768469572, "step": 1960 }, { "epoch": 0.245125, "grad_norm": 3.2019731998443604, "grad_norm_var": 1.2670343380172766, "learning_rate": 0.0001, "loss": 1.2087, "loss/crossentropy": 2.242535352706909, "loss/hidden": 0.93359375, "loss/logits": 0.1637311577796936, "loss/reg": 0.011132647283375263, "step": 1961 }, { "epoch": 0.24525, "grad_norm": 2.65508770942688, "grad_norm_var": 1.3037293062655784, "learning_rate": 0.0001, "loss": 1.066, "loss/crossentropy": 2.53322172164917, "loss/hidden": 0.80859375, "loss/logits": 0.1461862325668335, "loss/reg": 0.011126426979899406, "step": 1962 }, { "epoch": 0.245375, "grad_norm": 2.4140734672546387, "grad_norm_var": 1.3284114469095243, "learning_rate": 0.0001, "loss": 1.1738, "loss/crossentropy": 2.441091299057007, "loss/hidden": 0.91015625, "loss/logits": 0.1524820178747177, "loss/reg": 0.011120946146547794, "step": 1963 }, { "epoch": 0.2455, "grad_norm": 4.649234771728516, "grad_norm_var": 1.3794622764879967, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.4274308681488037, "loss/hidden": 1.015625, "loss/logits": 0.15970450639724731, "loss/reg": 0.011115475557744503, "step": 1964 }, { "epoch": 0.245625, "grad_norm": 2.690500259399414, "grad_norm_var": 1.1866477483632742, "learning_rate": 0.0001, "loss": 1.0634, "loss/crossentropy": 2.594061851501465, "loss/hidden": 0.8125, "loss/logits": 0.13979078829288483, "loss/reg": 0.011109288781881332, "step": 1965 }, { "epoch": 0.24575, "grad_norm": 3.1549224853515625, "grad_norm_var": 1.1847841066358076, "learning_rate": 0.0001, "loss": 1.037, "loss/crossentropy": 2.6761364936828613, "loss/hidden": 0.796875, "loss/logits": 0.12911373376846313, "loss/reg": 0.011103102937340736, "step": 1966 }, { "epoch": 0.245875, "grad_norm": 2.8074398040771484, "grad_norm_var": 1.210175941928484, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.2973666191101074, "loss/hidden": 0.81640625, "loss/logits": 0.15705451369285583, "loss/reg": 0.011097206734120846, "step": 1967 }, { "epoch": 0.246, "grad_norm": 2.813947916030884, "grad_norm_var": 1.2314860795345413, "learning_rate": 0.0001, "loss": 1.0939, "loss/crossentropy": 2.4095160961151123, "loss/hidden": 0.8359375, "loss/logits": 0.14708581566810608, "loss/reg": 0.01109134592115879, "step": 1968 }, { "epoch": 0.246125, "grad_norm": 3.996295928955078, "grad_norm_var": 1.2553489249469367, "learning_rate": 0.0001, "loss": 1.203, "loss/crossentropy": 2.5568830966949463, "loss/hidden": 0.921875, "loss/logits": 0.17023617029190063, "loss/reg": 0.011085476726293564, "step": 1969 }, { "epoch": 0.24625, "grad_norm": 3.1737494468688965, "grad_norm_var": 1.2567437243872042, "learning_rate": 0.0001, "loss": 1.19, "loss/crossentropy": 2.820736885070801, "loss/hidden": 0.8984375, "loss/logits": 0.18073034286499023, "loss/reg": 0.011080092750489712, "step": 1970 }, { "epoch": 0.246375, "grad_norm": 3.1411452293395996, "grad_norm_var": 1.2327325542410335, "learning_rate": 0.0001, "loss": 1.0159, "loss/crossentropy": 2.550328493118286, "loss/hidden": 0.78515625, "loss/logits": 0.1200067549943924, "loss/reg": 0.011074939742684364, "step": 1971 }, { "epoch": 0.2465, "grad_norm": 2.7992095947265625, "grad_norm_var": 1.2525440065906799, "learning_rate": 0.0001, "loss": 1.0909, "loss/crossentropy": 2.4028799533843994, "loss/hidden": 0.84375, "loss/logits": 0.13644427061080933, "loss/reg": 0.011069980449974537, "step": 1972 }, { "epoch": 0.246625, "grad_norm": 3.7964675426483154, "grad_norm_var": 1.2353292289717575, "learning_rate": 0.0001, "loss": 1.1583, "loss/crossentropy": 2.6295735836029053, "loss/hidden": 0.890625, "loss/logits": 0.1569831818342209, "loss/reg": 0.01106490008533001, "step": 1973 }, { "epoch": 0.24675, "grad_norm": 2.4510080814361572, "grad_norm_var": 0.3749246635942266, "learning_rate": 0.0001, "loss": 1.0464, "loss/crossentropy": 2.5560388565063477, "loss/hidden": 0.79296875, "loss/logits": 0.14279651641845703, "loss/reg": 0.011059917509555817, "step": 1974 }, { "epoch": 0.246875, "grad_norm": 2.3039627075195312, "grad_norm_var": 0.4039935905054485, "learning_rate": 0.0001, "loss": 0.9695, "loss/crossentropy": 2.500410556793213, "loss/hidden": 0.7421875, "loss/logits": 0.11676067113876343, "loss/reg": 0.011055227369070053, "step": 1975 }, { "epoch": 0.247, "grad_norm": 2.6313462257385254, "grad_norm_var": 0.4016504793812447, "learning_rate": 0.0001, "loss": 1.1421, "loss/crossentropy": 2.510258197784424, "loss/hidden": 0.87109375, "loss/logits": 0.16046884655952454, "loss/reg": 0.011049150489270687, "step": 1976 }, { "epoch": 0.247125, "grad_norm": 3.447317361831665, "grad_norm_var": 0.41062862008729606, "learning_rate": 0.0001, "loss": 1.2887, "loss/crossentropy": 2.2219176292419434, "loss/hidden": 1.0, "loss/logits": 0.17826291918754578, "loss/reg": 0.011043058708310127, "step": 1977 }, { "epoch": 0.24725, "grad_norm": 2.774780035018921, "grad_norm_var": 0.40509622860432326, "learning_rate": 0.0001, "loss": 1.1503, "loss/crossentropy": 2.6890835762023926, "loss/hidden": 0.8671875, "loss/logits": 0.1727658063173294, "loss/reg": 0.011036898009479046, "step": 1978 }, { "epoch": 0.247375, "grad_norm": 2.6805171966552734, "grad_norm_var": 0.38639654731341744, "learning_rate": 0.0001, "loss": 1.063, "loss/crossentropy": 2.730665683746338, "loss/hidden": 0.80859375, "loss/logits": 0.14413277804851532, "loss/reg": 0.01103129331022501, "step": 1979 }, { "epoch": 0.2475, "grad_norm": 2.5546305179595947, "grad_norm_var": 0.2229060548882482, "learning_rate": 0.0001, "loss": 1.0903, "loss/crossentropy": 2.460444688796997, "loss/hidden": 0.84375, "loss/logits": 0.13627831637859344, "loss/reg": 0.011025946587324142, "step": 1980 }, { "epoch": 0.247625, "grad_norm": 3.147676706314087, "grad_norm_var": 0.22008522732602553, "learning_rate": 0.0001, "loss": 1.0293, "loss/crossentropy": 2.780017614364624, "loss/hidden": 0.78515625, "loss/logits": 0.13396324217319489, "loss/reg": 0.011021067388355732, "step": 1981 }, { "epoch": 0.24775, "grad_norm": 2.545849561691284, "grad_norm_var": 0.22903709663108504, "learning_rate": 0.0001, "loss": 1.0831, "loss/crossentropy": 2.723945379257202, "loss/hidden": 0.84375, "loss/logits": 0.1291472613811493, "loss/reg": 0.011016342788934708, "step": 1982 }, { "epoch": 0.247875, "grad_norm": 3.5800068378448486, "grad_norm_var": 0.2525227852681437, "learning_rate": 0.0001, "loss": 1.7206, "loss/crossentropy": 1.9947932958602905, "loss/hidden": 1.3203125, "loss/logits": 0.29013922810554504, "loss/reg": 0.01101152878254652, "step": 1983 }, { "epoch": 0.248, "grad_norm": 2.3426930904388428, "grad_norm_var": 0.27745670304271336, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.498370885848999, "loss/hidden": 0.82421875, "loss/logits": 0.13984939455986023, "loss/reg": 0.011006931774318218, "step": 1984 }, { "epoch": 0.248125, "grad_norm": 2.4072375297546387, "grad_norm_var": 0.21579937260172882, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.3923802375793457, "loss/hidden": 0.83984375, "loss/logits": 0.17257341742515564, "loss/reg": 0.011000873520970345, "step": 1985 }, { "epoch": 0.24825, "grad_norm": 4.206304550170898, "grad_norm_var": 0.32547872452598553, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.5781798362731934, "loss/hidden": 1.0234375, "loss/logits": 0.23670879006385803, "loss/reg": 0.010995452292263508, "step": 1986 }, { "epoch": 0.248375, "grad_norm": 5.367579460144043, "grad_norm_var": 0.6992678587503592, "learning_rate": 0.0001, "loss": 1.381, "loss/crossentropy": 2.560682535171509, "loss/hidden": 1.109375, "loss/logits": 0.16169926524162292, "loss/reg": 0.010989362373948097, "step": 1987 }, { "epoch": 0.2485, "grad_norm": 2.6005303859710693, "grad_norm_var": 0.708770234220439, "learning_rate": 0.0001, "loss": 1.0953, "loss/crossentropy": 2.4954795837402344, "loss/hidden": 0.8359375, "loss/logits": 0.149558424949646, "loss/reg": 0.010983383283019066, "step": 1988 }, { "epoch": 0.248625, "grad_norm": 3.366856098175049, "grad_norm_var": 0.6776825224044613, "learning_rate": 0.0001, "loss": 1.1398, "loss/crossentropy": 2.572629451751709, "loss/hidden": 0.8515625, "loss/logits": 0.17850361764431, "loss/reg": 0.01097831316292286, "step": 1989 }, { "epoch": 0.24875, "grad_norm": 2.9385433197021484, "grad_norm_var": 0.6551923075237018, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.4854507446289062, "loss/hidden": 0.83203125, "loss/logits": 0.17227290570735931, "loss/reg": 0.010973170399665833, "step": 1990 }, { "epoch": 0.248875, "grad_norm": 2.1237387657165527, "grad_norm_var": 0.675293446442898, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.4779257774353027, "loss/hidden": 0.78125, "loss/logits": 0.14143508672714233, "loss/reg": 0.01096703764051199, "step": 1991 }, { "epoch": 0.249, "grad_norm": 17.35088539123535, "grad_norm_var": 13.405545245645559, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.1613028049468994, "loss/hidden": 1.2265625, "loss/logits": 0.20763814449310303, "loss/reg": 0.010961621068418026, "step": 1992 }, { "epoch": 0.249125, "grad_norm": 3.643308162689209, "grad_norm_var": 13.394425808799774, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.740511894226074, "loss/hidden": 1.125, "loss/logits": 0.27779725193977356, "loss/reg": 0.010955520905554295, "step": 1993 }, { "epoch": 0.24925, "grad_norm": 2.6261227130889893, "grad_norm_var": 13.419635101303081, "learning_rate": 0.0001, "loss": 1.1103, "loss/crossentropy": 2.4874839782714844, "loss/hidden": 0.859375, "loss/logits": 0.14138638973236084, "loss/reg": 0.010949719697237015, "step": 1994 }, { "epoch": 0.249375, "grad_norm": 2.8772454261779785, "grad_norm_var": 13.388291796772698, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.4892168045043945, "loss/hidden": 0.875, "loss/logits": 0.1620655208826065, "loss/reg": 0.010943672619760036, "step": 1995 }, { "epoch": 0.2495, "grad_norm": 4.019129753112793, "grad_norm_var": 13.244021829599607, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.284893751144409, "loss/hidden": 0.90234375, "loss/logits": 0.17355592548847198, "loss/reg": 0.010938170365989208, "step": 1996 }, { "epoch": 0.249625, "grad_norm": 3.0385830402374268, "grad_norm_var": 13.258203172483755, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.5010361671447754, "loss/hidden": 0.8671875, "loss/logits": 0.18000578880310059, "loss/reg": 0.010932603850960732, "step": 1997 }, { "epoch": 0.24975, "grad_norm": 2.5925393104553223, "grad_norm_var": 13.248884346858516, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 2.6594510078430176, "loss/hidden": 0.8359375, "loss/logits": 0.1377822607755661, "loss/reg": 0.010927069000899792, "step": 1998 }, { "epoch": 0.249875, "grad_norm": 2.673877239227295, "grad_norm_var": 13.359108718093754, "learning_rate": 0.0001, "loss": 0.9031, "loss/crossentropy": 2.683049440383911, "loss/hidden": 0.67578125, "loss/logits": 0.11807288229465485, "loss/reg": 0.010921536944806576, "step": 1999 }, { "epoch": 0.25, "grad_norm": 2.3662562370300293, "grad_norm_var": 13.353902173571798, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.5584921836853027, "loss/hidden": 0.765625, "loss/logits": 0.1257750689983368, "loss/reg": 0.010914883576333523, "step": 2000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.28811723128832e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }