diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 250, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005, + "grad_norm": 1.8149200677871704, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.6055, + "loss/crossentropy": 2.1694753170013428, + "loss/hidden": 0.296875, + "loss/logits": 0.04434104636311531, + "loss/reg": 0.026429571211338043, + "step": 1 + }, + { + "epoch": 0.001, + "grad_norm": 2.5396013259887695, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.6507, + "loss/crossentropy": 2.5328911542892456, + "loss/hidden": 0.314453125, + "loss/logits": 0.07194863818585873, + "loss/reg": 0.026429571211338043, + "step": 2 + }, + { + "epoch": 0.0015, + "grad_norm": 1.488558292388916, + "learning_rate": 6.000000000000001e-07, + "loss": 0.5344, + "loss/crossentropy": 2.451871395111084, + "loss/hidden": 0.2373046875, + "loss/logits": 0.03276057913899422, + "loss/reg": 0.02642955631017685, + "step": 3 + }, + { + "epoch": 0.002, + "grad_norm": 2.1853861808776855, + "learning_rate": 8.000000000000001e-07, + "loss": 0.5659, + "loss/crossentropy": 2.3267983198165894, + "loss/hidden": 0.2646484375, + "loss/logits": 0.03696209378540516, + "loss/reg": 0.02642953395843506, + "step": 4 + }, + { + "epoch": 0.0025, + "grad_norm": 1.4397950172424316, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.5414, + "loss/crossentropy": 2.410401225090027, + "loss/hidden": 0.24462890625, + "loss/logits": 0.03243397735059261, + "loss/reg": 0.02642953023314476, + "step": 5 + }, + { + "epoch": 0.003, + "grad_norm": 5.599375247955322, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.7887, + "loss/crossentropy": 2.808457851409912, + "loss/hidden": 0.4482421875, + "loss/logits": 0.07614399120211601, + "loss/reg": 0.02642950788140297, + "step": 6 + }, + { + "epoch": 0.0035, + "grad_norm": 1.8009779453277588, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.6491, + "loss/crossentropy": 2.0596200227737427, + "loss/hidden": 0.3349609375, + "loss/logits": 0.049886807799339294, + "loss/reg": 0.02642947994172573, + "step": 7 + }, + { + "epoch": 0.004, + "grad_norm": 1.524167776107788, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.5283, + "loss/crossentropy": 2.5316779613494873, + "loss/hidden": 0.234375, + "loss/logits": 0.029637396335601807, + "loss/reg": 0.026429446414113045, + "step": 8 + }, + { + "epoch": 0.0045, + "grad_norm": 1.5922240018844604, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.5713, + "loss/crossentropy": 2.3954519033432007, + "loss/hidden": 0.26171875, + "loss/logits": 0.04526849649846554, + "loss/reg": 0.02642940729856491, + "step": 9 + }, + { + "epoch": 0.005, + "grad_norm": 1.6532399654388428, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5624, + "loss/crossentropy": 2.3280714750289917, + "loss/hidden": 0.2578125, + "loss/logits": 0.040291883051395416, + "loss/reg": 0.02642936259508133, + "step": 10 + }, + { + "epoch": 0.0055, + "grad_norm": 2.008364200592041, + "learning_rate": 2.2e-06, + "loss": 0.5498, + "loss/crossentropy": 2.3053948879241943, + "loss/hidden": 0.24609375, + "loss/logits": 0.039378101006150246, + "loss/reg": 0.026429304853081703, + "step": 11 + }, + { + "epoch": 0.006, + "grad_norm": 1.6782885789871216, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.5776, + "loss/crossentropy": 2.244460344314575, + "loss/hidden": 0.2724609375, + "loss/logits": 0.04084986075758934, + "loss/reg": 0.026429247111082077, + "step": 12 + }, + { + "epoch": 0.0065, + "grad_norm": 1.4042738676071167, + "learning_rate": 2.6e-06, + "loss": 0.5512, + "loss/crossentropy": 2.2852554321289062, + "loss/hidden": 0.25634765625, + "loss/logits": 0.03055955469608307, + "loss/reg": 0.026429180055856705, + "step": 13 + }, + { + "epoch": 0.007, + "grad_norm": 3.2632105350494385, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.5593, + "loss/crossentropy": 2.300649642944336, + "loss/hidden": 0.2568359375, + "loss/logits": 0.03812449052929878, + "loss/reg": 0.02642911858856678, + "step": 14 + }, + { + "epoch": 0.0075, + "grad_norm": 1.1468082666397095, + "learning_rate": 3e-06, + "loss": 0.5263, + "loss/crossentropy": 2.4939738512039185, + "loss/hidden": 0.23046875, + "loss/logits": 0.03151892125606537, + "loss/reg": 0.02642902545630932, + "step": 15 + }, + { + "epoch": 0.008, + "grad_norm": 1.2633907794952393, + "grad_norm_var": 1.1838536622732618, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.5162, + "loss/crossentropy": 2.3341073989868164, + "loss/hidden": 0.22216796875, + "loss/logits": 0.02972777932882309, + "loss/reg": 0.02642892673611641, + "step": 16 + }, + { + "epoch": 0.0085, + "grad_norm": 1.3773301839828491, + "grad_norm_var": 1.2080880649963361, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.57, + "loss/crossentropy": 2.4178860187530518, + "loss/hidden": 0.2705078125, + "loss/logits": 0.03520551137626171, + "loss/reg": 0.02642882987856865, + "step": 17 + }, + { + "epoch": 0.009, + "grad_norm": 2.9784727096557617, + "grad_norm_var": 1.2518295142571243, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7119, + "loss/crossentropy": 2.143317699432373, + "loss/hidden": 0.39453125, + "loss/logits": 0.053122956305742264, + "loss/reg": 0.026428721845149994, + "step": 18 + }, + { + "epoch": 0.0095, + "grad_norm": 3.6081793308258057, + "grad_norm_var": 1.3809537706703447, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.616, + "loss/crossentropy": 2.280970811843872, + "loss/hidden": 0.306640625, + "loss/logits": 0.0450353492051363, + "loss/reg": 0.02642863430082798, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 2.2921319007873535, + "grad_norm_var": 1.3820597339022322, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6631, + "loss/crossentropy": 2.447663187980652, + "loss/hidden": 0.3447265625, + "loss/logits": 0.05406281352043152, + "loss/reg": 0.02642853744328022, + "step": 20 + }, + { + "epoch": 0.0105, + "grad_norm": 1.4713051319122314, + "grad_norm_var": 1.3790775157724358, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5979, + "loss/crossentropy": 2.0740893483161926, + "loss/hidden": 0.28515625, + "loss/logits": 0.04845273308455944, + "loss/reg": 0.02642839401960373, + "step": 21 + }, + { + "epoch": 0.011, + "grad_norm": 1.3936915397644043, + "grad_norm_var": 0.5594726223515398, + "learning_rate": 4.4e-06, + "loss": 0.5342, + "loss/crossentropy": 2.308709979057312, + "loss/hidden": 0.23681640625, + "loss/logits": 0.03306800499558449, + "loss/reg": 0.026428284123539925, + "step": 22 + }, + { + "epoch": 0.0115, + "grad_norm": 1.5905181169509888, + "grad_norm_var": 0.5651179587326415, + "learning_rate": 4.600000000000001e-06, + "loss": 0.5387, + "loss/crossentropy": 2.518093228340149, + "loss/hidden": 0.2392578125, + "loss/logits": 0.03512653335928917, + "loss/reg": 0.02642817609012127, + "step": 23 + }, + { + "epoch": 0.012, + "grad_norm": 1.5539664030075073, + "grad_norm_var": 0.5637185598957045, + "learning_rate": 4.800000000000001e-06, + "loss": 0.5216, + "loss/crossentropy": 2.4222742319107056, + "loss/hidden": 0.22900390625, + "loss/logits": 0.028284232132136822, + "loss/reg": 0.02642805129289627, + "step": 24 + }, + { + "epoch": 0.0125, + "grad_norm": 1.4515613317489624, + "grad_norm_var": 0.5705814698960205, + "learning_rate": 5e-06, + "loss": 0.5546, + "loss/crossentropy": 2.1840826272964478, + "loss/hidden": 0.26025390625, + "loss/logits": 0.03005337156355381, + "loss/reg": 0.02642790600657463, + "step": 25 + }, + { + "epoch": 0.013, + "grad_norm": 1.3925954103469849, + "grad_norm_var": 0.5828268281563851, + "learning_rate": 5.2e-06, + "loss": 0.5187, + "loss/crossentropy": 2.417304754257202, + "loss/hidden": 0.2255859375, + "loss/logits": 0.028857468627393246, + "loss/reg": 0.0264277420938015, + "step": 26 + }, + { + "epoch": 0.0135, + "grad_norm": 1.3494521379470825, + "grad_norm_var": 0.5975540703483029, + "learning_rate": 5.400000000000001e-06, + "loss": 0.581, + "loss/crossentropy": 2.4872124195098877, + "loss/hidden": 0.275390625, + "loss/logits": 0.04128789156675339, + "loss/reg": 0.02642754837870598, + "step": 27 + }, + { + "epoch": 0.014, + "grad_norm": 1.7983005046844482, + "grad_norm_var": 0.5960914554887113, + "learning_rate": 5.600000000000001e-06, + "loss": 0.5793, + "loss/crossentropy": 2.5761152505874634, + "loss/hidden": 0.28125, + "loss/logits": 0.03379652462899685, + "loss/reg": 0.02642735280096531, + "step": 28 + }, + { + "epoch": 0.0145, + "grad_norm": 1.2769767045974731, + "grad_norm_var": 0.6043886156117831, + "learning_rate": 5.8e-06, + "loss": 0.5439, + "loss/crossentropy": 2.338332176208496, + "loss/hidden": 0.24658203125, + "loss/logits": 0.03306223638355732, + "loss/reg": 0.02642717957496643, + "step": 29 + }, + { + "epoch": 0.015, + "grad_norm": 1.1405447721481323, + "grad_norm_var": 0.47908970131792705, + "learning_rate": 6e-06, + "loss": 0.4911, + "loss/crossentropy": 2.541923403739929, + "loss/hidden": 0.201171875, + "loss/logits": 0.025660399347543716, + "loss/reg": 0.026426956057548523, + "step": 30 + }, + { + "epoch": 0.0155, + "grad_norm": 1.4948232173919678, + "grad_norm_var": 0.4613230136594038, + "learning_rate": 6.200000000000001e-06, + "loss": 0.5016, + "loss/crossentropy": 2.3482922315597534, + "loss/hidden": 0.2109375, + "loss/logits": 0.026443324983119965, + "loss/reg": 0.026426780968904495, + "step": 31 + }, + { + "epoch": 0.016, + "grad_norm": 1.9969562292099, + "grad_norm_var": 0.45082540579723746, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.5719, + "loss/crossentropy": 2.294642925262451, + "loss/hidden": 0.27392578125, + "loss/logits": 0.03373559284955263, + "loss/reg": 0.02642657607793808, + "step": 32 + }, + { + "epoch": 0.0165, + "grad_norm": 1.2221813201904297, + "grad_norm_var": 0.46025475791477294, + "learning_rate": 6.600000000000001e-06, + "loss": 0.5252, + "loss/crossentropy": 2.3495378494262695, + "loss/hidden": 0.22900390625, + "loss/logits": 0.03191899135708809, + "loss/reg": 0.026426387950778008, + "step": 33 + }, + { + "epoch": 0.017, + "grad_norm": 1.5299986600875854, + "grad_norm_var": 0.3542705004937232, + "learning_rate": 6.800000000000001e-06, + "loss": 0.5302, + "loss/crossentropy": 2.421632170677185, + "loss/hidden": 0.23095703125, + "loss/logits": 0.03493742551654577, + "loss/reg": 0.02642618492245674, + "step": 34 + }, + { + "epoch": 0.0175, + "grad_norm": 1.3914459943771362, + "grad_norm_var": 0.08563591942271481, + "learning_rate": 7e-06, + "loss": 0.5396, + "loss/crossentropy": 2.501790404319763, + "loss/hidden": 0.2412109375, + "loss/logits": 0.03411697968840599, + "loss/reg": 0.02642594650387764, + "step": 35 + }, + { + "epoch": 0.018, + "grad_norm": 1.2283117771148682, + "grad_norm_var": 0.04708121548131293, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.5276, + "loss/crossentropy": 2.369629979133606, + "loss/hidden": 0.22802734375, + "loss/logits": 0.03534604236483574, + "loss/reg": 0.026425734162330627, + "step": 36 + }, + { + "epoch": 0.0185, + "grad_norm": 2.441415548324585, + "grad_norm_var": 0.1079886358057666, + "learning_rate": 7.4e-06, + "loss": 0.6458, + "loss/crossentropy": 2.286492109298706, + "loss/hidden": 0.333984375, + "loss/logits": 0.047583552077412605, + "loss/reg": 0.02642551064491272, + "step": 37 + }, + { + "epoch": 0.019, + "grad_norm": 1.4532129764556885, + "grad_norm_var": 0.107241014688942, + "learning_rate": 7.600000000000001e-06, + "loss": 0.5401, + "loss/crossentropy": 2.5449851751327515, + "loss/hidden": 0.24072265625, + "loss/logits": 0.035121435299515724, + "loss/reg": 0.026425251737236977, + "step": 38 + }, + { + "epoch": 0.0195, + "grad_norm": 2.312504768371582, + "grad_norm_var": 0.1466550400336051, + "learning_rate": 7.800000000000002e-06, + "loss": 0.6225, + "loss/crossentropy": 2.248945951461792, + "loss/hidden": 0.3115234375, + "loss/logits": 0.04672851786017418, + "loss/reg": 0.02642502635717392, + "step": 39 + }, + { + "epoch": 0.02, + "grad_norm": 1.8888795375823975, + "grad_norm_var": 0.15318881349217175, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5732, + "loss/crossentropy": 2.476779580116272, + "loss/hidden": 0.26953125, + "loss/logits": 0.03942425549030304, + "loss/reg": 0.02642476186156273, + "step": 40 + }, + { + "epoch": 0.0205, + "grad_norm": 1.5608994960784912, + "grad_norm_var": 0.15198231846540716, + "learning_rate": 8.2e-06, + "loss": 0.5381, + "loss/crossentropy": 2.38312304019928, + "loss/hidden": 0.24072265625, + "loss/logits": 0.03310199826955795, + "loss/reg": 0.0264244694262743, + "step": 41 + }, + { + "epoch": 0.021, + "grad_norm": 1.2987440824508667, + "grad_norm_var": 0.15503315722776131, + "learning_rate": 8.400000000000001e-06, + "loss": 0.4821, + "loss/crossentropy": 2.515058755874634, + "loss/hidden": 0.193359375, + "loss/logits": 0.024530705995857716, + "loss/reg": 0.026424190029501915, + "step": 42 + }, + { + "epoch": 0.0215, + "grad_norm": 1.6956250667572021, + "grad_norm_var": 0.15157974603150895, + "learning_rate": 8.6e-06, + "loss": 0.6312, + "loss/crossentropy": 2.2517104148864746, + "loss/hidden": 0.3251953125, + "loss/logits": 0.041729243472218513, + "loss/reg": 0.026423903182148933, + "step": 43 + }, + { + "epoch": 0.022, + "grad_norm": 1.4108027219772339, + "grad_norm_var": 0.15114137609056771, + "learning_rate": 8.8e-06, + "loss": 0.5171, + "loss/crossentropy": 2.4841147661209106, + "loss/hidden": 0.22412109375, + "loss/logits": 0.02869710698723793, + "loss/reg": 0.026423608884215355, + "step": 44 + }, + { + "epoch": 0.0225, + "grad_norm": 1.3235130310058594, + "grad_norm_var": 0.14937195903162886, + "learning_rate": 9e-06, + "loss": 0.52, + "loss/crossentropy": 2.2738723754882812, + "loss/hidden": 0.2275390625, + "loss/logits": 0.028268495574593544, + "loss/reg": 0.026423312723636627, + "step": 45 + }, + { + "epoch": 0.023, + "grad_norm": 1.3061593770980835, + "grad_norm_var": 0.14123057510749676, + "learning_rate": 9.200000000000002e-06, + "loss": 0.4971, + "loss/crossentropy": 2.326944351196289, + "loss/hidden": 0.2080078125, + "loss/logits": 0.024814478121697903, + "loss/reg": 0.026423051953315735, + "step": 46 + }, + { + "epoch": 0.0235, + "grad_norm": 1.972931146621704, + "grad_norm_var": 0.14898989683507768, + "learning_rate": 9.4e-06, + "loss": 0.5559, + "loss/crossentropy": 2.4830580949783325, + "loss/hidden": 0.26025390625, + "loss/logits": 0.03139444626867771, + "loss/reg": 0.026422718539834023, + "step": 47 + }, + { + "epoch": 0.024, + "grad_norm": 1.698211669921875, + "grad_norm_var": 0.13983553268258544, + "learning_rate": 9.600000000000001e-06, + "loss": 0.5664, + "loss/crossentropy": 2.416160821914673, + "loss/hidden": 0.2578125, + "loss/logits": 0.04433598928153515, + "loss/reg": 0.026422368362545967, + "step": 48 + }, + { + "epoch": 0.0245, + "grad_norm": 7.356233596801758, + "grad_norm_var": 2.1755974425683684, + "learning_rate": 9.800000000000001e-06, + "loss": 0.6608, + "loss/crossentropy": 2.1511300802230835, + "loss/hidden": 0.3544921875, + "loss/logits": 0.042037611827254295, + "loss/reg": 0.02642207033932209, + "step": 49 + }, + { + "epoch": 0.025, + "grad_norm": 1.6962363719940186, + "grad_norm_var": 2.1670886649573458, + "learning_rate": 1e-05, + "loss": 0.5114, + "loss/crossentropy": 2.493433117866516, + "loss/hidden": 0.21875, + "loss/logits": 0.028404117561876774, + "loss/reg": 0.026421738788485527, + "step": 50 + }, + { + "epoch": 0.0255, + "grad_norm": 1.5135979652404785, + "grad_norm_var": 2.1580740006997834, + "learning_rate": 1.02e-05, + "loss": 0.4992, + "loss/crossentropy": 2.4469869136810303, + "loss/hidden": 0.20947265625, + "loss/logits": 0.025465765967965126, + "loss/reg": 0.02642141655087471, + "step": 51 + }, + { + "epoch": 0.026, + "grad_norm": 2.1058454513549805, + "grad_norm_var": 2.1147619503580235, + "learning_rate": 1.04e-05, + "loss": 0.5947, + "loss/crossentropy": 2.0783703327178955, + "loss/hidden": 0.29736328125, + "loss/logits": 0.03310043551027775, + "loss/reg": 0.02642105147242546, + "step": 52 + }, + { + "epoch": 0.0265, + "grad_norm": 1.4466326236724854, + "grad_norm_var": 2.126641614633889, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.5175, + "loss/crossentropy": 2.5233154296875, + "loss/hidden": 0.22607421875, + "loss/logits": 0.027255047112703323, + "loss/reg": 0.0264207124710083, + "step": 53 + }, + { + "epoch": 0.027, + "grad_norm": 1.2315421104431152, + "grad_norm_var": 2.145947583831748, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.4939, + "loss/crossentropy": 2.482948899269104, + "loss/hidden": 0.20263671875, + "loss/logits": 0.02701568230986595, + "loss/reg": 0.02642032690346241, + "step": 54 + }, + { + "epoch": 0.0275, + "grad_norm": 1.3502835035324097, + "grad_norm_var": 2.1622647893893476, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.5642, + "loss/crossentropy": 2.4037868976593018, + "loss/hidden": 0.26220703125, + "loss/logits": 0.03778073936700821, + "loss/reg": 0.02641993761062622, + "step": 55 + }, + { + "epoch": 0.028, + "grad_norm": 1.66973078250885, + "grad_norm_var": 2.166424380346859, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.5326, + "loss/crossentropy": 2.3202576637268066, + "loss/hidden": 0.23681640625, + "loss/logits": 0.03157219849526882, + "loss/reg": 0.02641947939991951, + "step": 56 + }, + { + "epoch": 0.0285, + "grad_norm": 1.4568390846252441, + "grad_norm_var": 2.1720116007570036, + "learning_rate": 1.14e-05, + "loss": 0.5873, + "loss/crossentropy": 2.3086094856262207, + "loss/hidden": 0.27783203125, + "loss/logits": 0.0453144833445549, + "loss/reg": 0.026419078931212425, + "step": 57 + }, + { + "epoch": 0.029, + "grad_norm": 1.2021527290344238, + "grad_norm_var": 2.1804451998311927, + "learning_rate": 1.16e-05, + "loss": 0.4861, + "loss/crossentropy": 2.5664894580841064, + "loss/hidden": 0.19580078125, + "loss/logits": 0.0260773915797472, + "loss/reg": 0.02641867846250534, + "step": 58 + }, + { + "epoch": 0.0295, + "grad_norm": 1.2372887134552002, + "grad_norm_var": 2.2062031636320434, + "learning_rate": 1.18e-05, + "loss": 0.5491, + "loss/crossentropy": 2.3016046285629272, + "loss/hidden": 0.2490234375, + "loss/logits": 0.035935116931796074, + "loss/reg": 0.0264182947576046, + "step": 59 + }, + { + "epoch": 0.03, + "grad_norm": 1.4047211408615112, + "grad_norm_var": 2.206580767441871, + "learning_rate": 1.2e-05, + "loss": 0.5279, + "loss/crossentropy": 2.2995004653930664, + "loss/hidden": 0.23095703125, + "loss/logits": 0.032775900326669216, + "loss/reg": 0.026417918503284454, + "step": 60 + }, + { + "epoch": 0.0305, + "grad_norm": 1.2555537223815918, + "grad_norm_var": 2.211850675210066, + "learning_rate": 1.22e-05, + "loss": 0.5124, + "loss/crossentropy": 2.3773516416549683, + "loss/hidden": 0.22021484375, + "loss/logits": 0.028029106557369232, + "loss/reg": 0.026417534798383713, + "step": 61 + }, + { + "epoch": 0.031, + "grad_norm": 1.3694956302642822, + "grad_norm_var": 2.207348318396743, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.5246, + "loss/crossentropy": 2.462360382080078, + "loss/hidden": 0.2294921875, + "loss/logits": 0.030931759625673294, + "loss/reg": 0.026417037472128868, + "step": 62 + }, + { + "epoch": 0.0315, + "grad_norm": 0.8940879106521606, + "grad_norm_var": 2.2657112396397707, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.4918, + "loss/crossentropy": 2.4237685203552246, + "loss/hidden": 0.20166015625, + "loss/logits": 0.026003433391451836, + "loss/reg": 0.02641662023961544, + "step": 63 + }, + { + "epoch": 0.032, + "grad_norm": 1.3153444528579712, + "grad_norm_var": 2.2803513495186505, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.5112, + "loss/crossentropy": 2.3414171934127808, + "loss/hidden": 0.220703125, + "loss/logits": 0.026362700387835503, + "loss/reg": 0.026416106149554253, + "step": 64 + }, + { + "epoch": 0.0325, + "grad_norm": 1.281063437461853, + "grad_norm_var": 0.0715017189536231, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.52, + "loss/crossentropy": 2.420620918273926, + "loss/hidden": 0.2255859375, + "loss/logits": 0.030298630706965923, + "loss/reg": 0.026415672153234482, + "step": 65 + }, + { + "epoch": 0.033, + "grad_norm": 1.3108336925506592, + "grad_norm_var": 0.0656601505461642, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.5189, + "loss/crossentropy": 2.2853455543518066, + "loss/hidden": 0.22265625, + "loss/logits": 0.0321119399741292, + "loss/reg": 0.026415223255753517, + "step": 66 + }, + { + "epoch": 0.0335, + "grad_norm": 1.0983670949935913, + "grad_norm_var": 0.06891859533181677, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.5318, + "loss/crossentropy": 2.3410117626190186, + "loss/hidden": 0.23681640625, + "loss/logits": 0.030876386910676956, + "loss/reg": 0.026414690539240837, + "step": 67 + }, + { + "epoch": 0.034, + "grad_norm": 1.7166627645492554, + "grad_norm_var": 0.039260036839271824, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.5701, + "loss/crossentropy": 2.407397150993347, + "loss/hidden": 0.2685546875, + "loss/logits": 0.03741579130291939, + "loss/reg": 0.026414209976792336, + "step": 68 + }, + { + "epoch": 0.0345, + "grad_norm": 0.9616859555244446, + "grad_norm_var": 0.046257726034885954, + "learning_rate": 1.38e-05, + "loss": 0.455, + "loss/crossentropy": 2.5552138090133667, + "loss/hidden": 0.169921875, + "loss/logits": 0.02096631657332182, + "loss/reg": 0.02641364373266697, + "step": 69 + }, + { + "epoch": 0.035, + "grad_norm": 1.3926982879638672, + "grad_norm_var": 0.046469501868423045, + "learning_rate": 1.4e-05, + "loss": 0.5899, + "loss/crossentropy": 2.184352159500122, + "loss/hidden": 0.2880859375, + "loss/logits": 0.03772860765457153, + "loss/reg": 0.02641312964260578, + "step": 70 + }, + { + "epoch": 0.0355, + "grad_norm": 1.6911873817443848, + "grad_norm_var": 0.055686708202271486, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.5065, + "loss/crossentropy": 2.3083138465881348, + "loss/hidden": 0.21533203125, + "loss/logits": 0.026994884945452213, + "loss/reg": 0.02641255594789982, + "step": 71 + }, + { + "epoch": 0.036, + "grad_norm": 1.5207164287567139, + "grad_norm_var": 0.05029689369081134, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.5155, + "loss/crossentropy": 2.514549493789673, + "loss/hidden": 0.2265625, + "loss/logits": 0.024821529164910316, + "loss/reg": 0.02641192451119423, + "step": 72 + }, + { + "epoch": 0.0365, + "grad_norm": 1.5217493772506714, + "grad_norm_var": 0.05175064306116064, + "learning_rate": 1.46e-05, + "loss": 0.5331, + "loss/crossentropy": 2.2549461126327515, + "loss/hidden": 0.2353515625, + "loss/logits": 0.03362779691815376, + "loss/reg": 0.026411263272166252, + "step": 73 + }, + { + "epoch": 0.037, + "grad_norm": 1.4319448471069336, + "grad_norm_var": 0.05133754544456459, + "learning_rate": 1.48e-05, + "loss": 0.543, + "loss/crossentropy": 2.2208691835403442, + "loss/hidden": 0.251953125, + "loss/logits": 0.026933430694043636, + "loss/reg": 0.026410607621073723, + "step": 74 + }, + { + "epoch": 0.0375, + "grad_norm": 1.5548027753829956, + "grad_norm_var": 0.05338703002904901, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.5053, + "loss/crossentropy": 2.4420419931411743, + "loss/hidden": 0.21630859375, + "loss/logits": 0.02489750273525715, + "loss/reg": 0.026409907266497612, + "step": 75 + }, + { + "epoch": 0.038, + "grad_norm": 1.0714695453643799, + "grad_norm_var": 0.058232407176660186, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.4938, + "loss/crossentropy": 2.3792872428894043, + "loss/hidden": 0.20458984375, + "loss/logits": 0.025158749893307686, + "loss/reg": 0.026409264653921127, + "step": 76 + }, + { + "epoch": 0.0385, + "grad_norm": 1.2519381046295166, + "grad_norm_var": 0.05827235736891852, + "learning_rate": 1.54e-05, + "loss": 0.4813, + "loss/crossentropy": 2.3257339000701904, + "loss/hidden": 0.1962890625, + "loss/logits": 0.02092126850038767, + "loss/reg": 0.026408692821860313, + "step": 77 + }, + { + "epoch": 0.039, + "grad_norm": 1.2653789520263672, + "grad_norm_var": 0.05849186368942368, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.5246, + "loss/crossentropy": 2.5811800956726074, + "loss/hidden": 0.23095703125, + "loss/logits": 0.029558134265244007, + "loss/reg": 0.02640816569328308, + "step": 78 + }, + { + "epoch": 0.0395, + "grad_norm": 2.259216070175171, + "grad_norm_var": 0.09562263018362811, + "learning_rate": 1.58e-05, + "loss": 0.5206, + "loss/crossentropy": 2.4250094890594482, + "loss/hidden": 0.2265625, + "loss/logits": 0.03000534698367119, + "loss/reg": 0.026407474651932716, + "step": 79 + }, + { + "epoch": 0.04, + "grad_norm": 1.7354488372802734, + "grad_norm_var": 0.10105330191861767, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.5139, + "loss/crossentropy": 2.3155272006988525, + "loss/hidden": 0.22509765625, + "loss/logits": 0.02471769694238901, + "loss/reg": 0.026406768709421158, + "step": 80 + }, + { + "epoch": 0.0405, + "grad_norm": 1.6819829940795898, + "grad_norm_var": 0.1025191577706432, + "learning_rate": 1.62e-05, + "loss": 0.5918, + "loss/crossentropy": 2.446201205253601, + "loss/hidden": 0.2861328125, + "loss/logits": 0.04164840281009674, + "loss/reg": 0.02640613541007042, + "step": 81 + }, + { + "epoch": 0.041, + "grad_norm": 1.1699199676513672, + "grad_norm_var": 0.1066873821895888, + "learning_rate": 1.64e-05, + "loss": 0.5134, + "loss/crossentropy": 2.456650495529175, + "loss/hidden": 0.2177734375, + "loss/logits": 0.031566061079502106, + "loss/reg": 0.026405589655041695, + "step": 82 + }, + { + "epoch": 0.0415, + "grad_norm": 1.0190843343734741, + "grad_norm_var": 0.11088006372520322, + "learning_rate": 1.66e-05, + "loss": 0.4661, + "loss/crossentropy": 2.4336618185043335, + "loss/hidden": 0.18115234375, + "loss/logits": 0.02087457850575447, + "loss/reg": 0.0264048483222723, + "step": 83 + }, + { + "epoch": 0.042, + "grad_norm": 1.3154826164245605, + "grad_norm_var": 0.10682859054876676, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.5325, + "loss/crossentropy": 2.4335875511169434, + "loss/hidden": 0.2392578125, + "loss/logits": 0.029201870784163475, + "loss/reg": 0.026404235512018204, + "step": 84 + }, + { + "epoch": 0.0425, + "grad_norm": 1.1499496698379517, + "grad_norm_var": 0.0973436240677034, + "learning_rate": 1.7e-05, + "loss": 0.4708, + "loss/crossentropy": 2.3389049768447876, + "loss/hidden": 0.1826171875, + "loss/logits": 0.024132695980370045, + "loss/reg": 0.02640344202518463, + "step": 85 + }, + { + "epoch": 0.043, + "grad_norm": 1.07028067111969, + "grad_norm_var": 0.10585526029347007, + "learning_rate": 1.72e-05, + "loss": 0.4749, + "loss/crossentropy": 2.347036838531494, + "loss/hidden": 0.18896484375, + "loss/logits": 0.02186472900211811, + "loss/reg": 0.02640284039080143, + "step": 86 + }, + { + "epoch": 0.0435, + "grad_norm": 2.0228259563446045, + "grad_norm_var": 0.12474687162745439, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.5076, + "loss/crossentropy": 2.3726253509521484, + "loss/hidden": 0.21240234375, + "loss/logits": 0.03117395006120205, + "loss/reg": 0.026402218267321587, + "step": 87 + }, + { + "epoch": 0.044, + "grad_norm": 1.689095377922058, + "grad_norm_var": 0.12832789033596606, + "learning_rate": 1.76e-05, + "loss": 0.5393, + "loss/crossentropy": 2.6106048822402954, + "loss/hidden": 0.2451171875, + "loss/logits": 0.030183403752744198, + "loss/reg": 0.026401378214359283, + "step": 88 + }, + { + "epoch": 0.0445, + "grad_norm": 1.4513983726501465, + "grad_norm_var": 0.1279703973651166, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.5203, + "loss/crossentropy": 2.3146345615386963, + "loss/hidden": 0.22705078125, + "loss/logits": 0.029247512109577656, + "loss/reg": 0.02640063315629959, + "step": 89 + }, + { + "epoch": 0.045, + "grad_norm": 1.0706562995910645, + "grad_norm_var": 0.13681825045996157, + "learning_rate": 1.8e-05, + "loss": 0.472, + "loss/crossentropy": 2.458780884742737, + "loss/hidden": 0.18310546875, + "loss/logits": 0.024928967468440533, + "loss/reg": 0.026399986818432808, + "step": 90 + }, + { + "epoch": 0.0455, + "grad_norm": 1.243531346321106, + "grad_norm_var": 0.13743203065561993, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.46, + "loss/crossentropy": 2.273237943649292, + "loss/hidden": 0.1748046875, + "loss/logits": 0.021158389747142792, + "loss/reg": 0.026399333029985428, + "step": 91 + }, + { + "epoch": 0.046, + "grad_norm": 1.248246669769287, + "grad_norm_var": 0.13154193773160655, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.5025, + "loss/crossentropy": 2.7035024166107178, + "loss/hidden": 0.2138671875, + "loss/logits": 0.024649174883961678, + "loss/reg": 0.026398882269859314, + "step": 92 + }, + { + "epoch": 0.0465, + "grad_norm": 1.5103347301483154, + "grad_norm_var": 0.13008748368884535, + "learning_rate": 1.86e-05, + "loss": 0.4831, + "loss/crossentropy": 2.3471440076828003, + "loss/hidden": 0.193359375, + "loss/logits": 0.025768487714231014, + "loss/reg": 0.026398463174700737, + "step": 93 + }, + { + "epoch": 0.047, + "grad_norm": 1.6160238981246948, + "grad_norm_var": 0.1300087857040161, + "learning_rate": 1.88e-05, + "loss": 0.5294, + "loss/crossentropy": 2.2618273496627808, + "loss/hidden": 0.23681640625, + "loss/logits": 0.028604180552065372, + "loss/reg": 0.02639804780483246, + "step": 94 + }, + { + "epoch": 0.0475, + "grad_norm": 1.6858937740325928, + "grad_norm_var": 0.08894905728247575, + "learning_rate": 1.9e-05, + "loss": 0.5078, + "loss/crossentropy": 2.2833045721054077, + "loss/hidden": 0.21630859375, + "loss/logits": 0.027537615969777107, + "loss/reg": 0.0263975840061903, + "step": 95 + }, + { + "epoch": 0.048, + "grad_norm": 1.3516885042190552, + "grad_norm_var": 0.08188523397349545, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.5472, + "loss/crossentropy": 2.288330078125, + "loss/hidden": 0.25244140625, + "loss/logits": 0.030751955695450306, + "loss/reg": 0.026396671310067177, + "step": 96 + }, + { + "epoch": 0.0485, + "grad_norm": 1.1249408721923828, + "grad_norm_var": 0.07985427321204851, + "learning_rate": 1.94e-05, + "loss": 0.4777, + "loss/crossentropy": 2.3718440532684326, + "loss/hidden": 0.1923828125, + "loss/logits": 0.021322906017303467, + "loss/reg": 0.026395753026008606, + "step": 97 + }, + { + "epoch": 0.049, + "grad_norm": 1.2627309560775757, + "grad_norm_var": 0.07805640745137694, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.5154, + "loss/crossentropy": 2.2158303260803223, + "loss/hidden": 0.21923828125, + "loss/logits": 0.03221841435879469, + "loss/reg": 0.026394877582788467, + "step": 98 + }, + { + "epoch": 0.0495, + "grad_norm": 1.2408559322357178, + "grad_norm_var": 0.07091623482815752, + "learning_rate": 1.98e-05, + "loss": 0.5423, + "loss/crossentropy": 2.3226230144500732, + "loss/hidden": 0.23828125, + "loss/logits": 0.04008352570235729, + "loss/reg": 0.026393810287117958, + "step": 99 + }, + { + "epoch": 0.05, + "grad_norm": 1.1801763772964478, + "grad_norm_var": 0.07319502933235617, + "learning_rate": 2e-05, + "loss": 0.5028, + "loss/crossentropy": 2.242385983467102, + "loss/hidden": 0.20947265625, + "loss/logits": 0.02935761120170355, + "loss/reg": 0.026393063366413116, + "step": 100 + }, + { + "epoch": 0.0505, + "grad_norm": 1.55876624584198, + "grad_norm_var": 0.07165068938641829, + "learning_rate": 2e-05, + "loss": 0.6025, + "loss/crossentropy": 2.240237832069397, + "loss/hidden": 0.28955078125, + "loss/logits": 0.04904773272573948, + "loss/reg": 0.02639206498861313, + "step": 101 + }, + { + "epoch": 0.051, + "grad_norm": 2.615293025970459, + "grad_norm_var": 0.15385355345349763, + "learning_rate": 2e-05, + "loss": 0.5793, + "loss/crossentropy": 2.32190477848053, + "loss/hidden": 0.283203125, + "loss/logits": 0.03213760443031788, + "loss/reg": 0.026391005143523216, + "step": 102 + }, + { + "epoch": 0.0515, + "grad_norm": 1.30605149269104, + "grad_norm_var": 0.1352356444631638, + "learning_rate": 2e-05, + "loss": 0.4823, + "loss/crossentropy": 2.4284926652908325, + "loss/hidden": 0.193359375, + "loss/logits": 0.025081547908484936, + "loss/reg": 0.02638992853462696, + "step": 103 + }, + { + "epoch": 0.052, + "grad_norm": 1.141875147819519, + "grad_norm_var": 0.13630413553074583, + "learning_rate": 2e-05, + "loss": 0.508, + "loss/crossentropy": 2.3841702938079834, + "loss/hidden": 0.2158203125, + "loss/logits": 0.02830567955970764, + "loss/reg": 0.026389040052890778, + "step": 104 + }, + { + "epoch": 0.0525, + "grad_norm": 1.3670423030853271, + "grad_norm_var": 0.1363173233399147, + "learning_rate": 2e-05, + "loss": 0.5535, + "loss/crossentropy": 2.3601726293563843, + "loss/hidden": 0.25048828125, + "loss/logits": 0.03918229416012764, + "loss/reg": 0.02638789638876915, + "step": 105 + }, + { + "epoch": 0.053, + "grad_norm": 1.5876195430755615, + "grad_norm_var": 0.1297847067338589, + "learning_rate": 2e-05, + "loss": 0.5052, + "loss/crossentropy": 2.3636826276779175, + "loss/hidden": 0.21484375, + "loss/logits": 0.026523033156991005, + "loss/reg": 0.02638677880167961, + "step": 106 + }, + { + "epoch": 0.0535, + "grad_norm": 1.3877314329147339, + "grad_norm_var": 0.12730558444343335, + "learning_rate": 2e-05, + "loss": 0.5385, + "loss/crossentropy": 2.2610294818878174, + "loss/hidden": 0.2421875, + "loss/logits": 0.03246981091797352, + "loss/reg": 0.02638590894639492, + "step": 107 + }, + { + "epoch": 0.054, + "grad_norm": 1.3986035585403442, + "grad_norm_var": 0.12469232217100033, + "learning_rate": 2e-05, + "loss": 0.5716, + "loss/crossentropy": 2.212199330329895, + "loss/hidden": 0.2626953125, + "loss/logits": 0.045047592371702194, + "loss/reg": 0.026384945958852768, + "step": 108 + }, + { + "epoch": 0.0545, + "grad_norm": 1.056304931640625, + "grad_norm_var": 0.1344369200763623, + "learning_rate": 2e-05, + "loss": 0.4859, + "loss/crossentropy": 2.617898106575012, + "loss/hidden": 0.193359375, + "loss/logits": 0.02872647438198328, + "loss/reg": 0.026384029537439346, + "step": 109 + }, + { + "epoch": 0.055, + "grad_norm": 3.573809862136841, + "grad_norm_var": 0.42252804674691424, + "learning_rate": 2e-05, + "loss": 0.6846, + "loss/crossentropy": 2.3089191913604736, + "loss/hidden": 0.33984375, + "loss/logits": 0.08094017207622528, + "loss/reg": 0.026383111253380775, + "step": 110 + }, + { + "epoch": 0.0555, + "grad_norm": 1.2283390760421753, + "grad_norm_var": 0.42747247360055096, + "learning_rate": 2e-05, + "loss": 0.5406, + "loss/crossentropy": 2.066853880882263, + "loss/hidden": 0.24658203125, + "loss/logits": 0.030207395553588867, + "loss/reg": 0.026382330805063248, + "step": 111 + }, + { + "epoch": 0.056, + "grad_norm": 1.1344459056854248, + "grad_norm_var": 0.4354093100708122, + "learning_rate": 2e-05, + "loss": 0.4825, + "loss/crossentropy": 2.4759345054626465, + "loss/hidden": 0.19384765625, + "loss/logits": 0.02485422883182764, + "loss/reg": 0.02638155035674572, + "step": 112 + }, + { + "epoch": 0.0565, + "grad_norm": 1.476331353187561, + "grad_norm_var": 0.425072268588233, + "learning_rate": 2e-05, + "loss": 0.4962, + "loss/crossentropy": 2.257875084877014, + "loss/hidden": 0.20751953125, + "loss/logits": 0.024832582101225853, + "loss/reg": 0.026380501687526703, + "step": 113 + }, + { + "epoch": 0.057, + "grad_norm": 1.1095691919326782, + "grad_norm_var": 0.43204239892278623, + "learning_rate": 2e-05, + "loss": 0.494, + "loss/crossentropy": 2.5208946466445923, + "loss/hidden": 0.20361328125, + "loss/logits": 0.02655248437076807, + "loss/reg": 0.026379412040114403, + "step": 114 + }, + { + "epoch": 0.0575, + "grad_norm": 1.2755762338638306, + "grad_norm_var": 0.4308130924435341, + "learning_rate": 2e-05, + "loss": 0.494, + "loss/crossentropy": 2.5310138463974, + "loss/hidden": 0.20458984375, + "loss/logits": 0.025613101199269295, + "loss/reg": 0.026378460228443146, + "step": 115 + }, + { + "epoch": 0.058, + "grad_norm": 1.1098158359527588, + "grad_norm_var": 0.4343559906164728, + "learning_rate": 2e-05, + "loss": 0.4746, + "loss/crossentropy": 2.6709823608398438, + "loss/hidden": 0.18701171875, + "loss/logits": 0.023796855472028255, + "loss/reg": 0.02637753076851368, + "step": 116 + }, + { + "epoch": 0.0585, + "grad_norm": 1.940610647201538, + "grad_norm_var": 0.44541967059291204, + "learning_rate": 2e-05, + "loss": 0.5661, + "loss/crossentropy": 2.4929665327072144, + "loss/hidden": 0.26611328125, + "loss/logits": 0.036208903416991234, + "loss/reg": 0.026376651600003242, + "step": 117 + }, + { + "epoch": 0.059, + "grad_norm": 2.426042318344116, + "grad_norm_var": 0.42063368115552174, + "learning_rate": 2e-05, + "loss": 0.5937, + "loss/crossentropy": 2.052187740802765, + "loss/hidden": 0.298828125, + "loss/logits": 0.031148137524724007, + "loss/reg": 0.026375625282526016, + "step": 118 + }, + { + "epoch": 0.0595, + "grad_norm": 1.9228861331939697, + "grad_norm_var": 0.4257910091262336, + "learning_rate": 2e-05, + "loss": 0.6001, + "loss/crossentropy": 2.324827551841736, + "loss/hidden": 0.291015625, + "loss/logits": 0.04536169767379761, + "loss/reg": 0.02637471817433834, + "step": 119 + }, + { + "epoch": 0.06, + "grad_norm": 1.3524922132492065, + "grad_norm_var": 0.41651161943220427, + "learning_rate": 2e-05, + "loss": 0.5096, + "loss/crossentropy": 2.5075470209121704, + "loss/hidden": 0.2177734375, + "loss/logits": 0.028110167011618614, + "loss/reg": 0.02637365460395813, + "step": 120 + }, + { + "epoch": 0.0605, + "grad_norm": 1.4671199321746826, + "grad_norm_var": 0.4142398893830403, + "learning_rate": 2e-05, + "loss": 0.5239, + "loss/crossentropy": 2.441853404045105, + "loss/hidden": 0.22607421875, + "loss/logits": 0.03414294868707657, + "loss/reg": 0.026372529566287994, + "step": 121 + }, + { + "epoch": 0.061, + "grad_norm": 1.0777528285980225, + "grad_norm_var": 0.4306804814207595, + "learning_rate": 2e-05, + "loss": 0.5097, + "loss/crossentropy": 2.321939468383789, + "loss/hidden": 0.21533203125, + "loss/logits": 0.03067285381257534, + "loss/reg": 0.026371382176876068, + "step": 122 + }, + { + "epoch": 0.0615, + "grad_norm": 1.3190155029296875, + "grad_norm_var": 0.4325410213447808, + "learning_rate": 2e-05, + "loss": 0.5256, + "loss/crossentropy": 2.414122700691223, + "loss/hidden": 0.2294921875, + "loss/logits": 0.032370791770517826, + "loss/reg": 0.026370296254754066, + "step": 123 + }, + { + "epoch": 0.062, + "grad_norm": 1.133116364479065, + "grad_norm_var": 0.44245743827275397, + "learning_rate": 2e-05, + "loss": 0.5005, + "loss/crossentropy": 2.212061285972595, + "loss/hidden": 0.2080078125, + "loss/logits": 0.02877300512045622, + "loss/reg": 0.026369236409664154, + "step": 124 + }, + { + "epoch": 0.0625, + "grad_norm": 1.609708547592163, + "grad_norm_var": 0.4260775107173165, + "learning_rate": 2e-05, + "loss": 0.5155, + "loss/crossentropy": 2.397601008415222, + "loss/hidden": 0.21923828125, + "loss/logits": 0.03255470283329487, + "loss/reg": 0.026368040591478348, + "step": 125 + }, + { + "epoch": 0.063, + "grad_norm": 1.7017152309417725, + "grad_norm_var": 0.14551891758514066, + "learning_rate": 2e-05, + "loss": 0.5923, + "loss/crossentropy": 2.1400970220565796, + "loss/hidden": 0.283203125, + "loss/logits": 0.04546245560050011, + "loss/reg": 0.026366816833615303, + "step": 126 + }, + { + "epoch": 0.0635, + "grad_norm": 1.1147139072418213, + "grad_norm_var": 0.14976404939792326, + "learning_rate": 2e-05, + "loss": 0.4842, + "loss/crossentropy": 2.1656835079193115, + "loss/hidden": 0.19775390625, + "loss/logits": 0.022837044671177864, + "loss/reg": 0.02636570855975151, + "step": 127 + }, + { + "epoch": 0.064, + "grad_norm": 1.277297854423523, + "grad_norm_var": 0.14506375834877663, + "learning_rate": 2e-05, + "loss": 0.5123, + "loss/crossentropy": 2.5118154287338257, + "loss/hidden": 0.21875, + "loss/logits": 0.029942544177174568, + "loss/reg": 0.02636442333459854, + "step": 128 + }, + { + "epoch": 0.0645, + "grad_norm": 1.191677451133728, + "grad_norm_var": 0.14939848388545987, + "learning_rate": 2e-05, + "loss": 0.4912, + "loss/crossentropy": 2.3038079738616943, + "loss/hidden": 0.19921875, + "loss/logits": 0.02833767607808113, + "loss/reg": 0.026363197714090347, + "step": 129 + }, + { + "epoch": 0.065, + "grad_norm": 1.2800445556640625, + "grad_norm_var": 0.14371961156019347, + "learning_rate": 2e-05, + "loss": 0.4781, + "loss/crossentropy": 2.164215087890625, + "loss/hidden": 0.18896484375, + "loss/logits": 0.02550451084971428, + "loss/reg": 0.026361893862485886, + "step": 130 + }, + { + "epoch": 0.0655, + "grad_norm": 2.83632230758667, + "grad_norm_var": 0.2596730266397259, + "learning_rate": 2e-05, + "loss": 0.5199, + "loss/crossentropy": 2.4381964206695557, + "loss/hidden": 0.228515625, + "loss/logits": 0.027756940573453903, + "loss/reg": 0.02636053040623665, + "step": 131 + }, + { + "epoch": 0.066, + "grad_norm": 1.4346998929977417, + "grad_norm_var": 0.24730943436193792, + "learning_rate": 2e-05, + "loss": 0.4843, + "loss/crossentropy": 2.401941180229187, + "loss/hidden": 0.1904296875, + "loss/logits": 0.030300754122436047, + "loss/reg": 0.026359396055340767, + "step": 132 + }, + { + "epoch": 0.0665, + "grad_norm": 1.3330755233764648, + "grad_norm_var": 0.24018081345897185, + "learning_rate": 2e-05, + "loss": 0.5346, + "loss/crossentropy": 2.3078893423080444, + "loss/hidden": 0.244140625, + "loss/logits": 0.02686551958322525, + "loss/reg": 0.02635829895734787, + "step": 133 + }, + { + "epoch": 0.067, + "grad_norm": 3.5527265071868896, + "grad_norm_var": 0.4541487312436425, + "learning_rate": 2e-05, + "loss": 0.5719, + "loss/crossentropy": 2.3654850721359253, + "loss/hidden": 0.271484375, + "loss/logits": 0.03688213415443897, + "loss/reg": 0.02635718323290348, + "step": 134 + }, + { + "epoch": 0.0675, + "grad_norm": 1.5558003187179565, + "grad_norm_var": 0.4467804937083296, + "learning_rate": 2e-05, + "loss": 0.5577, + "loss/crossentropy": 2.413025140762329, + "loss/hidden": 0.255859375, + "loss/logits": 0.03832230344414711, + "loss/reg": 0.026355979964137077, + "step": 135 + }, + { + "epoch": 0.068, + "grad_norm": 1.61518394947052, + "grad_norm_var": 0.44321835982398144, + "learning_rate": 2e-05, + "loss": 0.5304, + "loss/crossentropy": 2.3400243520736694, + "loss/hidden": 0.232421875, + "loss/logits": 0.034462086856365204, + "loss/reg": 0.026354758068919182, + "step": 136 + }, + { + "epoch": 0.0685, + "grad_norm": 1.122028112411499, + "grad_norm_var": 0.45648783165066575, + "learning_rate": 2e-05, + "loss": 0.5084, + "loss/crossentropy": 2.297537922859192, + "loss/hidden": 0.2138671875, + "loss/logits": 0.030963504686951637, + "loss/reg": 0.02635359950363636, + "step": 137 + }, + { + "epoch": 0.069, + "grad_norm": 1.678496241569519, + "grad_norm_var": 0.43944044570977725, + "learning_rate": 2e-05, + "loss": 0.5425, + "loss/crossentropy": 1.9657554626464844, + "loss/hidden": 0.25048828125, + "loss/logits": 0.028447046875953674, + "loss/reg": 0.02635251358151436, + "step": 138 + }, + { + "epoch": 0.0695, + "grad_norm": 1.2920198440551758, + "grad_norm_var": 0.44053238449102083, + "learning_rate": 2e-05, + "loss": 0.5061, + "loss/crossentropy": 2.2413735389709473, + "loss/hidden": 0.212890625, + "loss/logits": 0.029677780345082283, + "loss/reg": 0.026351330801844597, + "step": 139 + }, + { + "epoch": 0.07, + "grad_norm": 1.7133574485778809, + "grad_norm_var": 0.4248322374530742, + "learning_rate": 2e-05, + "loss": 0.5116, + "loss/crossentropy": 2.4616912603378296, + "loss/hidden": 0.21728515625, + "loss/logits": 0.03084972407668829, + "loss/reg": 0.026350252330303192, + "step": 140 + }, + { + "epoch": 0.0705, + "grad_norm": 1.637211561203003, + "grad_norm_var": 0.42475264869840973, + "learning_rate": 2e-05, + "loss": 0.5188, + "loss/crossentropy": 2.404749631881714, + "loss/hidden": 0.22119140625, + "loss/logits": 0.03416546434164047, + "loss/reg": 0.02634907327592373, + "step": 141 + }, + { + "epoch": 0.071, + "grad_norm": 1.6117165088653564, + "grad_norm_var": 0.4245905890698488, + "learning_rate": 2e-05, + "loss": 0.5128, + "loss/crossentropy": 2.2999398708343506, + "loss/hidden": 0.220703125, + "loss/logits": 0.0286036329343915, + "loss/reg": 0.026347877457737923, + "step": 142 + }, + { + "epoch": 0.0715, + "grad_norm": 1.5995277166366577, + "grad_norm_var": 0.40529966216021474, + "learning_rate": 2e-05, + "loss": 0.5082, + "loss/crossentropy": 2.391393780708313, + "loss/hidden": 0.2138671875, + "loss/logits": 0.030896139331161976, + "loss/reg": 0.02634662576019764, + "step": 143 + }, + { + "epoch": 0.072, + "grad_norm": 1.5376816987991333, + "grad_norm_var": 0.3958791020628865, + "learning_rate": 2e-05, + "loss": 0.4819, + "loss/crossentropy": 2.288747191429138, + "loss/hidden": 0.19091796875, + "loss/logits": 0.027577555738389492, + "loss/reg": 0.026345305144786835, + "step": 144 + }, + { + "epoch": 0.0725, + "grad_norm": 1.2494720220565796, + "grad_norm_var": 0.39227114538706565, + "learning_rate": 2e-05, + "loss": 0.4809, + "loss/crossentropy": 2.2762606143951416, + "loss/hidden": 0.19091796875, + "loss/logits": 0.026529721915721893, + "loss/reg": 0.02634395658969879, + "step": 145 + }, + { + "epoch": 0.073, + "grad_norm": 1.2957813739776611, + "grad_norm_var": 0.39142520941635195, + "learning_rate": 2e-05, + "loss": 0.5373, + "loss/crossentropy": 2.247607469558716, + "loss/hidden": 0.236328125, + "loss/logits": 0.037526827305555344, + "loss/reg": 0.026342619210481644, + "step": 146 + }, + { + "epoch": 0.0735, + "grad_norm": 1.5920614004135132, + "grad_norm_var": 0.2982705153831809, + "learning_rate": 2e-05, + "loss": 0.5551, + "loss/crossentropy": 2.5578393936157227, + "loss/hidden": 0.25634765625, + "loss/logits": 0.035370574332773685, + "loss/reg": 0.026341425254940987, + "step": 147 + }, + { + "epoch": 0.074, + "grad_norm": 1.115143895149231, + "grad_norm_var": 0.3122838762450205, + "learning_rate": 2e-05, + "loss": 0.4949, + "loss/crossentropy": 2.293186843395233, + "loss/hidden": 0.20263671875, + "loss/logits": 0.028887784108519554, + "loss/reg": 0.02634003758430481, + "step": 148 + }, + { + "epoch": 0.0745, + "grad_norm": 1.2242144346237183, + "grad_norm_var": 0.3168093531880851, + "learning_rate": 2e-05, + "loss": 0.4976, + "loss/crossentropy": 2.541364312171936, + "loss/hidden": 0.205078125, + "loss/logits": 0.029183853417634964, + "loss/reg": 0.026338616386055946, + "step": 149 + }, + { + "epoch": 0.075, + "grad_norm": 1.2801847457885742, + "grad_norm_var": 0.043969165908166435, + "learning_rate": 2e-05, + "loss": 0.5246, + "loss/crossentropy": 2.365533709526062, + "loss/hidden": 0.22607421875, + "loss/logits": 0.035141369327902794, + "loss/reg": 0.02633722312748432, + "step": 150 + }, + { + "epoch": 0.0755, + "grad_norm": 1.456945538520813, + "grad_norm_var": 0.0431194160041447, + "learning_rate": 2e-05, + "loss": 0.4969, + "loss/crossentropy": 2.5154623985290527, + "loss/hidden": 0.20361328125, + "loss/logits": 0.029950300231575966, + "loss/reg": 0.02633577026426792, + "step": 151 + }, + { + "epoch": 0.076, + "grad_norm": 1.2066655158996582, + "grad_norm_var": 0.043943164667008955, + "learning_rate": 2e-05, + "loss": 0.4752, + "loss/crossentropy": 2.528536558151245, + "loss/hidden": 0.18798828125, + "loss/logits": 0.023877541534602642, + "loss/reg": 0.026334302499890327, + "step": 152 + }, + { + "epoch": 0.0765, + "grad_norm": 1.2901597023010254, + "grad_norm_var": 0.03918073743505299, + "learning_rate": 2e-05, + "loss": 0.521, + "loss/crossentropy": 2.3224003314971924, + "loss/hidden": 0.21484375, + "loss/logits": 0.04283316247165203, + "loss/reg": 0.02633279375731945, + "step": 153 + }, + { + "epoch": 0.077, + "grad_norm": 1.74579656124115, + "grad_norm_var": 0.04174939581046431, + "learning_rate": 2e-05, + "loss": 0.4896, + "loss/crossentropy": 2.3139768838882446, + "loss/hidden": 0.201171875, + "loss/logits": 0.02508167363703251, + "loss/reg": 0.026331480592489243, + "step": 154 + }, + { + "epoch": 0.0775, + "grad_norm": 1.2306878566741943, + "grad_norm_var": 0.04309645701489041, + "learning_rate": 2e-05, + "loss": 0.4816, + "loss/crossentropy": 2.252236247062683, + "loss/hidden": 0.18896484375, + "loss/logits": 0.029315452091395855, + "loss/reg": 0.02633025124669075, + "step": 155 + }, + { + "epoch": 0.078, + "grad_norm": 1.297144889831543, + "grad_norm_var": 0.03787466463763825, + "learning_rate": 2e-05, + "loss": 0.5241, + "loss/crossentropy": 2.2772055864334106, + "loss/hidden": 0.22900390625, + "loss/logits": 0.03178275562822819, + "loss/reg": 0.026328938081860542, + "step": 156 + }, + { + "epoch": 0.0785, + "grad_norm": 1.3461697101593018, + "grad_norm_var": 0.033891815904075646, + "learning_rate": 2e-05, + "loss": 0.5533, + "loss/crossentropy": 2.2572057247161865, + "loss/hidden": 0.2568359375, + "loss/logits": 0.03316341433674097, + "loss/reg": 0.02632747031748295, + "step": 157 + }, + { + "epoch": 0.079, + "grad_norm": 1.6142765283584595, + "grad_norm_var": 0.033971332471514334, + "learning_rate": 2e-05, + "loss": 0.477, + "loss/crossentropy": 2.3103591203689575, + "loss/hidden": 0.189453125, + "loss/logits": 0.02428613882511854, + "loss/reg": 0.026326211169362068, + "step": 158 + }, + { + "epoch": 0.0795, + "grad_norm": 1.0435117483139038, + "grad_norm_var": 0.03702752005093206, + "learning_rate": 2e-05, + "loss": 0.4774, + "loss/crossentropy": 2.236763596534729, + "loss/hidden": 0.18994140625, + "loss/logits": 0.02417835220694542, + "loss/reg": 0.026324694976210594, + "step": 159 + }, + { + "epoch": 0.08, + "grad_norm": 3.194502115249634, + "grad_norm_var": 0.251077157144137, + "learning_rate": 2e-05, + "loss": 0.5767, + "loss/crossentropy": 2.4404300451278687, + "loss/hidden": 0.2236328125, + "loss/logits": 0.0898615438491106, + "loss/reg": 0.02632344886660576, + "step": 160 + }, + { + "epoch": 0.0805, + "grad_norm": 1.223811149597168, + "grad_norm_var": 0.25180071296473483, + "learning_rate": 2e-05, + "loss": 0.4781, + "loss/crossentropy": 2.2644309997558594, + "loss/hidden": 0.19140625, + "loss/logits": 0.023457905277609825, + "loss/reg": 0.026322180405259132, + "step": 161 + }, + { + "epoch": 0.081, + "grad_norm": 1.5841586589813232, + "grad_norm_var": 0.25117174878087756, + "learning_rate": 2e-05, + "loss": 0.5629, + "loss/crossentropy": 1.9194682240486145, + "loss/hidden": 0.25048828125, + "loss/logits": 0.049169132485985756, + "loss/reg": 0.026320787146687508, + "step": 162 + }, + { + "epoch": 0.0815, + "grad_norm": 1.2795405387878418, + "grad_norm_var": 0.2519956540566284, + "learning_rate": 2e-05, + "loss": 0.5141, + "loss/crossentropy": 2.444055438041687, + "loss/hidden": 0.2177734375, + "loss/logits": 0.03311499021947384, + "loss/reg": 0.026319410651922226, + "step": 163 + }, + { + "epoch": 0.082, + "grad_norm": 1.0281555652618408, + "grad_norm_var": 0.25630376830439533, + "learning_rate": 2e-05, + "loss": 0.4718, + "loss/crossentropy": 2.4007210731506348, + "loss/hidden": 0.18359375, + "loss/logits": 0.024980327114462852, + "loss/reg": 0.026317832991480827, + "step": 164 + }, + { + "epoch": 0.0825, + "grad_norm": 1.3523935079574585, + "grad_norm_var": 0.25363641385507896, + "learning_rate": 2e-05, + "loss": 0.5099, + "loss/crossentropy": 2.6051762104034424, + "loss/hidden": 0.21630859375, + "loss/logits": 0.030417022295296192, + "loss/reg": 0.026316583156585693, + "step": 165 + }, + { + "epoch": 0.083, + "grad_norm": 1.538618564605713, + "grad_norm_var": 0.2520149682902304, + "learning_rate": 2e-05, + "loss": 0.5429, + "loss/crossentropy": 2.453674077987671, + "loss/hidden": 0.24951171875, + "loss/logits": 0.030248504132032394, + "loss/reg": 0.026315055787563324, + "step": 166 + }, + { + "epoch": 0.0835, + "grad_norm": 1.152441143989563, + "grad_norm_var": 0.25811823232553094, + "learning_rate": 2e-05, + "loss": 0.5287, + "loss/crossentropy": 2.23244309425354, + "loss/hidden": 0.2294921875, + "loss/logits": 0.03604980930685997, + "loss/reg": 0.02631353586912155, + "step": 167 + }, + { + "epoch": 0.084, + "grad_norm": 3.3678812980651855, + "grad_norm_var": 0.4812229304062583, + "learning_rate": 2e-05, + "loss": 0.6537, + "loss/crossentropy": 2.2121087312698364, + "loss/hidden": 0.322265625, + "loss/logits": 0.06832050159573555, + "loss/reg": 0.026312291622161865, + "step": 168 + }, + { + "epoch": 0.0845, + "grad_norm": 1.3094780445098877, + "grad_norm_var": 0.48049820171389107, + "learning_rate": 2e-05, + "loss": 0.5458, + "loss/crossentropy": 2.29573655128479, + "loss/hidden": 0.24365234375, + "loss/logits": 0.03900368791073561, + "loss/reg": 0.026311108842492104, + "step": 169 + }, + { + "epoch": 0.085, + "grad_norm": 1.4413907527923584, + "grad_norm_var": 0.47963284313486815, + "learning_rate": 2e-05, + "loss": 0.5115, + "loss/crossentropy": 2.3498464822769165, + "loss/hidden": 0.22021484375, + "loss/logits": 0.028182944282889366, + "loss/reg": 0.026309916749596596, + "step": 170 + }, + { + "epoch": 0.0855, + "grad_norm": 1.1035057306289673, + "grad_norm_var": 0.48627495331464554, + "learning_rate": 2e-05, + "loss": 0.5094, + "loss/crossentropy": 2.3309890031814575, + "loss/hidden": 0.20947265625, + "loss/logits": 0.036838797852396965, + "loss/reg": 0.02630869299173355, + "step": 171 + }, + { + "epoch": 0.086, + "grad_norm": 1.0321089029312134, + "grad_norm_var": 0.4997706555859033, + "learning_rate": 2e-05, + "loss": 0.4599, + "loss/crossentropy": 2.512625813484192, + "loss/hidden": 0.17333984375, + "loss/logits": 0.023489498533308506, + "loss/reg": 0.026307322084903717, + "step": 172 + }, + { + "epoch": 0.0865, + "grad_norm": 1.2687665224075317, + "grad_norm_var": 0.5021274230125977, + "learning_rate": 2e-05, + "loss": 0.4478, + "loss/crossentropy": 2.55221164226532, + "loss/hidden": 0.1640625, + "loss/logits": 0.020719519816339016, + "loss/reg": 0.026306064799427986, + "step": 173 + }, + { + "epoch": 0.087, + "grad_norm": 1.6230545043945312, + "grad_norm_var": 0.5022268861494524, + "learning_rate": 2e-05, + "loss": 0.5206, + "loss/crossentropy": 2.54874849319458, + "loss/hidden": 0.22216796875, + "loss/logits": 0.03534366935491562, + "loss/reg": 0.026304682716727257, + "step": 174 + }, + { + "epoch": 0.0875, + "grad_norm": 1.4153763055801392, + "grad_norm_var": 0.4865523407786817, + "learning_rate": 2e-05, + "loss": 0.4923, + "loss/crossentropy": 2.5351545810699463, + "loss/hidden": 0.18896484375, + "loss/logits": 0.04026305489242077, + "loss/reg": 0.026303274556994438, + "step": 175 + }, + { + "epoch": 0.088, + "grad_norm": 1.0160194635391235, + "grad_norm_var": 0.3075858037077518, + "learning_rate": 2e-05, + "loss": 0.439, + "loss/crossentropy": 2.543141722679138, + "loss/hidden": 0.15869140625, + "loss/logits": 0.017316540703177452, + "loss/reg": 0.0263019111007452, + "step": 176 + }, + { + "epoch": 0.0885, + "grad_norm": 1.3745949268341064, + "grad_norm_var": 0.3050415235722406, + "learning_rate": 2e-05, + "loss": 0.5442, + "loss/crossentropy": 2.3582804203033447, + "loss/hidden": 0.24169921875, + "loss/logits": 0.03952281177043915, + "loss/reg": 0.026300618425011635, + "step": 177 + }, + { + "epoch": 0.089, + "grad_norm": 1.2340662479400635, + "grad_norm_var": 0.30552768222984095, + "learning_rate": 2e-05, + "loss": 0.5201, + "loss/crossentropy": 2.3681315183639526, + "loss/hidden": 0.22412109375, + "loss/logits": 0.03298753686249256, + "loss/reg": 0.026299230754375458, + "step": 178 + }, + { + "epoch": 0.0895, + "grad_norm": 2.601248264312744, + "grad_norm_var": 0.39196807835765096, + "learning_rate": 2e-05, + "loss": 0.5363, + "loss/crossentropy": 2.617705225944519, + "loss/hidden": 0.240234375, + "loss/logits": 0.03311354760080576, + "loss/reg": 0.026297833770513535, + "step": 179 + }, + { + "epoch": 0.09, + "grad_norm": 1.4031890630722046, + "grad_norm_var": 0.37760473459329563, + "learning_rate": 2e-05, + "loss": 0.5719, + "loss/crossentropy": 2.3656851053237915, + "loss/hidden": 0.26318359375, + "loss/logits": 0.045768093317747116, + "loss/reg": 0.02629653364419937, + "step": 180 + }, + { + "epoch": 0.0905, + "grad_norm": 1.2391202449798584, + "grad_norm_var": 0.38085698610252045, + "learning_rate": 2e-05, + "loss": 0.4815, + "loss/crossentropy": 2.306247353553772, + "loss/hidden": 0.18798828125, + "loss/logits": 0.03056285623461008, + "loss/reg": 0.026295220479369164, + "step": 181 + }, + { + "epoch": 0.091, + "grad_norm": 1.3922662734985352, + "grad_norm_var": 0.3815894855763109, + "learning_rate": 2e-05, + "loss": 0.5416, + "loss/crossentropy": 2.421887755393982, + "loss/hidden": 0.2333984375, + "loss/logits": 0.04522215947508812, + "loss/reg": 0.026293916627764702, + "step": 182 + }, + { + "epoch": 0.0915, + "grad_norm": 1.1777185201644897, + "grad_norm_var": 0.38046340604863593, + "learning_rate": 2e-05, + "loss": 0.481, + "loss/crossentropy": 2.294826030731201, + "loss/hidden": 0.193359375, + "loss/logits": 0.024756859987974167, + "loss/reg": 0.026292625814676285, + "step": 183 + }, + { + "epoch": 0.092, + "grad_norm": 1.3863762617111206, + "grad_norm_var": 0.13236200174767798, + "learning_rate": 2e-05, + "loss": 0.5306, + "loss/crossentropy": 2.2481424808502197, + "loss/hidden": 0.234375, + "loss/logits": 0.03326253779232502, + "loss/reg": 0.026291374117136, + "step": 184 + }, + { + "epoch": 0.0925, + "grad_norm": 1.0816987752914429, + "grad_norm_var": 0.13762935148172814, + "learning_rate": 2e-05, + "loss": 0.4559, + "loss/crossentropy": 2.3464468717575073, + "loss/hidden": 0.1689453125, + "loss/logits": 0.02402583882212639, + "loss/reg": 0.026290148496627808, + "step": 185 + }, + { + "epoch": 0.093, + "grad_norm": 1.0776005983352661, + "grad_norm_var": 0.1420453846262613, + "learning_rate": 2e-05, + "loss": 0.4634, + "loss/crossentropy": 2.316567063331604, + "loss/hidden": 0.1748046875, + "loss/logits": 0.025691150687634945, + "loss/reg": 0.02628881298005581, + "step": 186 + }, + { + "epoch": 0.0935, + "grad_norm": 2.1526918411254883, + "grad_norm_var": 0.17787751141178104, + "learning_rate": 2e-05, + "loss": 0.4898, + "loss/crossentropy": 2.2931246757507324, + "loss/hidden": 0.20361328125, + "loss/logits": 0.023339038714766502, + "loss/reg": 0.026287470012903214, + "step": 187 + }, + { + "epoch": 0.094, + "grad_norm": 1.3883178234100342, + "grad_norm_var": 0.16810970663468652, + "learning_rate": 2e-05, + "loss": 0.4682, + "loss/crossentropy": 2.4850372076034546, + "loss/hidden": 0.16845703125, + "loss/logits": 0.036926812492311, + "loss/reg": 0.0262861680239439, + "step": 188 + }, + { + "epoch": 0.0945, + "grad_norm": 1.1316860914230347, + "grad_norm_var": 0.172176362699476, + "learning_rate": 2e-05, + "loss": 0.4799, + "loss/crossentropy": 2.5390676259994507, + "loss/hidden": 0.19091796875, + "loss/logits": 0.02617151476442814, + "loss/reg": 0.026284806430339813, + "step": 189 + }, + { + "epoch": 0.095, + "grad_norm": 1.310356616973877, + "grad_norm_var": 0.1697565690965554, + "learning_rate": 2e-05, + "loss": 0.5577, + "loss/crossentropy": 2.2394298315048218, + "loss/hidden": 0.2578125, + "loss/logits": 0.03705478459596634, + "loss/reg": 0.026283571496605873, + "step": 190 + }, + { + "epoch": 0.0955, + "grad_norm": 1.224501371383667, + "grad_norm_var": 0.1716142091861707, + "learning_rate": 2e-05, + "loss": 0.4853, + "loss/crossentropy": 2.3653067350387573, + "loss/hidden": 0.18701171875, + "loss/logits": 0.03547767084091902, + "loss/reg": 0.026282308623194695, + "step": 191 + }, + { + "epoch": 0.096, + "grad_norm": 1.1369792222976685, + "grad_norm_var": 0.16654605297517922, + "learning_rate": 2e-05, + "loss": 0.4612, + "loss/crossentropy": 2.4437299966812134, + "loss/hidden": 0.173828125, + "loss/logits": 0.024531416594982147, + "loss/reg": 0.02628110721707344, + "step": 192 + }, + { + "epoch": 0.0965, + "grad_norm": 1.639382004737854, + "grad_norm_var": 0.1702244083590602, + "learning_rate": 2e-05, + "loss": 0.5584, + "loss/crossentropy": 2.369232177734375, + "loss/hidden": 0.251953125, + "loss/logits": 0.04362649656832218, + "loss/reg": 0.026279788464307785, + "step": 193 + }, + { + "epoch": 0.097, + "grad_norm": 1.7320666313171387, + "grad_norm_var": 0.17397129527364066, + "learning_rate": 2e-05, + "loss": 0.584, + "loss/crossentropy": 2.290635347366333, + "loss/hidden": 0.25537109375, + "loss/logits": 0.0658609364181757, + "loss/reg": 0.026278505101799965, + "step": 194 + }, + { + "epoch": 0.0975, + "grad_norm": 1.3818726539611816, + "grad_norm_var": 0.07845907156529677, + "learning_rate": 2e-05, + "loss": 0.47, + "loss/crossentropy": 2.1524158716201782, + "loss/hidden": 0.1826171875, + "loss/logits": 0.024603160098195076, + "loss/reg": 0.02627725526690483, + "step": 195 + }, + { + "epoch": 0.098, + "grad_norm": 1.499199628829956, + "grad_norm_var": 0.07951141157999278, + "learning_rate": 2e-05, + "loss": 0.5272, + "loss/crossentropy": 2.4975160360336304, + "loss/hidden": 0.22265625, + "loss/logits": 0.04179301019757986, + "loss/reg": 0.02627602592110634, + "step": 196 + }, + { + "epoch": 0.0985, + "grad_norm": 1.5929518938064575, + "grad_norm_var": 0.0810677599209079, + "learning_rate": 2e-05, + "loss": 0.6116, + "loss/crossentropy": 2.5046887397766113, + "loss/hidden": 0.306640625, + "loss/logits": 0.04225216433405876, + "loss/reg": 0.026274660602211952, + "step": 197 + }, + { + "epoch": 0.099, + "grad_norm": 1.1331342458724976, + "grad_norm_var": 0.0853280978459693, + "learning_rate": 2e-05, + "loss": 0.4498, + "loss/crossentropy": 2.4783315658569336, + "loss/hidden": 0.1640625, + "loss/logits": 0.022989329881966114, + "loss/reg": 0.026273364201188087, + "step": 198 + }, + { + "epoch": 0.0995, + "grad_norm": 1.2823922634124756, + "grad_norm_var": 0.0832189351924588, + "learning_rate": 2e-05, + "loss": 0.5028, + "loss/crossentropy": 2.428224563598633, + "loss/hidden": 0.21044921875, + "loss/logits": 0.029634020291268826, + "loss/reg": 0.02627207711338997, + "step": 199 + }, + { + "epoch": 0.1, + "grad_norm": 1.5657204389572144, + "grad_norm_var": 0.0852752560623344, + "learning_rate": 2e-05, + "loss": 0.5566, + "loss/crossentropy": 2.205379009246826, + "loss/hidden": 0.25341796875, + "loss/logits": 0.04043233580887318, + "loss/reg": 0.026270678266882896, + "step": 200 + }, + { + "epoch": 0.1005, + "grad_norm": 2.498617649078369, + "grad_norm_var": 0.15143969810336458, + "learning_rate": 2e-05, + "loss": 0.5453, + "loss/crossentropy": 2.2436362504959106, + "loss/hidden": 0.25048828125, + "loss/logits": 0.03208579681813717, + "loss/reg": 0.026269439607858658, + "step": 201 + }, + { + "epoch": 0.101, + "grad_norm": 1.1255189180374146, + "grad_norm_var": 0.1489852922170759, + "learning_rate": 2e-05, + "loss": 0.4846, + "loss/crossentropy": 2.423098921775818, + "loss/hidden": 0.19384765625, + "loss/logits": 0.02810557559132576, + "loss/reg": 0.02626824378967285, + "step": 202 + }, + { + "epoch": 0.1015, + "grad_norm": 1.7557874917984009, + "grad_norm_var": 0.1236135205651595, + "learning_rate": 2e-05, + "loss": 0.5679, + "loss/crossentropy": 2.62020206451416, + "loss/hidden": 0.2685546875, + "loss/logits": 0.03669197857379913, + "loss/reg": 0.026267159730196, + "step": 203 + }, + { + "epoch": 0.102, + "grad_norm": 1.1842639446258545, + "grad_norm_var": 0.12823160649832796, + "learning_rate": 2e-05, + "loss": 0.5182, + "loss/crossentropy": 2.43496835231781, + "loss/hidden": 0.22412109375, + "loss/logits": 0.03145230747759342, + "loss/reg": 0.026265980675816536, + "step": 204 + }, + { + "epoch": 0.1025, + "grad_norm": 3.2057254314422607, + "grad_norm_var": 0.30915423120596724, + "learning_rate": 2e-05, + "loss": 0.495, + "loss/crossentropy": 2.6262258291244507, + "loss/hidden": 0.2099609375, + "loss/logits": 0.022387961857020855, + "loss/reg": 0.026264773681759834, + "step": 205 + }, + { + "epoch": 0.103, + "grad_norm": 1.5268100500106812, + "grad_norm_var": 0.3043212521218976, + "learning_rate": 2e-05, + "loss": 0.5423, + "loss/crossentropy": 2.3472981452941895, + "loss/hidden": 0.2353515625, + "loss/logits": 0.04430149123072624, + "loss/reg": 0.026263581588864326, + "step": 206 + }, + { + "epoch": 0.1035, + "grad_norm": 1.2183657884597778, + "grad_norm_var": 0.30462490819346133, + "learning_rate": 2e-05, + "loss": 0.5171, + "loss/crossentropy": 2.2207844257354736, + "loss/hidden": 0.22119140625, + "loss/logits": 0.0333048552274704, + "loss/reg": 0.026262367144227028, + "step": 207 + }, + { + "epoch": 0.104, + "grad_norm": 1.3168612718582153, + "grad_norm_var": 0.29572373678734315, + "learning_rate": 2e-05, + "loss": 0.4704, + "loss/crossentropy": 2.4785603284835815, + "loss/hidden": 0.18212890625, + "loss/logits": 0.025700876489281654, + "loss/reg": 0.026260942220687866, + "step": 208 + }, + { + "epoch": 0.1045, + "grad_norm": 1.104201316833496, + "grad_norm_var": 0.31107634650352317, + "learning_rate": 2e-05, + "loss": 0.5125, + "loss/crossentropy": 2.440949320793152, + "loss/hidden": 0.21435546875, + "loss/logits": 0.03553314134478569, + "loss/reg": 0.026259683072566986, + "step": 209 + }, + { + "epoch": 0.105, + "grad_norm": 1.179359793663025, + "grad_norm_var": 0.3182418442890669, + "learning_rate": 2e-05, + "loss": 0.5404, + "loss/crossentropy": 2.4222298860549927, + "loss/hidden": 0.23583984375, + "loss/logits": 0.04196472465991974, + "loss/reg": 0.026258250698447227, + "step": 210 + }, + { + "epoch": 0.1055, + "grad_norm": 1.9198130369186401, + "grad_norm_var": 0.3252966300662526, + "learning_rate": 2e-05, + "loss": 0.7362, + "loss/crossentropy": 2.1343027353286743, + "loss/hidden": 0.42236328125, + "loss/logits": 0.051308806985616684, + "loss/reg": 0.02625690959393978, + "step": 211 + }, + { + "epoch": 0.106, + "grad_norm": 1.945879578590393, + "grad_norm_var": 0.33359211146878015, + "learning_rate": 2e-05, + "loss": 0.5191, + "loss/crossentropy": 2.629801630973816, + "loss/hidden": 0.22802734375, + "loss/logits": 0.028506163507699966, + "loss/reg": 0.026255663484334946, + "step": 212 + }, + { + "epoch": 0.1065, + "grad_norm": 1.10844886302948, + "grad_norm_var": 0.3485388100979046, + "learning_rate": 2e-05, + "loss": 0.5026, + "loss/crossentropy": 2.4873945713043213, + "loss/hidden": 0.2060546875, + "loss/logits": 0.033975718542933464, + "loss/reg": 0.026254238560795784, + "step": 213 + }, + { + "epoch": 0.107, + "grad_norm": 1.5501041412353516, + "grad_norm_var": 0.3352879309740613, + "learning_rate": 2e-05, + "loss": 0.5128, + "loss/crossentropy": 2.2922143936157227, + "loss/hidden": 0.220703125, + "loss/logits": 0.029572436586022377, + "loss/reg": 0.026252800598740578, + "step": 214 + }, + { + "epoch": 0.1075, + "grad_norm": 1.4351506233215332, + "grad_norm_var": 0.3304201508174941, + "learning_rate": 2e-05, + "loss": 0.5019, + "loss/crossentropy": 2.3728071451187134, + "loss/hidden": 0.208984375, + "loss/logits": 0.030389025807380676, + "loss/reg": 0.02625151537358761, + "step": 215 + }, + { + "epoch": 0.108, + "grad_norm": 1.1031361818313599, + "grad_norm_var": 0.3460650712842908, + "learning_rate": 2e-05, + "loss": 0.491, + "loss/crossentropy": 2.4348747730255127, + "loss/hidden": 0.19970703125, + "loss/logits": 0.028779378160834312, + "loss/reg": 0.026250220835208893, + "step": 216 + }, + { + "epoch": 0.1085, + "grad_norm": 1.664985179901123, + "grad_norm_var": 0.28668546672827777, + "learning_rate": 2e-05, + "loss": 0.5599, + "loss/crossentropy": 2.399816870689392, + "loss/hidden": 0.248046875, + "loss/logits": 0.04935701750218868, + "loss/reg": 0.026248781010508537, + "step": 217 + }, + { + "epoch": 0.109, + "grad_norm": 1.4927318096160889, + "grad_norm_var": 0.2757241244708178, + "learning_rate": 2e-05, + "loss": 0.5111, + "loss/crossentropy": 2.343783974647522, + "loss/hidden": 0.22021484375, + "loss/logits": 0.02840256877243519, + "loss/reg": 0.026247289031744003, + "step": 218 + }, + { + "epoch": 0.1095, + "grad_norm": 1.477570652961731, + "grad_norm_var": 0.2727232102389791, + "learning_rate": 2e-05, + "loss": 0.5197, + "loss/crossentropy": 2.4229984283447266, + "loss/hidden": 0.22314453125, + "loss/logits": 0.034054605290293694, + "loss/reg": 0.026245808228850365, + "step": 219 + }, + { + "epoch": 0.11, + "grad_norm": 1.3535478115081787, + "grad_norm_var": 0.26677633070284795, + "learning_rate": 2e-05, + "loss": 0.4955, + "loss/crossentropy": 2.4335192441940308, + "loss/hidden": 0.2021484375, + "loss/logits": 0.03086682688444853, + "loss/reg": 0.02624441497027874, + "step": 220 + }, + { + "epoch": 0.1105, + "grad_norm": 1.4819797277450562, + "grad_norm_var": 0.06910834048590857, + "learning_rate": 2e-05, + "loss": 0.494, + "loss/crossentropy": 2.3202240467071533, + "loss/hidden": 0.1982421875, + "loss/logits": 0.03335183020681143, + "loss/reg": 0.026242973282933235, + "step": 221 + }, + { + "epoch": 0.111, + "grad_norm": 3.001047372817993, + "grad_norm_var": 0.2239867367636629, + "learning_rate": 2e-05, + "loss": 0.5824, + "loss/crossentropy": 2.5001453161239624, + "loss/hidden": 0.259765625, + "loss/logits": 0.06020928919315338, + "loss/reg": 0.02624150738120079, + "step": 222 + }, + { + "epoch": 0.1115, + "grad_norm": 1.3792407512664795, + "grad_norm_var": 0.21908974537501535, + "learning_rate": 2e-05, + "loss": 0.5162, + "loss/crossentropy": 2.067277252674103, + "loss/hidden": 0.22119140625, + "loss/logits": 0.03264045529067516, + "loss/reg": 0.026240520179271698, + "step": 223 + }, + { + "epoch": 0.112, + "grad_norm": 1.0752317905426025, + "grad_norm_var": 0.2296741211773119, + "learning_rate": 2e-05, + "loss": 0.4715, + "loss/crossentropy": 2.3376221656799316, + "loss/hidden": 0.18017578125, + "loss/logits": 0.028950211592018604, + "loss/reg": 0.026239832863211632, + "step": 224 + }, + { + "epoch": 0.1125, + "grad_norm": 1.2668484449386597, + "grad_norm_var": 0.22237485135677842, + "learning_rate": 2e-05, + "loss": 0.4997, + "loss/crossentropy": 2.0572392344474792, + "loss/hidden": 0.20654296875, + "loss/logits": 0.03081146441400051, + "loss/reg": 0.026238473132252693, + "step": 225 + }, + { + "epoch": 0.113, + "grad_norm": 1.2038859128952026, + "grad_norm_var": 0.2212749830240483, + "learning_rate": 2e-05, + "loss": 0.496, + "loss/crossentropy": 2.3101898431777954, + "loss/hidden": 0.20068359375, + "loss/logits": 0.032975198701024055, + "loss/reg": 0.026237143203616142, + "step": 226 + }, + { + "epoch": 0.1135, + "grad_norm": 1.1962757110595703, + "grad_norm_var": 0.21626523006927076, + "learning_rate": 2e-05, + "loss": 0.5397, + "loss/crossentropy": 2.3421201705932617, + "loss/hidden": 0.234375, + "loss/logits": 0.042991749942302704, + "loss/reg": 0.026236219331622124, + "step": 227 + }, + { + "epoch": 0.114, + "grad_norm": 1.3072717189788818, + "grad_norm_var": 0.20238375910022696, + "learning_rate": 2e-05, + "loss": 0.4899, + "loss/crossentropy": 2.545662522315979, + "loss/hidden": 0.19873046875, + "loss/logits": 0.028848190791904926, + "loss/reg": 0.02623477764427662, + "step": 228 + }, + { + "epoch": 0.1145, + "grad_norm": 1.4646738767623901, + "grad_norm_var": 0.1943966383373566, + "learning_rate": 2e-05, + "loss": 0.5319, + "loss/crossentropy": 2.481440782546997, + "loss/hidden": 0.236328125, + "loss/logits": 0.03321322426199913, + "loss/reg": 0.02623329870402813, + "step": 229 + }, + { + "epoch": 0.115, + "grad_norm": 1.460798978805542, + "grad_norm_var": 0.1938919184279494, + "learning_rate": 2e-05, + "loss": 0.5487, + "loss/crossentropy": 2.2658169269561768, + "loss/hidden": 0.25048828125, + "loss/logits": 0.03588264063000679, + "loss/reg": 0.026232033967971802, + "step": 230 + }, + { + "epoch": 0.1155, + "grad_norm": 1.8251186609268188, + "grad_norm_var": 0.20209032603179977, + "learning_rate": 2e-05, + "loss": 0.4954, + "loss/crossentropy": 2.0918792486190796, + "loss/hidden": 0.20458984375, + "loss/logits": 0.028470170684158802, + "loss/reg": 0.02623056247830391, + "step": 231 + }, + { + "epoch": 0.116, + "grad_norm": 1.0807620286941528, + "grad_norm_var": 0.20325974318190695, + "learning_rate": 2e-05, + "loss": 0.4663, + "loss/crossentropy": 2.4854743480682373, + "loss/hidden": 0.18017578125, + "loss/logits": 0.023857600055634975, + "loss/reg": 0.026229269802570343, + "step": 232 + }, + { + "epoch": 0.1165, + "grad_norm": 1.2416105270385742, + "grad_norm_var": 0.20420357740239006, + "learning_rate": 2e-05, + "loss": 0.4939, + "loss/crossentropy": 2.5404441356658936, + "loss/hidden": 0.2001953125, + "loss/logits": 0.031377360224723816, + "loss/reg": 0.0262277964502573, + "step": 233 + }, + { + "epoch": 0.117, + "grad_norm": 1.0784002542495728, + "grad_norm_var": 0.21294726278598167, + "learning_rate": 2e-05, + "loss": 0.4764, + "loss/crossentropy": 2.1334633231163025, + "loss/hidden": 0.18603515625, + "loss/logits": 0.028105400502681732, + "loss/reg": 0.02622636966407299, + "step": 234 + }, + { + "epoch": 0.1175, + "grad_norm": 1.4805059432983398, + "grad_norm_var": 0.21296607019170413, + "learning_rate": 2e-05, + "loss": 0.5299, + "loss/crossentropy": 2.363998532295227, + "loss/hidden": 0.22998046875, + "loss/logits": 0.03765851445496082, + "loss/reg": 0.026224961504340172, + "step": 235 + }, + { + "epoch": 0.118, + "grad_norm": 1.4707082509994507, + "grad_norm_var": 0.21261289860814922, + "learning_rate": 2e-05, + "loss": 0.5063, + "loss/crossentropy": 2.5150575637817383, + "loss/hidden": 0.21142578125, + "loss/logits": 0.032685703597962856, + "loss/reg": 0.02622355706989765, + "step": 236 + }, + { + "epoch": 0.1185, + "grad_norm": 1.2693709135055542, + "grad_norm_var": 0.2142025931629329, + "learning_rate": 2e-05, + "loss": 0.4777, + "loss/crossentropy": 2.364490509033203, + "loss/hidden": 0.189453125, + "loss/logits": 0.026029310189187527, + "loss/reg": 0.026222191751003265, + "step": 237 + }, + { + "epoch": 0.119, + "grad_norm": 1.4452778100967407, + "grad_norm_var": 0.03857260853696444, + "learning_rate": 2e-05, + "loss": 0.4884, + "loss/crossentropy": 2.370029330253601, + "loss/hidden": 0.19873046875, + "loss/logits": 0.027492761611938477, + "loss/reg": 0.026220764964818954, + "step": 238 + }, + { + "epoch": 0.1195, + "grad_norm": 1.3660000562667847, + "grad_norm_var": 0.03849288132132358, + "learning_rate": 2e-05, + "loss": 0.4504, + "loss/crossentropy": 2.579773426055908, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02312152460217476, + "loss/reg": 0.026219261810183525, + "step": 239 + }, + { + "epoch": 0.12, + "grad_norm": 1.1201462745666504, + "grad_norm_var": 0.03711094738648981, + "learning_rate": 2e-05, + "loss": 0.4707, + "loss/crossentropy": 2.5026135444641113, + "loss/hidden": 0.1708984375, + "loss/logits": 0.037659027613699436, + "loss/reg": 0.02621796354651451, + "step": 240 + }, + { + "epoch": 0.1205, + "grad_norm": 1.5148764848709106, + "grad_norm_var": 0.03887221528161528, + "learning_rate": 2e-05, + "loss": 0.5356, + "loss/crossentropy": 2.2153124809265137, + "loss/hidden": 0.23974609375, + "loss/logits": 0.03367053158581257, + "loss/reg": 0.02621665596961975, + "step": 241 + }, + { + "epoch": 0.121, + "grad_norm": 4.675024509429932, + "grad_norm_var": 0.726447806322074, + "learning_rate": 2e-05, + "loss": 0.9674, + "loss/crossentropy": 2.5507571697235107, + "loss/hidden": 0.47119140625, + "loss/logits": 0.23407735768705606, + "loss/reg": 0.026215286925435066, + "step": 242 + }, + { + "epoch": 0.1215, + "grad_norm": 1.3312729597091675, + "grad_norm_var": 0.7209984947184022, + "learning_rate": 2e-05, + "loss": 0.4611, + "loss/crossentropy": 2.38046658039093, + "loss/hidden": 0.1767578125, + "loss/logits": 0.022188683971762657, + "loss/reg": 0.02621396817266941, + "step": 243 + }, + { + "epoch": 0.122, + "grad_norm": 2.4490838050842285, + "grad_norm_var": 0.7623712839956812, + "learning_rate": 2e-05, + "loss": 0.6231, + "loss/crossentropy": 2.5406309366226196, + "loss/hidden": 0.3056640625, + "loss/logits": 0.0553472563624382, + "loss/reg": 0.02621266432106495, + "step": 244 + }, + { + "epoch": 0.1225, + "grad_norm": 1.5570958852767944, + "grad_norm_var": 0.7607187136014618, + "learning_rate": 2e-05, + "loss": 0.4948, + "loss/crossentropy": 2.2163580656051636, + "loss/hidden": 0.2060546875, + "loss/logits": 0.026676415465772152, + "loss/reg": 0.02621115930378437, + "step": 245 + }, + { + "epoch": 0.123, + "grad_norm": 1.2748626470565796, + "grad_norm_var": 0.767517463439591, + "learning_rate": 2e-05, + "loss": 0.5207, + "loss/crossentropy": 2.3726441860198975, + "loss/hidden": 0.2294921875, + "loss/logits": 0.02906488999724388, + "loss/reg": 0.026209814473986626, + "step": 246 + }, + { + "epoch": 0.1235, + "grad_norm": 1.5057262182235718, + "grad_norm_var": 0.7658503992600496, + "learning_rate": 2e-05, + "loss": 0.4962, + "loss/crossentropy": 2.442053437232971, + "loss/hidden": 0.2021484375, + "loss/logits": 0.0319626173004508, + "loss/reg": 0.02620851993560791, + "step": 247 + }, + { + "epoch": 0.124, + "grad_norm": 1.2367428541183472, + "grad_norm_var": 0.7562333027864989, + "learning_rate": 2e-05, + "loss": 0.4891, + "loss/crossentropy": 2.32527756690979, + "loss/hidden": 0.19775390625, + "loss/logits": 0.029276075772941113, + "loss/reg": 0.02620730549097061, + "step": 248 + }, + { + "epoch": 0.1245, + "grad_norm": 1.3585014343261719, + "grad_norm_var": 0.7510956988655692, + "learning_rate": 2e-05, + "loss": 0.505, + "loss/crossentropy": 2.4313305616378784, + "loss/hidden": 0.2060546875, + "loss/logits": 0.036865890957415104, + "loss/reg": 0.02620592899620533, + "step": 249 + }, + { + "epoch": 0.125, + "grad_norm": 1.1339526176452637, + "grad_norm_var": 0.7471780769863924, + "learning_rate": 2e-05, + "loss": 0.4488, + "loss/crossentropy": 2.309004545211792, + "loss/hidden": 0.1650390625, + "loss/logits": 0.021719621494412422, + "loss/reg": 0.0262046679854393, + "step": 250 + }, + { + "epoch": 0.1255, + "grad_norm": 1.6961034536361694, + "grad_norm_var": 0.7455897121963819, + "learning_rate": 2e-05, + "loss": 0.4853, + "loss/crossentropy": 2.3145781755447388, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03134002722799778, + "loss/reg": 0.02620331011712551, + "step": 251 + }, + { + "epoch": 0.126, + "grad_norm": 1.137927532196045, + "grad_norm_var": 0.760479623698442, + "learning_rate": 2e-05, + "loss": 0.4469, + "loss/crossentropy": 2.371696949005127, + "loss/hidden": 0.1630859375, + "loss/logits": 0.021795951761305332, + "loss/reg": 0.02620198018848896, + "step": 252 + }, + { + "epoch": 0.1265, + "grad_norm": 1.1879764795303345, + "grad_norm_var": 0.7648019998891016, + "learning_rate": 2e-05, + "loss": 0.4938, + "loss/crossentropy": 2.3237900733947754, + "loss/hidden": 0.20703125, + "loss/logits": 0.02474562544375658, + "loss/reg": 0.026200512424111366, + "step": 253 + }, + { + "epoch": 0.127, + "grad_norm": 1.437303066253662, + "grad_norm_var": 0.7649964465157646, + "learning_rate": 2e-05, + "loss": 0.4641, + "loss/crossentropy": 2.4125940799713135, + "loss/hidden": 0.17626953125, + "loss/logits": 0.025886863470077515, + "loss/reg": 0.026198983192443848, + "step": 254 + }, + { + "epoch": 0.1275, + "grad_norm": 1.298660159111023, + "grad_norm_var": 0.7675955671113466, + "learning_rate": 2e-05, + "loss": 0.4572, + "loss/crossentropy": 2.531725764274597, + "loss/hidden": 0.16943359375, + "loss/logits": 0.025743640959262848, + "loss/reg": 0.02619752287864685, + "step": 255 + }, + { + "epoch": 0.128, + "grad_norm": 1.39458429813385, + "grad_norm_var": 0.7540231641910907, + "learning_rate": 2e-05, + "loss": 0.4862, + "loss/crossentropy": 2.186591327190399, + "loss/hidden": 0.20166015625, + "loss/logits": 0.022591713815927505, + "loss/reg": 0.026196002960205078, + "step": 256 + }, + { + "epoch": 0.1285, + "grad_norm": 1.212915062904358, + "grad_norm_var": 0.7646330349939954, + "learning_rate": 2e-05, + "loss": 0.5087, + "loss/crossentropy": 2.471588611602783, + "loss/hidden": 0.21337890625, + "loss/logits": 0.033330729231238365, + "loss/reg": 0.026194443926215172, + "step": 257 + }, + { + "epoch": 0.129, + "grad_norm": 1.0683094263076782, + "grad_norm_var": 0.10754076085599748, + "learning_rate": 2e-05, + "loss": 0.4712, + "loss/crossentropy": 2.2822721004486084, + "loss/hidden": 0.18115234375, + "loss/logits": 0.028153350576758385, + "loss/reg": 0.026192834600806236, + "step": 258 + }, + { + "epoch": 0.1295, + "grad_norm": 1.2983660697937012, + "grad_norm_var": 0.10787735781459536, + "learning_rate": 2e-05, + "loss": 0.5124, + "loss/crossentropy": 2.3575881719589233, + "loss/hidden": 0.22021484375, + "loss/logits": 0.03026559017598629, + "loss/reg": 0.02619129605591297, + "step": 259 + }, + { + "epoch": 0.13, + "grad_norm": 1.4962793588638306, + "grad_norm_var": 0.030134894623511776, + "learning_rate": 2e-05, + "loss": 0.4676, + "loss/crossentropy": 2.409846782684326, + "loss/hidden": 0.18212890625, + "loss/logits": 0.02358458936214447, + "loss/reg": 0.026189813390374184, + "step": 260 + }, + { + "epoch": 0.1305, + "grad_norm": 1.3754230737686157, + "grad_norm_var": 0.026719927934763098, + "learning_rate": 2e-05, + "loss": 0.5312, + "loss/crossentropy": 2.177566409111023, + "loss/hidden": 0.23876953125, + "loss/logits": 0.030562346801161766, + "loss/reg": 0.026188237592577934, + "step": 261 + }, + { + "epoch": 0.131, + "grad_norm": 1.342571496963501, + "grad_norm_var": 0.02660255745073622, + "learning_rate": 2e-05, + "loss": 0.4839, + "loss/crossentropy": 2.513023018836975, + "loss/hidden": 0.18896484375, + "loss/logits": 0.033111236058175564, + "loss/reg": 0.026186756789684296, + "step": 262 + }, + { + "epoch": 0.1315, + "grad_norm": 1.2367901802062988, + "grad_norm_var": 0.02460065001579365, + "learning_rate": 2e-05, + "loss": 0.4956, + "loss/crossentropy": 2.3763153553009033, + "loss/hidden": 0.20458984375, + "loss/logits": 0.029151923954486847, + "loss/reg": 0.02618517354130745, + "step": 263 + }, + { + "epoch": 0.132, + "grad_norm": 1.9415297508239746, + "grad_norm_var": 0.04904137234389789, + "learning_rate": 2e-05, + "loss": 0.5627, + "loss/crossentropy": 2.240867018699646, + "loss/hidden": 0.26611328125, + "loss/logits": 0.03479018062353134, + "loss/reg": 0.026183558627963066, + "step": 264 + }, + { + "epoch": 0.1325, + "grad_norm": 0.9934033751487732, + "grad_norm_var": 0.05701087259719828, + "learning_rate": 2e-05, + "loss": 0.4713, + "loss/crossentropy": 2.3560155630111694, + "loss/hidden": 0.18017578125, + "loss/logits": 0.029294829815626144, + "loss/reg": 0.026182031258940697, + "step": 265 + }, + { + "epoch": 0.133, + "grad_norm": 1.0373915433883667, + "grad_norm_var": 0.06009524379270439, + "learning_rate": 2e-05, + "loss": 0.494, + "loss/crossentropy": 2.400221347808838, + "loss/hidden": 0.2001953125, + "loss/logits": 0.031994713470339775, + "loss/reg": 0.026180392131209373, + "step": 266 + }, + { + "epoch": 0.1335, + "grad_norm": 1.267191767692566, + "grad_norm_var": 0.05021139115474562, + "learning_rate": 2e-05, + "loss": 0.5615, + "loss/crossentropy": 2.1523420810699463, + "loss/hidden": 0.2490234375, + "loss/logits": 0.05070135369896889, + "loss/reg": 0.026178674772381783, + "step": 267 + }, + { + "epoch": 0.134, + "grad_norm": 1.6182429790496826, + "grad_norm_var": 0.05454457187013859, + "learning_rate": 2e-05, + "loss": 0.5, + "loss/crossentropy": 2.299275279045105, + "loss/hidden": 0.20166015625, + "loss/logits": 0.03661351092159748, + "loss/reg": 0.02617703191936016, + "step": 268 + }, + { + "epoch": 0.1345, + "grad_norm": 1.1830179691314697, + "grad_norm_var": 0.05463698624171962, + "learning_rate": 2e-05, + "loss": 0.542, + "loss/crossentropy": 2.237685799598694, + "loss/hidden": 0.24462890625, + "loss/logits": 0.03558643162250519, + "loss/reg": 0.02617518976330757, + "step": 269 + }, + { + "epoch": 0.135, + "grad_norm": 1.0215861797332764, + "grad_norm_var": 0.05922028974216963, + "learning_rate": 2e-05, + "loss": 0.4509, + "loss/crossentropy": 2.386792778968811, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02410146687179804, + "loss/reg": 0.026173612102866173, + "step": 270 + }, + { + "epoch": 0.1355, + "grad_norm": 1.2516766786575317, + "grad_norm_var": 0.059361270037086425, + "learning_rate": 2e-05, + "loss": 0.5417, + "loss/crossentropy": 2.2572768926620483, + "loss/hidden": 0.23974609375, + "loss/logits": 0.04025658965110779, + "loss/reg": 0.026171868667006493, + "step": 271 + }, + { + "epoch": 0.136, + "grad_norm": 1.1899913549423218, + "grad_norm_var": 0.05929371602914331, + "learning_rate": 2e-05, + "loss": 0.4991, + "loss/crossentropy": 2.5554966926574707, + "loss/hidden": 0.208984375, + "loss/logits": 0.028443695977330208, + "loss/reg": 0.026170162484049797, + "step": 272 + }, + { + "epoch": 0.1365, + "grad_norm": 1.716871976852417, + "grad_norm_var": 0.0704296166532296, + "learning_rate": 2e-05, + "loss": 0.512, + "loss/crossentropy": 2.3532203435897827, + "loss/hidden": 0.2060546875, + "loss/logits": 0.04425806552171707, + "loss/reg": 0.026168543845415115, + "step": 273 + }, + { + "epoch": 0.137, + "grad_norm": 1.4646930694580078, + "grad_norm_var": 0.06721621691666481, + "learning_rate": 2e-05, + "loss": 0.5178, + "loss/crossentropy": 2.343596935272217, + "loss/hidden": 0.22119140625, + "loss/logits": 0.034918731078505516, + "loss/reg": 0.026166997849941254, + "step": 274 + }, + { + "epoch": 0.1375, + "grad_norm": 1.0874199867248535, + "grad_norm_var": 0.07115961720678651, + "learning_rate": 2e-05, + "loss": 0.4609, + "loss/crossentropy": 2.172752797603607, + "loss/hidden": 0.1748046875, + "loss/logits": 0.02447379007935524, + "loss/reg": 0.026165394112467766, + "step": 275 + }, + { + "epoch": 0.138, + "grad_norm": 1.1732720136642456, + "grad_norm_var": 0.07036869627631123, + "learning_rate": 2e-05, + "loss": 0.4846, + "loss/crossentropy": 2.4434475898742676, + "loss/hidden": 0.1943359375, + "loss/logits": 0.02860554587095976, + "loss/reg": 0.026163768023252487, + "step": 276 + }, + { + "epoch": 0.1385, + "grad_norm": 1.5107827186584473, + "grad_norm_var": 0.07276105744027898, + "learning_rate": 2e-05, + "loss": 0.5892, + "loss/crossentropy": 2.5824127197265625, + "loss/hidden": 0.287109375, + "loss/logits": 0.04050498828291893, + "loss/reg": 0.026162203401327133, + "step": 277 + }, + { + "epoch": 0.139, + "grad_norm": 1.420068621635437, + "grad_norm_var": 0.07342361868488892, + "learning_rate": 2e-05, + "loss": 0.5488, + "loss/crossentropy": 2.234652519226074, + "loss/hidden": 0.2470703125, + "loss/logits": 0.04010407812893391, + "loss/reg": 0.026160722598433495, + "step": 278 + }, + { + "epoch": 0.1395, + "grad_norm": 0.9972831010818481, + "grad_norm_var": 0.0796539769611795, + "learning_rate": 2e-05, + "loss": 0.4622, + "loss/crossentropy": 2.396607279777527, + "loss/hidden": 0.17041015625, + "loss/logits": 0.030245795845985413, + "loss/reg": 0.026159239932894707, + "step": 279 + }, + { + "epoch": 0.14, + "grad_norm": 2.338772773742676, + "grad_norm_var": 0.1232493317334492, + "learning_rate": 2e-05, + "loss": 0.5912, + "loss/crossentropy": 2.0176676511764526, + "loss/hidden": 0.27783203125, + "loss/logits": 0.05181618873029947, + "loss/reg": 0.026157628744840622, + "step": 280 + }, + { + "epoch": 0.1405, + "grad_norm": 1.2386250495910645, + "grad_norm_var": 0.11601927811151326, + "learning_rate": 2e-05, + "loss": 0.454, + "loss/crossentropy": 2.2258787155151367, + "loss/hidden": 0.16943359375, + "loss/logits": 0.02302493341267109, + "loss/reg": 0.02615603432059288, + "step": 281 + }, + { + "epoch": 0.141, + "grad_norm": 3.4386959075927734, + "grad_norm_var": 0.37798476794662456, + "learning_rate": 2e-05, + "loss": 0.6987, + "loss/crossentropy": 2.291516423225403, + "loss/hidden": 0.39892578125, + "loss/logits": 0.038274774327874184, + "loss/reg": 0.026154499500989914, + "step": 282 + }, + { + "epoch": 0.1415, + "grad_norm": 2.358877658843994, + "grad_norm_var": 0.4193280072280107, + "learning_rate": 2e-05, + "loss": 0.5369, + "loss/crossentropy": 2.0343876481056213, + "loss/hidden": 0.2392578125, + "loss/logits": 0.036153580993413925, + "loss/reg": 0.026152830570936203, + "step": 283 + }, + { + "epoch": 0.142, + "grad_norm": 1.7734060287475586, + "grad_norm_var": 0.42197319133869365, + "learning_rate": 2e-05, + "loss": 0.5995, + "loss/crossentropy": 2.410479426383972, + "loss/hidden": 0.28271484375, + "loss/logits": 0.055306099355220795, + "loss/reg": 0.026151135563850403, + "step": 284 + }, + { + "epoch": 0.1425, + "grad_norm": 1.7683607339859009, + "grad_norm_var": 0.4129653376453054, + "learning_rate": 2e-05, + "loss": 0.4774, + "loss/crossentropy": 2.422680377960205, + "loss/hidden": 0.17138671875, + "loss/logits": 0.04454575851559639, + "loss/reg": 0.026149341836571693, + "step": 285 + }, + { + "epoch": 0.143, + "grad_norm": 1.890203833580017, + "grad_norm_var": 0.3920434322764975, + "learning_rate": 2e-05, + "loss": 0.6648, + "loss/crossentropy": 2.3643598556518555, + "loss/hidden": 0.3388671875, + "loss/logits": 0.06448590569198132, + "loss/reg": 0.02614753320813179, + "step": 286 + }, + { + "epoch": 0.1435, + "grad_norm": 1.29023015499115, + "grad_norm_var": 0.39001840335736465, + "learning_rate": 2e-05, + "loss": 0.4522, + "loss/crossentropy": 2.5188199281692505, + "loss/hidden": 0.16748046875, + "loss/logits": 0.02329123578965664, + "loss/reg": 0.02614591456949711, + "step": 287 + }, + { + "epoch": 0.144, + "grad_norm": 10.653407096862793, + "grad_norm_var": 5.386538257885738, + "learning_rate": 2e-05, + "loss": 0.5673, + "loss/crossentropy": 2.5609625577926636, + "loss/hidden": 0.27880859375, + "loss/logits": 0.02702578529715538, + "loss/reg": 0.026144322007894516, + "step": 288 + }, + { + "epoch": 0.1445, + "grad_norm": 1.2127407789230347, + "grad_norm_var": 5.43876626293414, + "learning_rate": 2e-05, + "loss": 0.5081, + "loss/crossentropy": 2.4100780487060547, + "loss/hidden": 0.21044921875, + "loss/logits": 0.03620042186230421, + "loss/reg": 0.02614261396229267, + "step": 289 + }, + { + "epoch": 0.145, + "grad_norm": 1.4402183294296265, + "grad_norm_var": 5.4412882443615285, + "learning_rate": 2e-05, + "loss": 0.4768, + "loss/crossentropy": 2.271009087562561, + "loss/hidden": 0.189453125, + "loss/logits": 0.02590431459248066, + "loss/reg": 0.026140958070755005, + "step": 290 + }, + { + "epoch": 0.1455, + "grad_norm": 1.5095008611679077, + "grad_norm_var": 5.388429514304694, + "learning_rate": 2e-05, + "loss": 0.5205, + "loss/crossentropy": 2.3384816646575928, + "loss/hidden": 0.22265625, + "loss/logits": 0.036461083218455315, + "loss/reg": 0.026139242574572563, + "step": 291 + }, + { + "epoch": 0.146, + "grad_norm": 1.1620471477508545, + "grad_norm_var": 5.390050224047064, + "learning_rate": 2e-05, + "loss": 0.4969, + "loss/crossentropy": 2.433851480484009, + "loss/hidden": 0.20068359375, + "loss/logits": 0.0348251610994339, + "loss/reg": 0.026137609034776688, + "step": 292 + }, + { + "epoch": 0.1465, + "grad_norm": 1.4650121927261353, + "grad_norm_var": 5.394693634841302, + "learning_rate": 2e-05, + "loss": 0.4378, + "loss/crossentropy": 2.5522985458374023, + "loss/hidden": 0.154296875, + "loss/logits": 0.022188137285411358, + "loss/reg": 0.02613597922027111, + "step": 293 + }, + { + "epoch": 0.147, + "grad_norm": 1.9892516136169434, + "grad_norm_var": 5.352159159580765, + "learning_rate": 2e-05, + "loss": 0.5504, + "loss/crossentropy": 2.1993319392204285, + "loss/hidden": 0.24267578125, + "loss/logits": 0.04638373479247093, + "loss/reg": 0.026134170591831207, + "step": 294 + }, + { + "epoch": 0.1475, + "grad_norm": 1.465166687965393, + "grad_norm_var": 5.285637901292613, + "learning_rate": 2e-05, + "loss": 0.494, + "loss/crossentropy": 2.223472237586975, + "loss/hidden": 0.19287109375, + "loss/logits": 0.03982667811214924, + "loss/reg": 0.026132365688681602, + "step": 295 + }, + { + "epoch": 0.148, + "grad_norm": 2.5565261840820312, + "grad_norm_var": 5.2893741834582775, + "learning_rate": 2e-05, + "loss": 0.5916, + "loss/crossentropy": 2.2553144693374634, + "loss/hidden": 0.27392578125, + "loss/logits": 0.056398073211312294, + "loss/reg": 0.026130499318242073, + "step": 296 + }, + { + "epoch": 0.1485, + "grad_norm": 1.2621214389801025, + "grad_norm_var": 5.286002834073586, + "learning_rate": 2e-05, + "loss": 0.4855, + "loss/crossentropy": 2.241385817527771, + "loss/hidden": 0.1953125, + "loss/logits": 0.028942352160811424, + "loss/reg": 0.026128675788640976, + "step": 297 + }, + { + "epoch": 0.149, + "grad_norm": 1.841597080230713, + "grad_norm_var": 5.2087414924686675, + "learning_rate": 2e-05, + "loss": 0.5784, + "loss/crossentropy": 2.296829104423523, + "loss/hidden": 0.2802734375, + "loss/logits": 0.03681251127272844, + "loss/reg": 0.026126863434910774, + "step": 298 + }, + { + "epoch": 0.1495, + "grad_norm": 1.258812427520752, + "grad_norm_var": 5.265093383729075, + "learning_rate": 2e-05, + "loss": 0.492, + "loss/crossentropy": 2.4392940998077393, + "loss/hidden": 0.20166015625, + "loss/logits": 0.0290931249037385, + "loss/reg": 0.026125235483050346, + "step": 299 + }, + { + "epoch": 0.15, + "grad_norm": 1.3167269229888916, + "grad_norm_var": 5.3015866088773915, + "learning_rate": 2e-05, + "loss": 0.4889, + "loss/crossentropy": 2.401396870613098, + "loss/hidden": 0.19775390625, + "loss/logits": 0.029924746602773666, + "loss/reg": 0.02612358331680298, + "step": 300 + }, + { + "epoch": 0.1505, + "grad_norm": 1.6229465007781982, + "grad_norm_var": 5.309922187137865, + "learning_rate": 2e-05, + "loss": 0.5287, + "loss/crossentropy": 2.36386775970459, + "loss/hidden": 0.2255859375, + "loss/logits": 0.04194306582212448, + "loss/reg": 0.02612200565636158, + "step": 301 + }, + { + "epoch": 0.151, + "grad_norm": 1.777891755104065, + "grad_norm_var": 5.31416719857012, + "learning_rate": 2e-05, + "loss": 0.5103, + "loss/crossentropy": 2.4092063903808594, + "loss/hidden": 0.2138671875, + "loss/logits": 0.03518137149512768, + "loss/reg": 0.026120424270629883, + "step": 302 + }, + { + "epoch": 0.1515, + "grad_norm": 1.1520640850067139, + "grad_norm_var": 5.330536147630553, + "learning_rate": 2e-05, + "loss": 0.5057, + "loss/crossentropy": 2.2741400003433228, + "loss/hidden": 0.21142578125, + "loss/logits": 0.03307824395596981, + "loss/reg": 0.0261188056319952, + "step": 303 + }, + { + "epoch": 0.152, + "grad_norm": 1.2208960056304932, + "grad_norm_var": 0.1407175104424084, + "learning_rate": 2e-05, + "loss": 0.4876, + "loss/crossentropy": 2.202619433403015, + "loss/hidden": 0.19140625, + "loss/logits": 0.03499259799718857, + "loss/reg": 0.026117179542779922, + "step": 304 + }, + { + "epoch": 0.1525, + "grad_norm": 1.2294107675552368, + "grad_norm_var": 0.14006117928402068, + "learning_rate": 2e-05, + "loss": 0.4935, + "loss/crossentropy": 2.3829336166381836, + "loss/hidden": 0.1982421875, + "loss/logits": 0.03412310779094696, + "loss/reg": 0.026115482673048973, + "step": 305 + }, + { + "epoch": 0.153, + "grad_norm": 1.2149832248687744, + "grad_norm_var": 0.1455343172725079, + "learning_rate": 2e-05, + "loss": 0.4618, + "loss/crossentropy": 2.3216532468795776, + "loss/hidden": 0.17529296875, + "loss/logits": 0.025372054427862167, + "loss/reg": 0.02611370198428631, + "step": 306 + }, + { + "epoch": 0.1535, + "grad_norm": 1.4385122060775757, + "grad_norm_var": 0.14578594604365136, + "learning_rate": 2e-05, + "loss": 0.51, + "loss/crossentropy": 2.449226975440979, + "loss/hidden": 0.20654296875, + "loss/logits": 0.04237618204206228, + "loss/reg": 0.026111874729394913, + "step": 307 + }, + { + "epoch": 0.154, + "grad_norm": 1.118850588798523, + "grad_norm_var": 0.14783964943001873, + "learning_rate": 2e-05, + "loss": 0.4752, + "loss/crossentropy": 2.3721216917037964, + "loss/hidden": 0.1865234375, + "loss/logits": 0.027595724910497665, + "loss/reg": 0.0261102132499218, + "step": 308 + }, + { + "epoch": 0.1545, + "grad_norm": 1.3892052173614502, + "grad_norm_var": 0.14850872616204683, + "learning_rate": 2e-05, + "loss": 0.4986, + "loss/crossentropy": 2.3415383100509644, + "loss/hidden": 0.205078125, + "loss/logits": 0.03244547359645367, + "loss/reg": 0.02610846608877182, + "step": 309 + }, + { + "epoch": 0.155, + "grad_norm": 1.1625828742980957, + "grad_norm_var": 0.13629436785804921, + "learning_rate": 2e-05, + "loss": 0.4995, + "loss/crossentropy": 2.3235228061676025, + "loss/hidden": 0.2099609375, + "loss/logits": 0.028443023562431335, + "loss/reg": 0.026106812059879303, + "step": 310 + }, + { + "epoch": 0.1555, + "grad_norm": 1.27708899974823, + "grad_norm_var": 0.1378557412128671, + "learning_rate": 2e-05, + "loss": 0.517, + "loss/crossentropy": 2.4090656042099, + "loss/hidden": 0.220703125, + "loss/logits": 0.035252392292022705, + "loss/reg": 0.026105303317308426, + "step": 311 + }, + { + "epoch": 0.156, + "grad_norm": 1.1573866605758667, + "grad_norm_var": 0.049585704844170665, + "learning_rate": 2e-05, + "loss": 0.509, + "loss/crossentropy": 2.1680856943130493, + "loss/hidden": 0.2158203125, + "loss/logits": 0.03210577368736267, + "loss/reg": 0.026103774085640907, + "step": 312 + }, + { + "epoch": 0.1565, + "grad_norm": 1.265214443206787, + "grad_norm_var": 0.04955415784550207, + "learning_rate": 2e-05, + "loss": 0.4997, + "loss/crossentropy": 2.3472299575805664, + "loss/hidden": 0.20458984375, + "loss/logits": 0.03409944660961628, + "loss/reg": 0.026102419942617416, + "step": 313 + }, + { + "epoch": 0.157, + "grad_norm": 1.9698238372802734, + "grad_norm_var": 0.05915308914134864, + "learning_rate": 2e-05, + "loss": 0.5882, + "loss/crossentropy": 2.3045787811279297, + "loss/hidden": 0.27392578125, + "loss/logits": 0.05324920453131199, + "loss/reg": 0.02610074356198311, + "step": 314 + }, + { + "epoch": 0.1575, + "grad_norm": 1.3615264892578125, + "grad_norm_var": 0.058587269718664695, + "learning_rate": 2e-05, + "loss": 0.5097, + "loss/crossentropy": 2.010044515132904, + "loss/hidden": 0.2138671875, + "loss/logits": 0.0348639115691185, + "loss/reg": 0.026099352166056633, + "step": 315 + }, + { + "epoch": 0.158, + "grad_norm": 1.450539231300354, + "grad_norm_var": 0.05902897578693942, + "learning_rate": 2e-05, + "loss": 0.5259, + "loss/crossentropy": 2.258197784423828, + "loss/hidden": 0.22412109375, + "loss/logits": 0.040814803913235664, + "loss/reg": 0.026097897440195084, + "step": 316 + }, + { + "epoch": 0.1585, + "grad_norm": 1.2342588901519775, + "grad_norm_var": 0.055002612504784484, + "learning_rate": 2e-05, + "loss": 0.5114, + "loss/crossentropy": 2.450056791305542, + "loss/hidden": 0.212890625, + "loss/logits": 0.03752759099006653, + "loss/reg": 0.026096461340785027, + "step": 317 + }, + { + "epoch": 0.159, + "grad_norm": 1.5264660120010376, + "grad_norm_var": 0.04423249803009378, + "learning_rate": 2e-05, + "loss": 0.5069, + "loss/crossentropy": 2.3556759357452393, + "loss/hidden": 0.212890625, + "loss/logits": 0.03307069279253483, + "loss/reg": 0.026094747707247734, + "step": 318 + }, + { + "epoch": 0.1595, + "grad_norm": 1.394983172416687, + "grad_norm_var": 0.04238248493052742, + "learning_rate": 2e-05, + "loss": 0.4826, + "loss/crossentropy": 2.3402878046035767, + "loss/hidden": 0.19384765625, + "loss/logits": 0.027848311699926853, + "loss/reg": 0.026093317195773125, + "step": 319 + }, + { + "epoch": 0.16, + "grad_norm": 1.3035892248153687, + "grad_norm_var": 0.04151614019492621, + "learning_rate": 2e-05, + "loss": 0.5366, + "loss/crossentropy": 2.4592941999435425, + "loss/hidden": 0.2333984375, + "loss/logits": 0.04223489202558994, + "loss/reg": 0.026091884821653366, + "step": 320 + }, + { + "epoch": 0.1605, + "grad_norm": 1.8944873809814453, + "grad_norm_var": 0.05905324081961657, + "learning_rate": 2e-05, + "loss": 0.5082, + "loss/crossentropy": 2.2413108348846436, + "loss/hidden": 0.21142578125, + "loss/logits": 0.03591745160520077, + "loss/reg": 0.026090335100889206, + "step": 321 + }, + { + "epoch": 0.161, + "grad_norm": 2.45639705657959, + "grad_norm_var": 0.12723620805793795, + "learning_rate": 2e-05, + "loss": 0.6455, + "loss/crossentropy": 1.9915293455123901, + "loss/hidden": 0.3408203125, + "loss/logits": 0.043817924335598946, + "loss/reg": 0.02608887292444706, + "step": 322 + }, + { + "epoch": 0.1615, + "grad_norm": 1.7480417490005493, + "grad_norm_var": 0.13223189773490632, + "learning_rate": 2e-05, + "loss": 0.5439, + "loss/crossentropy": 2.4047733545303345, + "loss/hidden": 0.22265625, + "loss/logits": 0.06036931276321411, + "loss/reg": 0.026087457314133644, + "step": 323 + }, + { + "epoch": 0.162, + "grad_norm": 1.853732705116272, + "grad_norm_var": 0.1304117384352215, + "learning_rate": 2e-05, + "loss": 0.4878, + "loss/crossentropy": 2.5980257987976074, + "loss/hidden": 0.189453125, + "loss/logits": 0.037442656233906746, + "loss/reg": 0.02608575113117695, + "step": 324 + }, + { + "epoch": 0.1625, + "grad_norm": 1.894579291343689, + "grad_norm_var": 0.13703325521176069, + "learning_rate": 2e-05, + "loss": 0.6235, + "loss/crossentropy": 2.3670873641967773, + "loss/hidden": 0.2626953125, + "loss/logits": 0.09991350024938583, + "loss/reg": 0.02608424238860607, + "step": 325 + }, + { + "epoch": 0.163, + "grad_norm": 1.3630772829055786, + "grad_norm_var": 0.1289371841207372, + "learning_rate": 2e-05, + "loss": 0.5014, + "loss/crossentropy": 2.1478903889656067, + "loss/hidden": 0.2099609375, + "loss/logits": 0.030608470551669598, + "loss/reg": 0.026082569733262062, + "step": 326 + }, + { + "epoch": 0.1635, + "grad_norm": 1.2252777814865112, + "grad_norm_var": 0.13114190368244535, + "learning_rate": 2e-05, + "loss": 0.5137, + "loss/crossentropy": 2.228654980659485, + "loss/hidden": 0.216796875, + "loss/logits": 0.036048877984285355, + "loss/reg": 0.026081033051013947, + "step": 327 + }, + { + "epoch": 0.164, + "grad_norm": 1.1830672025680542, + "grad_norm_var": 0.12977471644483457, + "learning_rate": 2e-05, + "loss": 0.4567, + "loss/crossentropy": 2.5576133728027344, + "loss/hidden": 0.16796875, + "loss/logits": 0.02798423543572426, + "loss/reg": 0.026079514995217323, + "step": 328 + }, + { + "epoch": 0.1645, + "grad_norm": 1.9584394693374634, + "grad_norm_var": 0.13160920382139138, + "learning_rate": 2e-05, + "loss": 0.5043, + "loss/crossentropy": 2.321745753288269, + "loss/hidden": 0.2109375, + "loss/logits": 0.032608283683657646, + "loss/reg": 0.02607780508697033, + "step": 329 + }, + { + "epoch": 0.165, + "grad_norm": 2.176175355911255, + "grad_norm_var": 0.14407030947683092, + "learning_rate": 2e-05, + "loss": 0.5287, + "loss/crossentropy": 2.5101382732391357, + "loss/hidden": 0.23388671875, + "loss/logits": 0.03408687189221382, + "loss/reg": 0.026076283305883408, + "step": 330 + }, + { + "epoch": 0.1655, + "grad_norm": 1.3150840997695923, + "grad_norm_var": 0.14584616287976904, + "learning_rate": 2e-05, + "loss": 0.5012, + "loss/crossentropy": 2.4776118993759155, + "loss/hidden": 0.20751953125, + "loss/logits": 0.032900793477892876, + "loss/reg": 0.026074586436152458, + "step": 331 + }, + { + "epoch": 0.166, + "grad_norm": 1.6297320127487183, + "grad_norm_var": 0.14371731927044115, + "learning_rate": 2e-05, + "loss": 0.5161, + "loss/crossentropy": 2.4321337938308716, + "loss/hidden": 0.22119140625, + "loss/logits": 0.03421984426677227, + "loss/reg": 0.02607305720448494, + "step": 332 + }, + { + "epoch": 0.1665, + "grad_norm": 1.3825711011886597, + "grad_norm_var": 0.13717068867274657, + "learning_rate": 2e-05, + "loss": 0.4764, + "loss/crossentropy": 2.212525486946106, + "loss/hidden": 0.18701171875, + "loss/logits": 0.028680726885795593, + "loss/reg": 0.026071617379784584, + "step": 333 + }, + { + "epoch": 0.167, + "grad_norm": 1.1411490440368652, + "grad_norm_var": 0.15249385172816404, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.309118866920471, + "loss/hidden": 0.1806640625, + "loss/logits": 0.028305926360189915, + "loss/reg": 0.026069944724440575, + "step": 334 + }, + { + "epoch": 0.1675, + "grad_norm": 1.5472488403320312, + "grad_norm_var": 0.14937410499563786, + "learning_rate": 2e-05, + "loss": 0.5375, + "loss/crossentropy": 2.2855429649353027, + "loss/hidden": 0.24072265625, + "loss/logits": 0.03612595796585083, + "loss/reg": 0.026068488135933876, + "step": 335 + }, + { + "epoch": 0.168, + "grad_norm": 5.691341400146484, + "grad_norm_var": 1.161954663002865, + "learning_rate": 2e-05, + "loss": 0.5703, + "loss/crossentropy": 2.4927643537521362, + "loss/hidden": 0.26953125, + "loss/logits": 0.04008688498288393, + "loss/reg": 0.026066886261105537, + "step": 336 + }, + { + "epoch": 0.1685, + "grad_norm": 2.456817388534546, + "grad_norm_var": 1.1810217336131037, + "learning_rate": 2e-05, + "loss": 0.7493, + "loss/crossentropy": 2.5297993421554565, + "loss/hidden": 0.37744140625, + "loss/logits": 0.11121575441211462, + "loss/reg": 0.026065362617373466, + "step": 337 + }, + { + "epoch": 0.169, + "grad_norm": 1.8542064428329468, + "grad_norm_var": 1.1621370785149523, + "learning_rate": 2e-05, + "loss": 0.5021, + "loss/crossentropy": 2.4378503561019897, + "loss/hidden": 0.2021484375, + "loss/logits": 0.03930371440947056, + "loss/reg": 0.026063820347189903, + "step": 338 + }, + { + "epoch": 0.1695, + "grad_norm": 1.8168195486068726, + "grad_norm_var": 1.1610274406018892, + "learning_rate": 2e-05, + "loss": 0.4994, + "loss/crossentropy": 2.1696581840515137, + "loss/hidden": 0.201171875, + "loss/logits": 0.03763199597597122, + "loss/reg": 0.02606227435171604, + "step": 339 + }, + { + "epoch": 0.17, + "grad_norm": 1.1088804006576538, + "grad_norm_var": 1.20085213579918, + "learning_rate": 2e-05, + "loss": 0.4517, + "loss/crossentropy": 2.512749671936035, + "loss/hidden": 0.16748046875, + "loss/logits": 0.02365376614034176, + "loss/reg": 0.026060722768306732, + "step": 340 + }, + { + "epoch": 0.1705, + "grad_norm": 1.490470051765442, + "grad_norm_var": 1.2091431686160143, + "learning_rate": 2e-05, + "loss": 0.4908, + "loss/crossentropy": 2.487561345100403, + "loss/hidden": 0.2001953125, + "loss/logits": 0.030045345425605774, + "loss/reg": 0.026059186086058617, + "step": 341 + }, + { + "epoch": 0.171, + "grad_norm": 1.0408867597579956, + "grad_norm_var": 1.2358515542870572, + "learning_rate": 2e-05, + "loss": 0.4727, + "loss/crossentropy": 2.2461780309677124, + "loss/hidden": 0.1865234375, + "loss/logits": 0.025552313774824142, + "loss/reg": 0.02605745941400528, + "step": 342 + }, + { + "epoch": 0.1715, + "grad_norm": 1.5709069967269897, + "grad_norm_var": 1.2162039640705784, + "learning_rate": 2e-05, + "loss": 0.4876, + "loss/crossentropy": 2.2603927850723267, + "loss/hidden": 0.19775390625, + "loss/logits": 0.02933008223772049, + "loss/reg": 0.026055721566081047, + "step": 343 + }, + { + "epoch": 0.172, + "grad_norm": 1.2913998365402222, + "grad_norm_var": 1.2075172882359821, + "learning_rate": 2e-05, + "loss": 0.5198, + "loss/crossentropy": 2.196273446083069, + "loss/hidden": 0.2216796875, + "loss/logits": 0.03755245357751846, + "loss/reg": 0.026053981855511665, + "step": 344 + }, + { + "epoch": 0.1725, + "grad_norm": 1.2019914388656616, + "grad_norm_var": 1.2315373969600656, + "learning_rate": 2e-05, + "loss": 0.4743, + "loss/crossentropy": 2.2306121587753296, + "loss/hidden": 0.18994140625, + "loss/logits": 0.023790341801941395, + "loss/reg": 0.02605227194726467, + "step": 345 + }, + { + "epoch": 0.173, + "grad_norm": 1.3895491361618042, + "grad_norm_var": 1.2302038798211163, + "learning_rate": 2e-05, + "loss": 0.4963, + "loss/crossentropy": 2.4988722801208496, + "loss/hidden": 0.2080078125, + "loss/logits": 0.02780199982225895, + "loss/reg": 0.02605038322508335, + "step": 346 + }, + { + "epoch": 0.1735, + "grad_norm": 1.50831937789917, + "grad_norm_var": 1.2214463856538722, + "learning_rate": 2e-05, + "loss": 0.4932, + "loss/crossentropy": 2.409302234649658, + "loss/hidden": 0.19921875, + "loss/logits": 0.03346476424485445, + "loss/reg": 0.02604857087135315, + "step": 347 + }, + { + "epoch": 0.174, + "grad_norm": 1.7516964673995972, + "grad_norm_var": 1.220296012686515, + "learning_rate": 2e-05, + "loss": 0.5642, + "loss/crossentropy": 2.2191531658172607, + "loss/hidden": 0.25927734375, + "loss/logits": 0.04449588805437088, + "loss/reg": 0.02604682371020317, + "step": 348 + }, + { + "epoch": 0.1745, + "grad_norm": 2.329723358154297, + "grad_norm_var": 1.228035235727617, + "learning_rate": 2e-05, + "loss": 0.5682, + "loss/crossentropy": 2.1749590635299683, + "loss/hidden": 0.2646484375, + "loss/logits": 0.04313355404883623, + "loss/reg": 0.026045063510537148, + "step": 349 + }, + { + "epoch": 0.175, + "grad_norm": 1.3271498680114746, + "grad_norm_var": 1.2132512460490317, + "learning_rate": 2e-05, + "loss": 0.45, + "loss/crossentropy": 2.588584542274475, + "loss/hidden": 0.16650390625, + "loss/logits": 0.02306409552693367, + "loss/reg": 0.026043301448225975, + "step": 350 + }, + { + "epoch": 0.1755, + "grad_norm": 1.875108003616333, + "grad_norm_var": 1.2073429995003617, + "learning_rate": 2e-05, + "loss": 0.5202, + "loss/crossentropy": 2.287582039833069, + "loss/hidden": 0.20703125, + "loss/logits": 0.052729660645127296, + "loss/reg": 0.026041487231850624, + "step": 351 + }, + { + "epoch": 0.176, + "grad_norm": 1.146688461303711, + "grad_norm_var": 0.1745214276224876, + "learning_rate": 2e-05, + "loss": 0.4424, + "loss/crossentropy": 2.3722145557403564, + "loss/hidden": 0.159912109375, + "loss/logits": 0.022133183665573597, + "loss/reg": 0.026039764285087585, + "step": 352 + }, + { + "epoch": 0.1765, + "grad_norm": 1.562357783317566, + "grad_norm_var": 0.11906480060907014, + "learning_rate": 2e-05, + "loss": 0.5252, + "loss/crossentropy": 2.2052918672561646, + "loss/hidden": 0.2275390625, + "loss/logits": 0.0372452475130558, + "loss/reg": 0.02603817544877529, + "step": 353 + }, + { + "epoch": 0.177, + "grad_norm": 1.3673954010009766, + "grad_norm_var": 0.11196520379043946, + "learning_rate": 2e-05, + "loss": 0.462, + "loss/crossentropy": 2.3004499673843384, + "loss/hidden": 0.17578125, + "loss/logits": 0.025897801853716373, + "loss/reg": 0.026036500930786133, + "step": 354 + }, + { + "epoch": 0.1775, + "grad_norm": 1.2918845415115356, + "grad_norm_var": 0.10604762311465758, + "learning_rate": 2e-05, + "loss": 0.4731, + "loss/crossentropy": 2.265425443649292, + "loss/hidden": 0.18701171875, + "loss/logits": 0.025708286091685295, + "loss/reg": 0.026034945622086525, + "step": 355 + }, + { + "epoch": 0.178, + "grad_norm": 7.662310600280762, + "grad_norm_var": 2.4892246344001143, + "learning_rate": 2e-05, + "loss": 0.5369, + "loss/crossentropy": 2.398472547531128, + "loss/hidden": 0.2412109375, + "loss/logits": 0.0353584922850132, + "loss/reg": 0.026033204048871994, + "step": 356 + }, + { + "epoch": 0.1785, + "grad_norm": 1.422759771347046, + "grad_norm_var": 2.492874299968556, + "learning_rate": 2e-05, + "loss": 0.5149, + "loss/crossentropy": 2.226934790611267, + "loss/hidden": 0.220703125, + "loss/logits": 0.03390590753406286, + "loss/reg": 0.026031551882624626, + "step": 357 + }, + { + "epoch": 0.179, + "grad_norm": 1.271759271621704, + "grad_norm_var": 2.471029150963487, + "learning_rate": 2e-05, + "loss": 0.5199, + "loss/crossentropy": 2.3659080266952515, + "loss/hidden": 0.21630859375, + "loss/logits": 0.04329786077141762, + "loss/reg": 0.02602977305650711, + "step": 358 + }, + { + "epoch": 0.1795, + "grad_norm": 1.2337300777435303, + "grad_norm_var": 2.491724270181853, + "learning_rate": 2e-05, + "loss": 0.5058, + "loss/crossentropy": 2.3398635387420654, + "loss/hidden": 0.20849609375, + "loss/logits": 0.0370652936398983, + "loss/reg": 0.026028025895357132, + "step": 359 + }, + { + "epoch": 0.18, + "grad_norm": 1.1331290006637573, + "grad_norm_var": 2.505122499555146, + "learning_rate": 2e-05, + "loss": 0.4673, + "loss/crossentropy": 2.4402376413345337, + "loss/hidden": 0.17919921875, + "loss/logits": 0.027860145084559917, + "loss/reg": 0.026026224717497826, + "step": 360 + }, + { + "epoch": 0.1805, + "grad_norm": 1.8800278902053833, + "grad_norm_var": 2.475975881369847, + "learning_rate": 2e-05, + "loss": 0.5447, + "loss/crossentropy": 2.1927571296691895, + "loss/hidden": 0.24658203125, + "loss/logits": 0.037921242415905, + "loss/reg": 0.02602434903383255, + "step": 361 + }, + { + "epoch": 0.181, + "grad_norm": 1.1613508462905884, + "grad_norm_var": 2.4942931489268525, + "learning_rate": 2e-05, + "loss": 0.4629, + "loss/crossentropy": 2.3627922534942627, + "loss/hidden": 0.173828125, + "loss/logits": 0.02883315272629261, + "loss/reg": 0.026022551581263542, + "step": 362 + }, + { + "epoch": 0.1815, + "grad_norm": 1.2477275133132935, + "grad_norm_var": 2.5111159165951857, + "learning_rate": 2e-05, + "loss": 0.5399, + "loss/crossentropy": 2.3385051488876343, + "loss/hidden": 0.23779296875, + "loss/logits": 0.041944630444049835, + "loss/reg": 0.026020534336566925, + "step": 363 + }, + { + "epoch": 0.182, + "grad_norm": 1.0904345512390137, + "grad_norm_var": 2.547469450312644, + "learning_rate": 2e-05, + "loss": 0.4478, + "loss/crossentropy": 2.353084683418274, + "loss/hidden": 0.16259765625, + "loss/logits": 0.02504115179181099, + "loss/reg": 0.0260187778621912, + "step": 364 + }, + { + "epoch": 0.1825, + "grad_norm": 1.6713463068008423, + "grad_norm_var": 2.5291763950799013, + "learning_rate": 2e-05, + "loss": 0.4939, + "loss/crossentropy": 2.2228282690048218, + "loss/hidden": 0.2099609375, + "loss/logits": 0.023817350156605244, + "loss/reg": 0.026017041876912117, + "step": 365 + }, + { + "epoch": 0.183, + "grad_norm": 1.2542800903320312, + "grad_norm_var": 2.5338262674117384, + "learning_rate": 2e-05, + "loss": 0.5238, + "loss/crossentropy": 2.228682041168213, + "loss/hidden": 0.2275390625, + "loss/logits": 0.03613369073718786, + "loss/reg": 0.026015128940343857, + "step": 366 + }, + { + "epoch": 0.1835, + "grad_norm": 1.2646586894989014, + "grad_norm_var": 2.548319005158211, + "learning_rate": 2e-05, + "loss": 0.4777, + "loss/crossentropy": 2.3304578065872192, + "loss/hidden": 0.1875, + "loss/logits": 0.030102724209427834, + "loss/reg": 0.026013074442744255, + "step": 367 + }, + { + "epoch": 0.184, + "grad_norm": 1.0247364044189453, + "grad_norm_var": 2.5587148751606645, + "learning_rate": 2e-05, + "loss": 0.4394, + "loss/crossentropy": 2.5192004442214966, + "loss/hidden": 0.1572265625, + "loss/logits": 0.022060595452785492, + "loss/reg": 0.026011094450950623, + "step": 368 + }, + { + "epoch": 0.1845, + "grad_norm": 1.4839156866073608, + "grad_norm_var": 2.5607612202403485, + "learning_rate": 2e-05, + "loss": 0.5208, + "loss/crossentropy": 2.1315367221832275, + "loss/hidden": 0.22705078125, + "loss/logits": 0.03365152329206467, + "loss/reg": 0.02600909397006035, + "step": 369 + }, + { + "epoch": 0.185, + "grad_norm": 1.2327549457550049, + "grad_norm_var": 2.5681585055774634, + "learning_rate": 2e-05, + "loss": 0.441, + "loss/crossentropy": 2.418115019798279, + "loss/hidden": 0.158203125, + "loss/logits": 0.022698544897139072, + "loss/reg": 0.026007305830717087, + "step": 370 + }, + { + "epoch": 0.1855, + "grad_norm": 1.2444417476654053, + "grad_norm_var": 2.5709309337522748, + "learning_rate": 2e-05, + "loss": 0.467, + "loss/crossentropy": 2.2915507555007935, + "loss/hidden": 0.1806640625, + "loss/logits": 0.026262402534484863, + "loss/reg": 0.02600536122918129, + "step": 371 + }, + { + "epoch": 0.186, + "grad_norm": 1.2689179182052612, + "grad_norm_var": 0.0472904244491535, + "learning_rate": 2e-05, + "loss": 0.4999, + "loss/crossentropy": 2.518853783607483, + "loss/hidden": 0.21435546875, + "loss/logits": 0.025527067482471466, + "loss/reg": 0.026003584265708923, + "step": 372 + }, + { + "epoch": 0.1865, + "grad_norm": 1.4123287200927734, + "grad_norm_var": 0.047133962787962426, + "learning_rate": 2e-05, + "loss": 0.4964, + "loss/crossentropy": 2.3583970069885254, + "loss/hidden": 0.2080078125, + "loss/logits": 0.028406362980604172, + "loss/reg": 0.02600177377462387, + "step": 373 + }, + { + "epoch": 0.187, + "grad_norm": 1.3444428443908691, + "grad_norm_var": 0.04714470510582007, + "learning_rate": 2e-05, + "loss": 0.4758, + "loss/crossentropy": 2.250472664833069, + "loss/hidden": 0.1875, + "loss/logits": 0.02829747088253498, + "loss/reg": 0.02599998004734516, + "step": 374 + }, + { + "epoch": 0.1875, + "grad_norm": 1.2682015895843506, + "grad_norm_var": 0.046871804013897095, + "learning_rate": 2e-05, + "loss": 0.5168, + "loss/crossentropy": 2.1512317657470703, + "loss/hidden": 0.2216796875, + "loss/logits": 0.03511458821594715, + "loss/reg": 0.025998059660196304, + "step": 375 + }, + { + "epoch": 0.188, + "grad_norm": 1.2203181982040405, + "grad_norm_var": 0.04527427140258874, + "learning_rate": 2e-05, + "loss": 0.4928, + "loss/crossentropy": 2.4670302867889404, + "loss/hidden": 0.20458984375, + "loss/logits": 0.028278429992496967, + "loss/reg": 0.025996318086981773, + "step": 376 + }, + { + "epoch": 0.1885, + "grad_norm": 1.6124721765518188, + "grad_norm_var": 0.02965817159038971, + "learning_rate": 2e-05, + "loss": 0.5482, + "loss/crossentropy": 2.2732619047164917, + "loss/hidden": 0.24169921875, + "loss/logits": 0.046598936431109905, + "loss/reg": 0.02599457837641239, + "step": 377 + }, + { + "epoch": 0.189, + "grad_norm": 1.2982152700424194, + "grad_norm_var": 0.0282961065281843, + "learning_rate": 2e-05, + "loss": 0.4767, + "loss/crossentropy": 2.362215518951416, + "loss/hidden": 0.189453125, + "loss/logits": 0.02727901004254818, + "loss/reg": 0.025992868468165398, + "step": 378 + }, + { + "epoch": 0.1895, + "grad_norm": 1.4476395845413208, + "grad_norm_var": 0.02916870288535254, + "learning_rate": 2e-05, + "loss": 0.5588, + "loss/crossentropy": 2.1909669637680054, + "loss/hidden": 0.25244140625, + "loss/logits": 0.04647276923060417, + "loss/reg": 0.02599099464714527, + "step": 379 + }, + { + "epoch": 0.19, + "grad_norm": 1.3061769008636475, + "grad_norm_var": 0.025439804416175528, + "learning_rate": 2e-05, + "loss": 0.4942, + "loss/crossentropy": 2.291175603866577, + "loss/hidden": 0.19970703125, + "loss/logits": 0.034558966755867004, + "loss/reg": 0.02598922699689865, + "step": 380 + }, + { + "epoch": 0.1905, + "grad_norm": 1.635046362876892, + "grad_norm_var": 0.02389268741876922, + "learning_rate": 2e-05, + "loss": 0.5255, + "loss/crossentropy": 2.6519399881362915, + "loss/hidden": 0.22705078125, + "loss/logits": 0.0385761484503746, + "loss/reg": 0.02598743885755539, + "step": 381 + }, + { + "epoch": 0.191, + "grad_norm": 1.4028866291046143, + "grad_norm_var": 0.023724865257600848, + "learning_rate": 2e-05, + "loss": 0.4627, + "loss/crossentropy": 2.4420300722122192, + "loss/hidden": 0.17724609375, + "loss/logits": 0.025584472343325615, + "loss/reg": 0.025985730811953545, + "step": 382 + }, + { + "epoch": 0.1915, + "grad_norm": 3.43645977973938, + "grad_norm_var": 0.29621158197049285, + "learning_rate": 2e-05, + "loss": 0.509, + "loss/crossentropy": 2.3477495908737183, + "loss/hidden": 0.216796875, + "loss/logits": 0.03231562860310078, + "loss/reg": 0.02598407492041588, + "step": 383 + }, + { + "epoch": 0.192, + "grad_norm": 1.156148076057434, + "grad_norm_var": 0.28935891803297004, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.4156278371810913, + "loss/hidden": 0.17822265625, + "loss/logits": 0.0316650066524744, + "loss/reg": 0.0259822029620409, + "step": 384 + }, + { + "epoch": 0.1925, + "grad_norm": 1.7708622217178345, + "grad_norm_var": 0.2944387889021565, + "learning_rate": 2e-05, + "loss": 0.5722, + "loss/crossentropy": 2.334781527519226, + "loss/hidden": 0.2587890625, + "loss/logits": 0.05361687205731869, + "loss/reg": 0.025980478152632713, + "step": 385 + }, + { + "epoch": 0.193, + "grad_norm": 2.3118906021118164, + "grad_norm_var": 0.3282542563962823, + "learning_rate": 2e-05, + "loss": 0.5078, + "loss/crossentropy": 2.3158434629440308, + "loss/hidden": 0.21875, + "loss/logits": 0.029261935502290726, + "loss/reg": 0.02597857639193535, + "step": 386 + }, + { + "epoch": 0.1935, + "grad_norm": 2.3003060817718506, + "grad_norm_var": 0.3519549073980194, + "learning_rate": 2e-05, + "loss": 0.5687, + "loss/crossentropy": 2.4877541065216064, + "loss/hidden": 0.2783203125, + "loss/logits": 0.03062661923468113, + "loss/reg": 0.02597683109343052, + "step": 387 + }, + { + "epoch": 0.194, + "grad_norm": 1.1055262088775635, + "grad_norm_var": 0.3616427614209148, + "learning_rate": 2e-05, + "loss": 0.5022, + "loss/crossentropy": 2.312312960624695, + "loss/hidden": 0.20947265625, + "loss/logits": 0.03301689215004444, + "loss/reg": 0.025975055992603302, + "step": 388 + }, + { + "epoch": 0.1945, + "grad_norm": 1.2821520566940308, + "grad_norm_var": 0.3664245697624286, + "learning_rate": 2e-05, + "loss": 0.461, + "loss/crossentropy": 2.3373734951019287, + "loss/hidden": 0.17431640625, + "loss/logits": 0.026956655085086823, + "loss/reg": 0.02597302943468094, + "step": 389 + }, + { + "epoch": 0.195, + "grad_norm": 1.158923625946045, + "grad_norm_var": 0.37535894838822137, + "learning_rate": 2e-05, + "loss": 0.4653, + "loss/crossentropy": 2.3373029232025146, + "loss/hidden": 0.1787109375, + "loss/logits": 0.02689830120652914, + "loss/reg": 0.025970980525016785, + "step": 390 + }, + { + "epoch": 0.1955, + "grad_norm": 4.394406318664551, + "grad_norm_var": 0.8449288503424893, + "learning_rate": 2e-05, + "loss": 0.621, + "loss/crossentropy": 2.483940362930298, + "loss/hidden": 0.22021484375, + "loss/logits": 0.14109261147677898, + "loss/reg": 0.025968806818127632, + "step": 391 + }, + { + "epoch": 0.196, + "grad_norm": 1.3220263719558716, + "grad_norm_var": 0.837680848201209, + "learning_rate": 2e-05, + "loss": 0.571, + "loss/crossentropy": 2.3968313932418823, + "loss/hidden": 0.26318359375, + "loss/logits": 0.04814390931278467, + "loss/reg": 0.02596699632704258, + "step": 392 + }, + { + "epoch": 0.1965, + "grad_norm": 1.476704478263855, + "grad_norm_var": 0.8423872820531374, + "learning_rate": 2e-05, + "loss": 0.4767, + "loss/crossentropy": 2.6665027141571045, + "loss/hidden": 0.18115234375, + "loss/logits": 0.03589140065014362, + "loss/reg": 0.025965221226215363, + "step": 393 + }, + { + "epoch": 0.197, + "grad_norm": 1.3346498012542725, + "grad_norm_var": 0.8400309797725388, + "learning_rate": 2e-05, + "loss": 0.4879, + "loss/crossentropy": 2.550223231315613, + "loss/hidden": 0.189453125, + "loss/logits": 0.03877757303416729, + "loss/reg": 0.025963468477129936, + "step": 394 + }, + { + "epoch": 0.1975, + "grad_norm": 1.381104826927185, + "grad_norm_var": 0.8434567338089674, + "learning_rate": 2e-05, + "loss": 0.4542, + "loss/crossentropy": 2.3325024843215942, + "loss/hidden": 0.16259765625, + "loss/logits": 0.03194664418697357, + "loss/reg": 0.02596171200275421, + "step": 395 + }, + { + "epoch": 0.198, + "grad_norm": 1.517006516456604, + "grad_norm_var": 0.8323965808807104, + "learning_rate": 2e-05, + "loss": 0.5682, + "loss/crossentropy": 2.144330859184265, + "loss/hidden": 0.2607421875, + "loss/logits": 0.04788592271506786, + "loss/reg": 0.025959979742765427, + "step": 396 + }, + { + "epoch": 0.1985, + "grad_norm": 1.0620001554489136, + "grad_norm_var": 0.8664126262366226, + "learning_rate": 2e-05, + "loss": 0.4494, + "loss/crossentropy": 2.410404920578003, + "loss/hidden": 0.162109375, + "loss/logits": 0.027681468054652214, + "loss/reg": 0.025957921519875526, + "step": 397 + }, + { + "epoch": 0.199, + "grad_norm": 1.4343640804290771, + "grad_norm_var": 0.8649093715486228, + "learning_rate": 2e-05, + "loss": 0.5069, + "loss/crossentropy": 2.295978307723999, + "loss/hidden": 0.21142578125, + "loss/logits": 0.035917842760682106, + "loss/reg": 0.025956083089113235, + "step": 398 + }, + { + "epoch": 0.1995, + "grad_norm": 1.325333595275879, + "grad_norm_var": 0.676572657470614, + "learning_rate": 2e-05, + "loss": 0.4739, + "loss/crossentropy": 2.3820759057998657, + "loss/hidden": 0.18603515625, + "loss/logits": 0.028354477137327194, + "loss/reg": 0.025954021140933037, + "step": 399 + }, + { + "epoch": 0.2, + "grad_norm": 2.790135622024536, + "grad_norm_var": 0.736756106069653, + "learning_rate": 2e-05, + "loss": 0.5812, + "loss/crossentropy": 2.283258855342865, + "loss/hidden": 0.26953125, + "loss/logits": 0.05216490104794502, + "loss/reg": 0.02595207281410694, + "step": 400 + }, + { + "epoch": 0.2005, + "grad_norm": 2.0463712215423584, + "grad_norm_var": 0.7423414092941923, + "learning_rate": 2e-05, + "loss": 0.5143, + "loss/crossentropy": 2.5675315856933594, + "loss/hidden": 0.21728515625, + "loss/logits": 0.037514453753829, + "loss/reg": 0.025950025767087936, + "step": 401 + }, + { + "epoch": 0.201, + "grad_norm": 1.8808186054229736, + "grad_norm_var": 0.7225325442870276, + "learning_rate": 2e-05, + "loss": 0.463, + "loss/crossentropy": 2.3908499479293823, + "loss/hidden": 0.17724609375, + "loss/logits": 0.026285232976078987, + "loss/reg": 0.02594805508852005, + "step": 402 + }, + { + "epoch": 0.2015, + "grad_norm": 1.2097140550613403, + "grad_norm_var": 0.7151380800453793, + "learning_rate": 2e-05, + "loss": 0.4623, + "loss/crossentropy": 2.3907727003097534, + "loss/hidden": 0.1708984375, + "loss/logits": 0.031923141330480576, + "loss/reg": 0.025945995002985, + "step": 403 + }, + { + "epoch": 0.202, + "grad_norm": 1.301154613494873, + "grad_norm_var": 0.7028043528110918, + "learning_rate": 2e-05, + "loss": 0.4544, + "loss/crossentropy": 2.606261968612671, + "loss/hidden": 0.16650390625, + "loss/logits": 0.02846657857298851, + "loss/reg": 0.02594408206641674, + "step": 404 + }, + { + "epoch": 0.2025, + "grad_norm": 1.1995950937271118, + "grad_norm_var": 0.7076350429627898, + "learning_rate": 2e-05, + "loss": 0.4391, + "loss/crossentropy": 2.3680388927459717, + "loss/hidden": 0.15673828125, + "loss/logits": 0.022917790338397026, + "loss/reg": 0.025941966101527214, + "step": 405 + }, + { + "epoch": 0.203, + "grad_norm": 8.632776260375977, + "grad_norm_var": 3.6823756133748495, + "learning_rate": 2e-05, + "loss": 1.2499, + "loss/crossentropy": 2.4126373529434204, + "loss/hidden": 0.72021484375, + "loss/logits": 0.270312886685133, + "loss/reg": 0.025939757004380226, + "step": 406 + }, + { + "epoch": 0.2035, + "grad_norm": 1.2286854982376099, + "grad_norm_var": 3.358959418903309, + "learning_rate": 2e-05, + "loss": 0.4674, + "loss/crossentropy": 2.290730118751526, + "loss/hidden": 0.17822265625, + "loss/logits": 0.029834291897714138, + "loss/reg": 0.025937531143426895, + "step": 407 + }, + { + "epoch": 0.204, + "grad_norm": 1.3198645114898682, + "grad_norm_var": 3.359139686229141, + "learning_rate": 2e-05, + "loss": 0.4864, + "loss/crossentropy": 2.424551844596863, + "loss/hidden": 0.1953125, + "loss/logits": 0.03177413158118725, + "loss/reg": 0.025935430079698563, + "step": 408 + }, + { + "epoch": 0.2045, + "grad_norm": 1.1165919303894043, + "grad_norm_var": 3.389790819966483, + "learning_rate": 2e-05, + "loss": 0.5246, + "loss/crossentropy": 2.2340330481529236, + "loss/hidden": 0.22509765625, + "loss/logits": 0.04018213599920273, + "loss/reg": 0.02593357115983963, + "step": 409 + }, + { + "epoch": 0.205, + "grad_norm": 1.4326255321502686, + "grad_norm_var": 3.382694967184837, + "learning_rate": 2e-05, + "loss": 0.4669, + "loss/crossentropy": 2.4408915042877197, + "loss/hidden": 0.17822265625, + "loss/logits": 0.02933754399418831, + "loss/reg": 0.0259317085146904, + "step": 410 + }, + { + "epoch": 0.2055, + "grad_norm": 1.4279175996780396, + "grad_norm_var": 3.379406616020368, + "learning_rate": 2e-05, + "loss": 0.4722, + "loss/crossentropy": 2.398142695426941, + "loss/hidden": 0.18359375, + "loss/logits": 0.02931864559650421, + "loss/reg": 0.02592984400689602, + "step": 411 + }, + { + "epoch": 0.206, + "grad_norm": 3.493486166000366, + "grad_norm_var": 3.5139842381908477, + "learning_rate": 2e-05, + "loss": 0.5326, + "loss/crossentropy": 2.3516749143600464, + "loss/hidden": 0.17431640625, + "loss/logits": 0.0990044642239809, + "loss/reg": 0.025927875190973282, + "step": 412 + }, + { + "epoch": 0.2065, + "grad_norm": 1.1578741073608398, + "grad_norm_var": 3.5018478922430516, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.4094560146331787, + "loss/hidden": 0.16845703125, + "loss/logits": 0.025964444503188133, + "loss/reg": 0.0259258896112442, + "step": 413 + }, + { + "epoch": 0.207, + "grad_norm": 1.1830717325210571, + "grad_norm_var": 3.5268350962116277, + "learning_rate": 2e-05, + "loss": 0.4649, + "loss/crossentropy": 2.5271745920181274, + "loss/hidden": 0.1650390625, + "loss/logits": 0.04061662219464779, + "loss/reg": 0.02592400461435318, + "step": 414 + }, + { + "epoch": 0.2075, + "grad_norm": 1.8626422882080078, + "grad_norm_var": 3.493204661138373, + "learning_rate": 2e-05, + "loss": 0.5312, + "loss/crossentropy": 2.4808900356292725, + "loss/hidden": 0.23193359375, + "loss/logits": 0.040002613328397274, + "loss/reg": 0.025921940803527832, + "step": 415 + }, + { + "epoch": 0.208, + "grad_norm": 1.3317103385925293, + "grad_norm_var": 3.4880922061323294, + "learning_rate": 2e-05, + "loss": 0.4776, + "loss/crossentropy": 2.2604642510414124, + "loss/hidden": 0.1884765625, + "loss/logits": 0.029966252855956554, + "loss/reg": 0.0259199608117342, + "step": 416 + }, + { + "epoch": 0.2085, + "grad_norm": 1.5000430345535278, + "grad_norm_var": 3.502571821664609, + "learning_rate": 2e-05, + "loss": 0.4815, + "loss/crossentropy": 2.813089966773987, + "loss/hidden": 0.1875, + "loss/logits": 0.0347793884575367, + "loss/reg": 0.025918107479810715, + "step": 417 + }, + { + "epoch": 0.209, + "grad_norm": 2.0012242794036865, + "grad_norm_var": 3.502288435747643, + "learning_rate": 2e-05, + "loss": 0.5129, + "loss/crossentropy": 2.4542036056518555, + "loss/hidden": 0.21923828125, + "loss/logits": 0.034517631866037846, + "loss/reg": 0.025916218757629395, + "step": 418 + }, + { + "epoch": 0.2095, + "grad_norm": 1.627580165863037, + "grad_norm_var": 3.4712634219787057, + "learning_rate": 2e-05, + "loss": 0.5077, + "loss/crossentropy": 2.2533979415893555, + "loss/hidden": 0.2109375, + "loss/logits": 0.0376081969588995, + "loss/reg": 0.02591414749622345, + "step": 419 + }, + { + "epoch": 0.21, + "grad_norm": 1.2047749757766724, + "grad_norm_var": 3.480677477073349, + "learning_rate": 2e-05, + "loss": 0.4928, + "loss/crossentropy": 2.3260152339935303, + "loss/hidden": 0.19873046875, + "loss/logits": 0.03491301275789738, + "loss/reg": 0.02591288462281227, + "step": 420 + }, + { + "epoch": 0.2105, + "grad_norm": 1.4506651163101196, + "grad_norm_var": 3.4584077400506277, + "learning_rate": 2e-05, + "loss": 0.4793, + "loss/crossentropy": 2.378847122192383, + "loss/hidden": 0.18212890625, + "loss/logits": 0.038035670295357704, + "loss/reg": 0.02591102570295334, + "step": 421 + }, + { + "epoch": 0.211, + "grad_norm": 1.5155658721923828, + "grad_norm_var": 0.3283885764687509, + "learning_rate": 2e-05, + "loss": 0.4815, + "loss/crossentropy": 2.5136163234710693, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03193356655538082, + "loss/reg": 0.025909241288900375, + "step": 422 + }, + { + "epoch": 0.2115, + "grad_norm": 1.1012701988220215, + "grad_norm_var": 0.3349196404218929, + "learning_rate": 2e-05, + "loss": 0.4696, + "loss/crossentropy": 2.4695777893066406, + "loss/hidden": 0.18115234375, + "loss/logits": 0.02937779761850834, + "loss/reg": 0.025907844305038452, + "step": 423 + }, + { + "epoch": 0.212, + "grad_norm": 1.069792628288269, + "grad_norm_var": 0.3463492066639103, + "learning_rate": 2e-05, + "loss": 0.4535, + "loss/crossentropy": 2.4616788625717163, + "loss/hidden": 0.169921875, + "loss/logits": 0.024515327997505665, + "loss/reg": 0.025905968621373177, + "step": 424 + }, + { + "epoch": 0.2125, + "grad_norm": 1.3047641515731812, + "grad_norm_var": 0.3381949619476723, + "learning_rate": 2e-05, + "loss": 0.4452, + "loss/crossentropy": 2.3210073709487915, + "loss/hidden": 0.1640625, + "loss/logits": 0.022073786705732346, + "loss/reg": 0.025904452428221703, + "step": 425 + }, + { + "epoch": 0.213, + "grad_norm": 1.2246006727218628, + "grad_norm_var": 0.34392116884757395, + "learning_rate": 2e-05, + "loss": 0.439, + "loss/crossentropy": 2.378546953201294, + "loss/hidden": 0.1572265625, + "loss/logits": 0.022725941613316536, + "loss/reg": 0.025902574881911278, + "step": 426 + }, + { + "epoch": 0.2135, + "grad_norm": 1.0174260139465332, + "grad_norm_var": 0.35996108865228293, + "learning_rate": 2e-05, + "loss": 0.4365, + "loss/crossentropy": 2.5196746587753296, + "loss/hidden": 0.1552734375, + "loss/logits": 0.02219019364565611, + "loss/reg": 0.025900712236762047, + "step": 427 + }, + { + "epoch": 0.214, + "grad_norm": 1.0695523023605347, + "grad_norm_var": 0.08383900724757207, + "learning_rate": 2e-05, + "loss": 0.4605, + "loss/crossentropy": 2.321962356567383, + "loss/hidden": 0.17578125, + "loss/logits": 0.025690771639347076, + "loss/reg": 0.025898825377225876, + "step": 428 + }, + { + "epoch": 0.2145, + "grad_norm": 2.6451644897460938, + "grad_norm_var": 0.183711866568535, + "learning_rate": 2e-05, + "loss": 0.609, + "loss/crossentropy": 2.2449337244033813, + "loss/hidden": 0.28857421875, + "loss/logits": 0.061411263421177864, + "loss/reg": 0.02589711733162403, + "step": 429 + }, + { + "epoch": 0.215, + "grad_norm": 1.713813304901123, + "grad_norm_var": 0.1828266836214993, + "learning_rate": 2e-05, + "loss": 0.5079, + "loss/crossentropy": 2.4451547861099243, + "loss/hidden": 0.21533203125, + "loss/logits": 0.033602748066186905, + "loss/reg": 0.025895224884152412, + "step": 430 + }, + { + "epoch": 0.2155, + "grad_norm": 1.0962016582489014, + "grad_norm_var": 0.1801864102764767, + "learning_rate": 2e-05, + "loss": 0.4605, + "loss/crossentropy": 2.357746958732605, + "loss/hidden": 0.17578125, + "loss/logits": 0.025804596953094006, + "loss/reg": 0.025893518701195717, + "step": 431 + }, + { + "epoch": 0.216, + "grad_norm": 4.617275238037109, + "grad_norm_var": 0.8119718727911261, + "learning_rate": 2e-05, + "loss": 0.8169, + "loss/crossentropy": 2.3531649112701416, + "loss/hidden": 0.4423828125, + "loss/logits": 0.11559372302144766, + "loss/reg": 0.025891879573464394, + "step": 432 + }, + { + "epoch": 0.2165, + "grad_norm": 0.9944002032279968, + "grad_norm_var": 0.8370490047784728, + "learning_rate": 2e-05, + "loss": 0.4543, + "loss/crossentropy": 2.474943161010742, + "loss/hidden": 0.1708984375, + "loss/logits": 0.024509361945092678, + "loss/reg": 0.02588999830186367, + "step": 433 + }, + { + "epoch": 0.217, + "grad_norm": 1.6105306148529053, + "grad_norm_var": 0.8258643739880323, + "learning_rate": 2e-05, + "loss": 0.5185, + "loss/crossentropy": 2.1941992044448853, + "loss/hidden": 0.22705078125, + "loss/logits": 0.03256234619766474, + "loss/reg": 0.025888269767165184, + "step": 434 + }, + { + "epoch": 0.2175, + "grad_norm": 1.2945845127105713, + "grad_norm_var": 0.8306360972250484, + "learning_rate": 2e-05, + "loss": 0.4599, + "loss/crossentropy": 2.3870290517807007, + "loss/hidden": 0.1748046875, + "loss/logits": 0.026204396039247513, + "loss/reg": 0.025886395946145058, + "step": 435 + }, + { + "epoch": 0.218, + "grad_norm": 1.646968126296997, + "grad_norm_var": 0.8220224189189824, + "learning_rate": 2e-05, + "loss": 0.5252, + "loss/crossentropy": 2.1381598711013794, + "loss/hidden": 0.22705078125, + "loss/logits": 0.03925580158829689, + "loss/reg": 0.025884483009576797, + "step": 436 + }, + { + "epoch": 0.2185, + "grad_norm": 1.2482175827026367, + "grad_norm_var": 0.8282312987681753, + "learning_rate": 2e-05, + "loss": 0.4699, + "loss/crossentropy": 2.4693511724472046, + "loss/hidden": 0.18359375, + "loss/logits": 0.027454238384962082, + "loss/reg": 0.025882598012685776, + "step": 437 + }, + { + "epoch": 0.219, + "grad_norm": 1.6927727460861206, + "grad_norm_var": 0.828833769560893, + "learning_rate": 2e-05, + "loss": 0.6173, + "loss/crossentropy": 2.264186978340149, + "loss/hidden": 0.30419921875, + "loss/logits": 0.05433515552431345, + "loss/reg": 0.025880809873342514, + "step": 438 + }, + { + "epoch": 0.2195, + "grad_norm": 1.3382420539855957, + "grad_norm_var": 0.8170844633700326, + "learning_rate": 2e-05, + "loss": 0.4594, + "loss/crossentropy": 2.4065046310424805, + "loss/hidden": 0.16943359375, + "loss/logits": 0.03115204442292452, + "loss/reg": 0.025879191234707832, + "step": 439 + }, + { + "epoch": 0.22, + "grad_norm": 1.280760407447815, + "grad_norm_var": 0.8049795437588958, + "learning_rate": 2e-05, + "loss": 0.4746, + "loss/crossentropy": 2.4252418279647827, + "loss/hidden": 0.18701171875, + "loss/logits": 0.028856026008725166, + "loss/reg": 0.025877289474010468, + "step": 440 + }, + { + "epoch": 0.2205, + "grad_norm": 1.1407486200332642, + "grad_norm_var": 0.8133841973004384, + "learning_rate": 2e-05, + "loss": 0.4791, + "loss/crossentropy": 2.264625906944275, + "loss/hidden": 0.19091796875, + "loss/logits": 0.029384871013462543, + "loss/reg": 0.02587556093931198, + "step": 441 + }, + { + "epoch": 0.221, + "grad_norm": 1.1531625986099243, + "grad_norm_var": 0.8172974757844712, + "learning_rate": 2e-05, + "loss": 0.4645, + "loss/crossentropy": 2.3819206953048706, + "loss/hidden": 0.16015625, + "loss/logits": 0.045620132237672806, + "loss/reg": 0.025873858481645584, + "step": 442 + }, + { + "epoch": 0.2215, + "grad_norm": 1.2209059000015259, + "grad_norm_var": 0.8041477490589808, + "learning_rate": 2e-05, + "loss": 0.4922, + "loss/crossentropy": 2.2260149717330933, + "loss/hidden": 0.19775390625, + "loss/logits": 0.03575233928859234, + "loss/reg": 0.025872183963656425, + "step": 443 + }, + { + "epoch": 0.222, + "grad_norm": 1.4616377353668213, + "grad_norm_var": 0.7854915962697572, + "learning_rate": 2e-05, + "loss": 0.4759, + "loss/crossentropy": 2.326699376106262, + "loss/hidden": 0.1875, + "loss/logits": 0.02973231580108404, + "loss/reg": 0.02587028034031391, + "step": 444 + }, + { + "epoch": 0.2225, + "grad_norm": 1.1616874933242798, + "grad_norm_var": 0.7231711161910169, + "learning_rate": 2e-05, + "loss": 0.477, + "loss/crossentropy": 2.3070465326309204, + "loss/hidden": 0.18701171875, + "loss/logits": 0.03132193721830845, + "loss/reg": 0.02586846426129341, + "step": 445 + }, + { + "epoch": 0.223, + "grad_norm": 1.1598429679870605, + "grad_norm_var": 0.7296602944536337, + "learning_rate": 2e-05, + "loss": 0.4613, + "loss/crossentropy": 2.353983521461487, + "loss/hidden": 0.171875, + "loss/logits": 0.030772192403674126, + "loss/reg": 0.025866517797112465, + "step": 446 + }, + { + "epoch": 0.2235, + "grad_norm": 1.3874998092651367, + "grad_norm_var": 0.7189939859161824, + "learning_rate": 2e-05, + "loss": 0.5004, + "loss/crossentropy": 2.47870934009552, + "loss/hidden": 0.19970703125, + "loss/logits": 0.04206428676843643, + "loss/reg": 0.025864504277706146, + "step": 447 + }, + { + "epoch": 0.224, + "grad_norm": 2.931767463684082, + "grad_norm_var": 0.2017417237659708, + "learning_rate": 2e-05, + "loss": 0.8347, + "loss/crossentropy": 2.5819171667099, + "loss/hidden": 0.40234375, + "loss/logits": 0.1736808605492115, + "loss/reg": 0.025862593203783035, + "step": 448 + }, + { + "epoch": 0.2245, + "grad_norm": 1.3012363910675049, + "grad_norm_var": 0.19020454457909727, + "learning_rate": 2e-05, + "loss": 0.4471, + "loss/crossentropy": 2.3639878034591675, + "loss/hidden": 0.16162109375, + "loss/logits": 0.026854592375457287, + "loss/reg": 0.025860626250505447, + "step": 449 + }, + { + "epoch": 0.225, + "grad_norm": 1.6675218343734741, + "grad_norm_var": 0.19170785847398542, + "learning_rate": 2e-05, + "loss": 0.5328, + "loss/crossentropy": 2.3093976974487305, + "loss/hidden": 0.22119140625, + "loss/logits": 0.05306573584675789, + "loss/reg": 0.025858718901872635, + "step": 450 + }, + { + "epoch": 0.2255, + "grad_norm": 1.4241790771484375, + "grad_norm_var": 0.19019349759976567, + "learning_rate": 2e-05, + "loss": 0.4946, + "loss/crossentropy": 2.3994137048721313, + "loss/hidden": 0.2060546875, + "loss/logits": 0.029971184208989143, + "loss/reg": 0.02585672214627266, + "step": 451 + }, + { + "epoch": 0.226, + "grad_norm": 2.4200472831726074, + "grad_norm_var": 0.24773914499477828, + "learning_rate": 2e-05, + "loss": 0.4804, + "loss/crossentropy": 2.349528431892395, + "loss/hidden": 0.19482421875, + "loss/logits": 0.02699958346784115, + "loss/reg": 0.02585473842918873, + "step": 452 + }, + { + "epoch": 0.2265, + "grad_norm": 1.597848653793335, + "grad_norm_var": 0.24367026793009636, + "learning_rate": 2e-05, + "loss": 0.4561, + "loss/crossentropy": 2.2601643800735474, + "loss/hidden": 0.17041015625, + "loss/logits": 0.027189917862415314, + "loss/reg": 0.025852810591459274, + "step": 453 + }, + { + "epoch": 0.227, + "grad_norm": 1.4791680574417114, + "grad_norm_var": 0.24163663071934274, + "learning_rate": 2e-05, + "loss": 0.5426, + "loss/crossentropy": 2.184678077697754, + "loss/hidden": 0.2412109375, + "loss/logits": 0.04290330223739147, + "loss/reg": 0.025850806385278702, + "step": 454 + }, + { + "epoch": 0.2275, + "grad_norm": 1.54658842086792, + "grad_norm_var": 0.2396368776147885, + "learning_rate": 2e-05, + "loss": 0.495, + "loss/crossentropy": 2.427361249923706, + "loss/hidden": 0.203125, + "loss/logits": 0.03336675837635994, + "loss/reg": 0.025848930701613426, + "step": 455 + }, + { + "epoch": 0.228, + "grad_norm": 1.0083175897598267, + "grad_norm_var": 0.2529996468682663, + "learning_rate": 2e-05, + "loss": 0.51, + "loss/crossentropy": 2.1102696657180786, + "loss/hidden": 0.21484375, + "loss/logits": 0.03667537495493889, + "loss/reg": 0.025847142562270164, + "step": 456 + }, + { + "epoch": 0.2285, + "grad_norm": 1.2189358472824097, + "grad_norm_var": 0.2495960410376004, + "learning_rate": 2e-05, + "loss": 0.5198, + "loss/crossentropy": 2.3419077396392822, + "loss/hidden": 0.2275390625, + "loss/logits": 0.03385118395090103, + "loss/reg": 0.025845320895314217, + "step": 457 + }, + { + "epoch": 0.229, + "grad_norm": 1.3370299339294434, + "grad_norm_var": 0.24299100458264036, + "learning_rate": 2e-05, + "loss": 0.5765, + "loss/crossentropy": 1.9379181265830994, + "loss/hidden": 0.27734375, + "loss/logits": 0.0406951867043972, + "loss/reg": 0.025843370705842972, + "step": 458 + }, + { + "epoch": 0.2295, + "grad_norm": 1.1177793741226196, + "grad_norm_var": 0.24777192368354406, + "learning_rate": 2e-05, + "loss": 0.4456, + "loss/crossentropy": 2.4968008995056152, + "loss/hidden": 0.158203125, + "loss/logits": 0.02897755615413189, + "loss/reg": 0.02584136091172695, + "step": 459 + }, + { + "epoch": 0.23, + "grad_norm": 1.4649550914764404, + "grad_norm_var": 0.24774953141933906, + "learning_rate": 2e-05, + "loss": 0.4868, + "loss/crossentropy": 2.2792553901672363, + "loss/hidden": 0.193359375, + "loss/logits": 0.035073790699243546, + "loss/reg": 0.025839168578386307, + "step": 460 + }, + { + "epoch": 0.2305, + "grad_norm": 2.2920172214508057, + "grad_norm_var": 0.2745013047452227, + "learning_rate": 2e-05, + "loss": 0.621, + "loss/crossentropy": 2.418063998222351, + "loss/hidden": 0.29736328125, + "loss/logits": 0.06528288684785366, + "loss/reg": 0.025837266817688942, + "step": 461 + }, + { + "epoch": 0.231, + "grad_norm": 1.5773580074310303, + "grad_norm_var": 0.2617466213370638, + "learning_rate": 2e-05, + "loss": 0.5073, + "loss/crossentropy": 2.2258150577545166, + "loss/hidden": 0.2138671875, + "loss/logits": 0.03509692847728729, + "loss/reg": 0.025835072621703148, + "step": 462 + }, + { + "epoch": 0.2315, + "grad_norm": 1.6675801277160645, + "grad_norm_var": 0.2583117846520134, + "learning_rate": 2e-05, + "loss": 0.5136, + "loss/crossentropy": 2.485268235206604, + "loss/hidden": 0.22412109375, + "loss/logits": 0.031180618330836296, + "loss/reg": 0.025833170861005783, + "step": 463 + }, + { + "epoch": 0.232, + "grad_norm": 1.4492632150650024, + "grad_norm_var": 0.13801685370827765, + "learning_rate": 2e-05, + "loss": 0.4973, + "loss/crossentropy": 2.257757544517517, + "loss/hidden": 0.20703125, + "loss/logits": 0.03195131104439497, + "loss/reg": 0.025831099599599838, + "step": 464 + }, + { + "epoch": 0.2325, + "grad_norm": 1.2513377666473389, + "grad_norm_var": 0.1397318210080442, + "learning_rate": 2e-05, + "loss": 0.4871, + "loss/crossentropy": 2.425398826599121, + "loss/hidden": 0.1953125, + "loss/logits": 0.0334627740085125, + "loss/reg": 0.025828994810581207, + "step": 465 + }, + { + "epoch": 0.233, + "grad_norm": 1.3394722938537598, + "grad_norm_var": 0.14055180736722211, + "learning_rate": 2e-05, + "loss": 0.4586, + "loss/crossentropy": 2.373073101043701, + "loss/hidden": 0.171875, + "loss/logits": 0.02840618882328272, + "loss/reg": 0.025827039033174515, + "step": 466 + }, + { + "epoch": 0.2335, + "grad_norm": 1.447240948677063, + "grad_norm_var": 0.14031502946211252, + "learning_rate": 2e-05, + "loss": 0.5055, + "loss/crossentropy": 2.207805633544922, + "loss/hidden": 0.21337890625, + "loss/logits": 0.03383456543087959, + "loss/reg": 0.025825195014476776, + "step": 467 + }, + { + "epoch": 0.234, + "grad_norm": 2.395975351333618, + "grad_norm_var": 0.13744138699080868, + "learning_rate": 2e-05, + "loss": 0.5234, + "loss/crossentropy": 2.323424220085144, + "loss/hidden": 0.232421875, + "loss/logits": 0.03275643941015005, + "loss/reg": 0.02582353726029396, + "step": 468 + }, + { + "epoch": 0.2345, + "grad_norm": 2.5749197006225586, + "grad_norm_var": 0.2083013754485968, + "learning_rate": 2e-05, + "loss": 0.5199, + "loss/crossentropy": 2.2507615089416504, + "loss/hidden": 0.2275390625, + "loss/logits": 0.0341134462505579, + "loss/reg": 0.025821637362241745, + "step": 469 + }, + { + "epoch": 0.235, + "grad_norm": 1.052276611328125, + "grad_norm_var": 0.22503173458550182, + "learning_rate": 2e-05, + "loss": 0.4509, + "loss/crossentropy": 2.543000817298889, + "loss/hidden": 0.16796875, + "loss/logits": 0.024687878787517548, + "loss/reg": 0.025819703936576843, + "step": 470 + }, + { + "epoch": 0.2355, + "grad_norm": 1.2194154262542725, + "grad_norm_var": 0.2317099631068041, + "learning_rate": 2e-05, + "loss": 0.4424, + "loss/crossentropy": 2.2912293672561646, + "loss/hidden": 0.16015625, + "loss/logits": 0.024061255156993866, + "loss/reg": 0.025817908346652985, + "step": 471 + }, + { + "epoch": 0.236, + "grad_norm": 1.3160464763641357, + "grad_norm_var": 0.2163932029026758, + "learning_rate": 2e-05, + "loss": 0.5102, + "loss/crossentropy": 2.2214205265045166, + "loss/hidden": 0.2119140625, + "loss/logits": 0.040074046701192856, + "loss/reg": 0.025816213339567184, + "step": 472 + }, + { + "epoch": 0.2365, + "grad_norm": 1.4499560594558716, + "grad_norm_var": 0.20968210761966072, + "learning_rate": 2e-05, + "loss": 0.465, + "loss/crossentropy": 2.4633235931396484, + "loss/hidden": 0.17626953125, + "loss/logits": 0.030628393404185772, + "loss/reg": 0.025814484804868698, + "step": 473 + }, + { + "epoch": 0.237, + "grad_norm": 1.5795156955718994, + "grad_norm_var": 0.20616303007269987, + "learning_rate": 2e-05, + "loss": 0.4987, + "loss/crossentropy": 2.5177139043807983, + "loss/hidden": 0.21240234375, + "loss/logits": 0.02812807820737362, + "loss/reg": 0.025812778621912003, + "step": 474 + }, + { + "epoch": 0.2375, + "grad_norm": 1.834425449371338, + "grad_norm_var": 0.19460237139374303, + "learning_rate": 2e-05, + "loss": 0.548, + "loss/crossentropy": 2.566522002220154, + "loss/hidden": 0.251953125, + "loss/logits": 0.03790239989757538, + "loss/reg": 0.0258110873401165, + "step": 475 + }, + { + "epoch": 0.238, + "grad_norm": 1.8921895027160645, + "grad_norm_var": 0.19720773265526592, + "learning_rate": 2e-05, + "loss": 0.4895, + "loss/crossentropy": 2.6173768043518066, + "loss/hidden": 0.19873046875, + "loss/logits": 0.0326268021017313, + "loss/reg": 0.025809384882450104, + "step": 476 + }, + { + "epoch": 0.2385, + "grad_norm": 1.4226226806640625, + "grad_norm_var": 0.16958397715451046, + "learning_rate": 2e-05, + "loss": 0.498, + "loss/crossentropy": 2.3383296728134155, + "loss/hidden": 0.20751953125, + "loss/logits": 0.03235785476863384, + "loss/reg": 0.025807524099946022, + "step": 477 + }, + { + "epoch": 0.239, + "grad_norm": 1.141805648803711, + "grad_norm_var": 0.1822821790845537, + "learning_rate": 2e-05, + "loss": 0.4543, + "loss/crossentropy": 2.428423523902893, + "loss/hidden": 0.169921875, + "loss/logits": 0.026319866999983788, + "loss/reg": 0.025805801153182983, + "step": 478 + }, + { + "epoch": 0.2395, + "grad_norm": 1.0476349592208862, + "grad_norm_var": 0.19779294720925691, + "learning_rate": 2e-05, + "loss": 0.4542, + "loss/crossentropy": 2.3026620149612427, + "loss/hidden": 0.16845703125, + "loss/logits": 0.027672583237290382, + "loss/reg": 0.025803864002227783, + "step": 479 + }, + { + "epoch": 0.24, + "grad_norm": 1.3201205730438232, + "grad_norm_var": 0.20015459609518108, + "learning_rate": 2e-05, + "loss": 0.4768, + "loss/crossentropy": 2.4549564123153687, + "loss/hidden": 0.18603515625, + "loss/logits": 0.0327040059491992, + "loss/reg": 0.025801965966820717, + "step": 480 + }, + { + "epoch": 0.2405, + "grad_norm": 2.7316701412200928, + "grad_norm_var": 0.28452048900647253, + "learning_rate": 2e-05, + "loss": 0.6865, + "loss/crossentropy": 2.420086145401001, + "loss/hidden": 0.318359375, + "loss/logits": 0.11013734713196754, + "loss/reg": 0.025800272822380066, + "step": 481 + }, + { + "epoch": 0.241, + "grad_norm": 1.2162243127822876, + "grad_norm_var": 0.28992089783955044, + "learning_rate": 2e-05, + "loss": 0.5028, + "loss/crossentropy": 2.3521331548690796, + "loss/hidden": 0.2177734375, + "loss/logits": 0.027009712532162666, + "loss/reg": 0.025798635557293892, + "step": 482 + }, + { + "epoch": 0.2415, + "grad_norm": 1.079655647277832, + "grad_norm_var": 0.3059815393422553, + "learning_rate": 2e-05, + "loss": 0.4912, + "loss/crossentropy": 2.3484867811203003, + "loss/hidden": 0.19873046875, + "loss/logits": 0.03451688028872013, + "loss/reg": 0.025796744972467422, + "step": 483 + }, + { + "epoch": 0.242, + "grad_norm": 1.3099355697631836, + "grad_norm_var": 0.2614914398097065, + "learning_rate": 2e-05, + "loss": 0.4762, + "loss/crossentropy": 2.4110331535339355, + "loss/hidden": 0.18798828125, + "loss/logits": 0.03024720586836338, + "loss/reg": 0.025794848799705505, + "step": 484 + }, + { + "epoch": 0.2425, + "grad_norm": 1.11648690700531, + "grad_norm_var": 0.1876940743940563, + "learning_rate": 2e-05, + "loss": 0.4663, + "loss/crossentropy": 2.3808969259262085, + "loss/hidden": 0.17919921875, + "loss/logits": 0.02915147691965103, + "loss/reg": 0.025793053209781647, + "step": 485 + }, + { + "epoch": 0.243, + "grad_norm": 1.1367552280426025, + "grad_norm_var": 0.18399111878470176, + "learning_rate": 2e-05, + "loss": 0.4665, + "loss/crossentropy": 2.3539984226226807, + "loss/hidden": 0.1787109375, + "loss/logits": 0.02986688818782568, + "loss/reg": 0.02579127438366413, + "step": 486 + }, + { + "epoch": 0.2435, + "grad_norm": 1.1363672018051147, + "grad_norm_var": 0.18670864710500906, + "learning_rate": 2e-05, + "loss": 0.457, + "loss/crossentropy": 2.5751640796661377, + "loss/hidden": 0.17041015625, + "loss/logits": 0.028702068142592907, + "loss/reg": 0.02578934282064438, + "step": 487 + }, + { + "epoch": 0.244, + "grad_norm": 1.59341299533844, + "grad_norm_var": 0.1876461007769971, + "learning_rate": 2e-05, + "loss": 0.549, + "loss/crossentropy": 2.4539562463760376, + "loss/hidden": 0.248046875, + "loss/logits": 0.043077923357486725, + "loss/reg": 0.02578747272491455, + "step": 488 + }, + { + "epoch": 0.2445, + "grad_norm": 1.3884077072143555, + "grad_norm_var": 0.18778514582004005, + "learning_rate": 2e-05, + "loss": 0.4548, + "loss/crossentropy": 2.4432852268218994, + "loss/hidden": 0.171875, + "loss/logits": 0.0251072458922863, + "loss/reg": 0.025785457342863083, + "step": 489 + }, + { + "epoch": 0.245, + "grad_norm": 1.168309211730957, + "grad_norm_var": 0.1903861218173105, + "learning_rate": 2e-05, + "loss": 0.4505, + "loss/crossentropy": 2.2845112085342407, + "loss/hidden": 0.16162109375, + "loss/logits": 0.031048119068145752, + "loss/reg": 0.025783469900488853, + "step": 490 + }, + { + "epoch": 0.2455, + "grad_norm": 1.2630984783172607, + "grad_norm_var": 0.17834144864876003, + "learning_rate": 2e-05, + "loss": 0.4489, + "loss/crossentropy": 2.357891082763672, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02602921612560749, + "loss/reg": 0.02578144334256649, + "step": 491 + }, + { + "epoch": 0.246, + "grad_norm": 1.0908071994781494, + "grad_norm_var": 0.16298183484375428, + "learning_rate": 2e-05, + "loss": 0.453, + "loss/crossentropy": 2.3261715173721313, + "loss/hidden": 0.15625, + "loss/logits": 0.03899524360895157, + "loss/reg": 0.02577943727374077, + "step": 492 + }, + { + "epoch": 0.2465, + "grad_norm": 1.1049315929412842, + "grad_norm_var": 0.165057508559335, + "learning_rate": 2e-05, + "loss": 0.4612, + "loss/crossentropy": 2.347619652748108, + "loss/hidden": 0.17529296875, + "loss/logits": 0.028116335161030293, + "loss/reg": 0.025777503848075867, + "step": 493 + }, + { + "epoch": 0.247, + "grad_norm": 1.2722063064575195, + "grad_norm_var": 0.1633202153049367, + "learning_rate": 2e-05, + "loss": 0.4791, + "loss/crossentropy": 2.428340435028076, + "loss/hidden": 0.1904296875, + "loss/logits": 0.030900001525878906, + "loss/reg": 0.02577553130686283, + "step": 494 + }, + { + "epoch": 0.2475, + "grad_norm": 1.0491212606430054, + "grad_norm_var": 0.16326816109757653, + "learning_rate": 2e-05, + "loss": 0.4459, + "loss/crossentropy": 2.44633686542511, + "loss/hidden": 0.1630859375, + "loss/logits": 0.02505970373749733, + "loss/reg": 0.025773610919713974, + "step": 495 + }, + { + "epoch": 0.248, + "grad_norm": 1.4577767848968506, + "grad_norm_var": 0.1646181560542212, + "learning_rate": 2e-05, + "loss": 0.5875, + "loss/crossentropy": 2.1383886337280273, + "loss/hidden": 0.2783203125, + "loss/logits": 0.05150237772613764, + "loss/reg": 0.02577175572514534, + "step": 496 + }, + { + "epoch": 0.2485, + "grad_norm": 1.1691800355911255, + "grad_norm_var": 0.023045095234002843, + "learning_rate": 2e-05, + "loss": 0.4948, + "loss/crossentropy": 2.406825542449951, + "loss/hidden": 0.2001953125, + "loss/logits": 0.036906635388731956, + "loss/reg": 0.025769958272576332, + "step": 497 + }, + { + "epoch": 0.249, + "grad_norm": 1.1311383247375488, + "grad_norm_var": 0.023563575455448373, + "learning_rate": 2e-05, + "loss": 0.4495, + "loss/crossentropy": 2.2107361555099487, + "loss/hidden": 0.16943359375, + "loss/logits": 0.02235421910881996, + "loss/reg": 0.025767968967556953, + "step": 498 + }, + { + "epoch": 0.2495, + "grad_norm": 1.2846966981887817, + "grad_norm_var": 0.022443893755450736, + "learning_rate": 2e-05, + "loss": 0.484, + "loss/crossentropy": 2.169008255004883, + "loss/hidden": 0.18798828125, + "loss/logits": 0.03831418417394161, + "loss/reg": 0.025765718892216682, + "step": 499 + }, + { + "epoch": 0.25, + "grad_norm": 1.2635072469711304, + "grad_norm_var": 0.02208093059473833, + "learning_rate": 2e-05, + "loss": 0.5241, + "loss/crossentropy": 2.3311681747436523, + "loss/hidden": 0.228515625, + "loss/logits": 0.037904972210526466, + "loss/reg": 0.025763733312487602, + "step": 500 + }, + { + "epoch": 0.2505, + "grad_norm": 1.8094271421432495, + "grad_norm_var": 0.04191426078620844, + "learning_rate": 2e-05, + "loss": 0.5198, + "loss/crossentropy": 2.18564236164093, + "loss/hidden": 0.22705078125, + "loss/logits": 0.03517603315412998, + "loss/reg": 0.025761688128113747, + "step": 501 + }, + { + "epoch": 0.251, + "grad_norm": 1.4268393516540527, + "grad_norm_var": 0.042022005671670054, + "learning_rate": 2e-05, + "loss": 0.5027, + "loss/crossentropy": 2.3186033964157104, + "loss/hidden": 0.2109375, + "loss/logits": 0.03412244841456413, + "loss/reg": 0.025759579613804817, + "step": 502 + }, + { + "epoch": 0.2515, + "grad_norm": 1.6704895496368408, + "grad_norm_var": 0.04904823070484075, + "learning_rate": 2e-05, + "loss": 0.507, + "loss/crossentropy": 2.444745898246765, + "loss/hidden": 0.21240234375, + "loss/logits": 0.03699003718793392, + "loss/reg": 0.02575748972594738, + "step": 503 + }, + { + "epoch": 0.252, + "grad_norm": 1.4041507244110107, + "grad_norm_var": 0.04442425217177727, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.3990856409072876, + "loss/hidden": 0.166015625, + "loss/logits": 0.030156807973980904, + "loss/reg": 0.025755319744348526, + "step": 504 + }, + { + "epoch": 0.2525, + "grad_norm": 1.5246555805206299, + "grad_norm_var": 0.04701556722156628, + "learning_rate": 2e-05, + "loss": 0.5252, + "loss/crossentropy": 2.551340937614441, + "loss/hidden": 0.23583984375, + "loss/logits": 0.031838640570640564, + "loss/reg": 0.025753194466233253, + "step": 505 + }, + { + "epoch": 0.253, + "grad_norm": 1.8362479209899902, + "grad_norm_var": 0.06155521373344843, + "learning_rate": 2e-05, + "loss": 0.5572, + "loss/crossentropy": 2.1370293498039246, + "loss/hidden": 0.23974609375, + "loss/logits": 0.05993914417922497, + "loss/reg": 0.02575111947953701, + "step": 506 + }, + { + "epoch": 0.2535, + "grad_norm": 1.1423455476760864, + "grad_norm_var": 0.06402495885733686, + "learning_rate": 2e-05, + "loss": 0.4659, + "loss/crossentropy": 2.4107199907302856, + "loss/hidden": 0.1806640625, + "loss/logits": 0.027789254672825336, + "loss/reg": 0.025749139487743378, + "step": 507 + }, + { + "epoch": 0.254, + "grad_norm": 1.2471706867218018, + "grad_norm_var": 0.06010039179400053, + "learning_rate": 2e-05, + "loss": 0.5008, + "loss/crossentropy": 2.2391830682754517, + "loss/hidden": 0.20703125, + "loss/logits": 0.03628289885818958, + "loss/reg": 0.025747055187821388, + "step": 508 + }, + { + "epoch": 0.2545, + "grad_norm": 1.6316094398498535, + "grad_norm_var": 0.059376668774846306, + "learning_rate": 2e-05, + "loss": 0.5185, + "loss/crossentropy": 2.4537373781204224, + "loss/hidden": 0.2294921875, + "loss/logits": 0.031561460345983505, + "loss/reg": 0.02574506774544716, + "step": 509 + }, + { + "epoch": 0.255, + "grad_norm": 1.7221488952636719, + "grad_norm_var": 0.06466089846806326, + "learning_rate": 2e-05, + "loss": 0.5166, + "loss/crossentropy": 2.008660316467285, + "loss/hidden": 0.2294921875, + "loss/logits": 0.029630004428327084, + "loss/reg": 0.02574305608868599, + "step": 510 + }, + { + "epoch": 0.2555, + "grad_norm": 2.063495635986328, + "grad_norm_var": 0.07838236427375302, + "learning_rate": 2e-05, + "loss": 0.6291, + "loss/crossentropy": 2.2193583250045776, + "loss/hidden": 0.3251953125, + "loss/logits": 0.04652561619877815, + "loss/reg": 0.025741035118699074, + "step": 511 + }, + { + "epoch": 0.256, + "grad_norm": 2.1549365520477295, + "grad_norm_var": 0.10608428210922506, + "learning_rate": 2e-05, + "loss": 0.5281, + "loss/crossentropy": 1.9776748418807983, + "loss/hidden": 0.2412109375, + "loss/logits": 0.029531195759773254, + "loss/reg": 0.025739166885614395, + "step": 512 + }, + { + "epoch": 0.2565, + "grad_norm": 2.0352017879486084, + "grad_norm_var": 0.11128044423135464, + "learning_rate": 2e-05, + "loss": 0.5546, + "loss/crossentropy": 2.393476963043213, + "loss/hidden": 0.25341796875, + "loss/logits": 0.043846890330314636, + "loss/reg": 0.02573738433420658, + "step": 513 + }, + { + "epoch": 0.257, + "grad_norm": 1.3759031295776367, + "grad_norm_var": 0.10023724397306069, + "learning_rate": 2e-05, + "loss": 0.5094, + "loss/crossentropy": 2.3285356760025024, + "loss/hidden": 0.21875, + "loss/logits": 0.03326238878071308, + "loss/reg": 0.02573556825518608, + "step": 514 + }, + { + "epoch": 0.2575, + "grad_norm": 2.0449092388153076, + "grad_norm_var": 0.10444321701007618, + "learning_rate": 2e-05, + "loss": 0.5387, + "loss/crossentropy": 2.3776673078536987, + "loss/hidden": 0.22216796875, + "loss/logits": 0.05914916470646858, + "loss/reg": 0.025733835995197296, + "step": 515 + }, + { + "epoch": 0.258, + "grad_norm": 1.2532458305358887, + "grad_norm_var": 0.10497457736165051, + "learning_rate": 2e-05, + "loss": 0.4725, + "loss/crossentropy": 2.5611300468444824, + "loss/hidden": 0.1796875, + "loss/logits": 0.035542636178433895, + "loss/reg": 0.025731824338436127, + "step": 516 + }, + { + "epoch": 0.2585, + "grad_norm": 1.166143774986267, + "grad_norm_var": 0.11685692171310862, + "learning_rate": 2e-05, + "loss": 0.4871, + "loss/crossentropy": 2.292641520500183, + "loss/hidden": 0.19873046875, + "loss/logits": 0.031022757291793823, + "loss/reg": 0.025729816406965256, + "step": 517 + }, + { + "epoch": 0.259, + "grad_norm": 0.9319448471069336, + "grad_norm_var": 0.14400094830482596, + "learning_rate": 2e-05, + "loss": 0.4359, + "loss/crossentropy": 2.2908148765563965, + "loss/hidden": 0.15673828125, + "loss/logits": 0.021897392347455025, + "loss/reg": 0.025727812200784683, + "step": 518 + }, + { + "epoch": 0.2595, + "grad_norm": 1.3351777791976929, + "grad_norm_var": 0.146771754161323, + "learning_rate": 2e-05, + "loss": 0.5675, + "loss/crossentropy": 2.117431879043579, + "loss/hidden": 0.26708984375, + "loss/logits": 0.04313970357179642, + "loss/reg": 0.025725772604346275, + "step": 519 + }, + { + "epoch": 0.26, + "grad_norm": 1.2591443061828613, + "grad_norm_var": 0.1509895364147709, + "learning_rate": 2e-05, + "loss": 0.472, + "loss/crossentropy": 2.2097796201705933, + "loss/hidden": 0.1875, + "loss/logits": 0.027306508272886276, + "loss/reg": 0.025723854079842567, + "step": 520 + }, + { + "epoch": 0.2605, + "grad_norm": 1.4675482511520386, + "grad_norm_var": 0.15135031036683486, + "learning_rate": 2e-05, + "loss": 0.5158, + "loss/crossentropy": 2.1767526865005493, + "loss/hidden": 0.22119140625, + "loss/logits": 0.037375250831246376, + "loss/reg": 0.025722013786435127, + "step": 521 + }, + { + "epoch": 0.261, + "grad_norm": 1.318777322769165, + "grad_norm_var": 0.1477635335278175, + "learning_rate": 2e-05, + "loss": 0.4433, + "loss/crossentropy": 2.3265219926834106, + "loss/hidden": 0.16064453125, + "loss/logits": 0.025505591183900833, + "loss/reg": 0.025719961151480675, + "step": 522 + }, + { + "epoch": 0.2615, + "grad_norm": 1.4309393167495728, + "grad_norm_var": 0.13884665705610644, + "learning_rate": 2e-05, + "loss": 0.4473, + "loss/crossentropy": 2.297981858253479, + "loss/hidden": 0.1650390625, + "loss/logits": 0.025057895109057426, + "loss/reg": 0.025717932730913162, + "step": 523 + }, + { + "epoch": 0.262, + "grad_norm": 2.0628879070281982, + "grad_norm_var": 0.14995613654657897, + "learning_rate": 2e-05, + "loss": 0.4881, + "loss/crossentropy": 2.3760178089141846, + "loss/hidden": 0.1982421875, + "loss/logits": 0.032686688005924225, + "loss/reg": 0.025715861469507217, + "step": 524 + }, + { + "epoch": 0.2625, + "grad_norm": 1.75223970413208, + "grad_norm_var": 0.1517218258554711, + "learning_rate": 2e-05, + "loss": 0.4799, + "loss/crossentropy": 2.3368886709213257, + "loss/hidden": 0.19287109375, + "loss/logits": 0.02990109659731388, + "loss/reg": 0.02571384236216545, + "step": 525 + }, + { + "epoch": 0.263, + "grad_norm": 1.1534109115600586, + "grad_norm_var": 0.16160742489911778, + "learning_rate": 2e-05, + "loss": 0.4834, + "loss/crossentropy": 2.2142513394355774, + "loss/hidden": 0.19482421875, + "loss/logits": 0.031417591497302055, + "loss/reg": 0.025711748749017715, + "step": 526 + }, + { + "epoch": 0.2635, + "grad_norm": 1.425850510597229, + "grad_norm_var": 0.143393700633121, + "learning_rate": 2e-05, + "loss": 0.4661, + "loss/crossentropy": 2.439908742904663, + "loss/hidden": 0.1796875, + "loss/logits": 0.029349423944950104, + "loss/reg": 0.02570977620780468, + "step": 527 + }, + { + "epoch": 0.264, + "grad_norm": 1.4983434677124023, + "grad_norm_var": 0.11392210677285745, + "learning_rate": 2e-05, + "loss": 0.508, + "loss/crossentropy": 2.310701370239258, + "loss/hidden": 0.1728515625, + "loss/logits": 0.078089265152812, + "loss/reg": 0.025707799941301346, + "step": 528 + }, + { + "epoch": 0.2645, + "grad_norm": 1.6121326684951782, + "grad_norm_var": 0.09319685976795024, + "learning_rate": 2e-05, + "loss": 0.5862, + "loss/crossentropy": 2.195641279220581, + "loss/hidden": 0.28857421875, + "loss/logits": 0.04055267106741667, + "loss/reg": 0.025705868378281593, + "step": 529 + }, + { + "epoch": 0.265, + "grad_norm": 1.4942004680633545, + "grad_norm_var": 0.09301259307607192, + "learning_rate": 2e-05, + "loss": 0.5006, + "loss/crossentropy": 2.2726430892944336, + "loss/hidden": 0.2060546875, + "loss/logits": 0.037457194179296494, + "loss/reg": 0.025703880935907364, + "step": 530 + }, + { + "epoch": 0.2655, + "grad_norm": 2.3016085624694824, + "grad_norm_var": 0.11747795625703147, + "learning_rate": 2e-05, + "loss": 0.5752, + "loss/crossentropy": 2.360868453979492, + "loss/hidden": 0.28515625, + "loss/logits": 0.03298346884548664, + "loss/reg": 0.02570200525224209, + "step": 531 + }, + { + "epoch": 0.266, + "grad_norm": 1.9155231714248657, + "grad_norm_var": 0.12606227216750904, + "learning_rate": 2e-05, + "loss": 0.5386, + "loss/crossentropy": 2.1607614755630493, + "loss/hidden": 0.23974609375, + "loss/logits": 0.04186772648245096, + "loss/reg": 0.025700142607092857, + "step": 532 + }, + { + "epoch": 0.2665, + "grad_norm": 1.9601225852966309, + "grad_norm_var": 0.12928627941643545, + "learning_rate": 2e-05, + "loss": 0.5628, + "loss/crossentropy": 2.4702740907669067, + "loss/hidden": 0.2607421875, + "loss/logits": 0.04509196989238262, + "loss/reg": 0.025698326528072357, + "step": 533 + }, + { + "epoch": 0.267, + "grad_norm": 1.6414953470230103, + "grad_norm_var": 0.10157179579757945, + "learning_rate": 2e-05, + "loss": 0.435, + "loss/crossentropy": 2.3161516189575195, + "loss/hidden": 0.15576171875, + "loss/logits": 0.022296501323580742, + "loss/reg": 0.025696277618408203, + "step": 534 + }, + { + "epoch": 0.2675, + "grad_norm": 1.7865321636199951, + "grad_norm_var": 0.09825659810908008, + "learning_rate": 2e-05, + "loss": 0.4618, + "loss/crossentropy": 2.3330507278442383, + "loss/hidden": 0.17724609375, + "loss/logits": 0.02764590922743082, + "loss/reg": 0.025694238021969795, + "step": 535 + }, + { + "epoch": 0.268, + "grad_norm": 1.8025976419448853, + "grad_norm_var": 0.08983964833530009, + "learning_rate": 2e-05, + "loss": 0.5996, + "loss/crossentropy": 2.1311851739883423, + "loss/hidden": 0.2939453125, + "loss/logits": 0.04878038726747036, + "loss/reg": 0.025692163035273552, + "step": 536 + }, + { + "epoch": 0.2685, + "grad_norm": 1.3388440608978271, + "grad_norm_var": 0.09424639337241937, + "learning_rate": 2e-05, + "loss": 0.4705, + "loss/crossentropy": 2.276059627532959, + "loss/hidden": 0.17919921875, + "loss/logits": 0.03437704313546419, + "loss/reg": 0.02569023333489895, + "step": 537 + }, + { + "epoch": 0.269, + "grad_norm": 1.1625409126281738, + "grad_norm_var": 0.10279622484355831, + "learning_rate": 2e-05, + "loss": 0.4461, + "loss/crossentropy": 2.515538215637207, + "loss/hidden": 0.1630859375, + "loss/logits": 0.02611909992992878, + "loss/reg": 0.025688180699944496, + "step": 538 + }, + { + "epoch": 0.2695, + "grad_norm": 1.3560576438903809, + "grad_norm_var": 0.10529593288305386, + "learning_rate": 2e-05, + "loss": 0.4713, + "loss/crossentropy": 2.333776354789734, + "loss/hidden": 0.1796875, + "loss/logits": 0.03480132482945919, + "loss/reg": 0.02568606473505497, + "step": 539 + }, + { + "epoch": 0.27, + "grad_norm": 1.033890724182129, + "grad_norm_var": 0.11366219521287153, + "learning_rate": 2e-05, + "loss": 0.4471, + "loss/crossentropy": 2.451215624809265, + "loss/hidden": 0.16259765625, + "loss/logits": 0.02763993013650179, + "loss/reg": 0.02568388171494007, + "step": 540 + }, + { + "epoch": 0.2705, + "grad_norm": 1.3294425010681152, + "grad_norm_var": 0.11496770242969863, + "learning_rate": 2e-05, + "loss": 0.4625, + "loss/crossentropy": 2.5572686195373535, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03091136459261179, + "loss/reg": 0.025681814178824425, + "step": 541 + }, + { + "epoch": 0.271, + "grad_norm": 1.6161302328109741, + "grad_norm_var": 0.10383304121057577, + "learning_rate": 2e-05, + "loss": 0.5046, + "loss/crossentropy": 2.4156856536865234, + "loss/hidden": 0.21923828125, + "loss/logits": 0.02856369875371456, + "loss/reg": 0.02568003162741661, + "step": 542 + }, + { + "epoch": 0.2715, + "grad_norm": 1.6249600648880005, + "grad_norm_var": 0.10222625558776764, + "learning_rate": 2e-05, + "loss": 0.5214, + "loss/crossentropy": 2.5546233654022217, + "loss/hidden": 0.2255859375, + "loss/logits": 0.03906646929681301, + "loss/reg": 0.025678148493170738, + "step": 543 + }, + { + "epoch": 0.272, + "grad_norm": 1.2705844640731812, + "grad_norm_var": 0.10831713729850012, + "learning_rate": 2e-05, + "loss": 0.5189, + "loss/crossentropy": 2.3757272958755493, + "loss/hidden": 0.21728515625, + "loss/logits": 0.04489796422421932, + "loss/reg": 0.025676140561699867, + "step": 544 + }, + { + "epoch": 0.2725, + "grad_norm": 1.05636727809906, + "grad_norm_var": 0.1250863434263256, + "learning_rate": 2e-05, + "loss": 0.46, + "loss/crossentropy": 2.383628726005554, + "loss/hidden": 0.17529296875, + "loss/logits": 0.02797263953834772, + "loss/reg": 0.025674104690551758, + "step": 545 + }, + { + "epoch": 0.273, + "grad_norm": 1.2423522472381592, + "grad_norm_var": 0.13069532228992856, + "learning_rate": 2e-05, + "loss": 0.4639, + "loss/crossentropy": 2.595247983932495, + "loss/hidden": 0.1767578125, + "loss/logits": 0.030470484867691994, + "loss/reg": 0.02567211352288723, + "step": 546 + }, + { + "epoch": 0.2735, + "grad_norm": 1.1715264320373535, + "grad_norm_var": 0.09386338960438909, + "learning_rate": 2e-05, + "loss": 0.4501, + "loss/crossentropy": 2.321129322052002, + "loss/hidden": 0.16552734375, + "loss/logits": 0.027902510948479176, + "loss/reg": 0.025670204311609268, + "step": 547 + }, + { + "epoch": 0.274, + "grad_norm": 1.5972819328308105, + "grad_norm_var": 0.08072905924476359, + "learning_rate": 2e-05, + "loss": 0.5185, + "loss/crossentropy": 2.2606377601623535, + "loss/hidden": 0.22021484375, + "loss/logits": 0.04160183481872082, + "loss/reg": 0.025668160989880562, + "step": 548 + }, + { + "epoch": 0.2745, + "grad_norm": 1.0867489576339722, + "grad_norm_var": 0.067476102626288, + "learning_rate": 2e-05, + "loss": 0.4364, + "loss/crossentropy": 2.463867425918579, + "loss/hidden": 0.15625, + "loss/logits": 0.02346113882958889, + "loss/reg": 0.02566620334982872, + "step": 549 + }, + { + "epoch": 0.275, + "grad_norm": 2.4649062156677246, + "grad_norm_var": 0.13830422072727325, + "learning_rate": 2e-05, + "loss": 0.5387, + "loss/crossentropy": 2.7201980352401733, + "loss/hidden": 0.20458984375, + "loss/logits": 0.07749359030276537, + "loss/reg": 0.02566409669816494, + "step": 550 + }, + { + "epoch": 0.2755, + "grad_norm": 1.4529809951782227, + "grad_norm_var": 0.1295704130285588, + "learning_rate": 2e-05, + "loss": 0.4755, + "loss/crossentropy": 2.4272106885910034, + "loss/hidden": 0.1943359375, + "loss/logits": 0.024497310630977154, + "loss/reg": 0.025662219151854515, + "step": 551 + }, + { + "epoch": 0.276, + "grad_norm": 1.18130362033844, + "grad_norm_var": 0.12141776800456393, + "learning_rate": 2e-05, + "loss": 0.4905, + "loss/crossentropy": 2.2826067209243774, + "loss/hidden": 0.19677734375, + "loss/logits": 0.03717024438083172, + "loss/reg": 0.025659961625933647, + "step": 552 + }, + { + "epoch": 0.2765, + "grad_norm": 1.4119619131088257, + "grad_norm_var": 0.12140800103305664, + "learning_rate": 2e-05, + "loss": 0.4373, + "loss/crossentropy": 2.6987099647521973, + "loss/hidden": 0.15673828125, + "loss/logits": 0.024006612598896027, + "loss/reg": 0.02565770410001278, + "step": 553 + }, + { + "epoch": 0.277, + "grad_norm": 1.4704711437225342, + "grad_norm_var": 0.11845981336057979, + "learning_rate": 2e-05, + "loss": 0.506, + "loss/crossentropy": 2.258659243583679, + "loss/hidden": 0.21728515625, + "loss/logits": 0.032203953713178635, + "loss/reg": 0.025655701756477356, + "step": 554 + }, + { + "epoch": 0.2775, + "grad_norm": 1.6067429780960083, + "grad_norm_var": 0.12098775757429862, + "learning_rate": 2e-05, + "loss": 0.4385, + "loss/crossentropy": 2.5011746883392334, + "loss/hidden": 0.1572265625, + "loss/logits": 0.024782009422779083, + "loss/reg": 0.025653747841715813, + "step": 555 + }, + { + "epoch": 0.278, + "grad_norm": 1.1298900842666626, + "grad_norm_var": 0.1167034622019452, + "learning_rate": 2e-05, + "loss": 0.4405, + "loss/crossentropy": 2.2035679817199707, + "loss/hidden": 0.15966796875, + "loss/logits": 0.024293298833072186, + "loss/reg": 0.02565157227218151, + "step": 556 + }, + { + "epoch": 0.2785, + "grad_norm": 0.9485200047492981, + "grad_norm_var": 0.13035156532443523, + "learning_rate": 2e-05, + "loss": 0.4479, + "loss/crossentropy": 2.5580928325653076, + "loss/hidden": 0.16357421875, + "loss/logits": 0.027810726314783096, + "loss/reg": 0.025649361312389374, + "step": 557 + }, + { + "epoch": 0.279, + "grad_norm": 1.709061622619629, + "grad_norm_var": 0.13362146514691972, + "learning_rate": 2e-05, + "loss": 0.4703, + "loss/crossentropy": 2.4233322143554688, + "loss/hidden": 0.1826171875, + "loss/logits": 0.031242147088050842, + "loss/reg": 0.025647401809692383, + "step": 558 + }, + { + "epoch": 0.2795, + "grad_norm": 1.1522104740142822, + "grad_norm_var": 0.13351084508299657, + "learning_rate": 2e-05, + "loss": 0.4706, + "loss/crossentropy": 2.5604687929153442, + "loss/hidden": 0.18798828125, + "loss/logits": 0.026114785112440586, + "loss/reg": 0.025645434856414795, + "step": 559 + }, + { + "epoch": 0.28, + "grad_norm": 1.5035618543624878, + "grad_norm_var": 0.13375114473651117, + "learning_rate": 2e-05, + "loss": 0.49, + "loss/crossentropy": 2.5127099752426147, + "loss/hidden": 0.2001953125, + "loss/logits": 0.03332594968378544, + "loss/reg": 0.02564323879778385, + "step": 560 + }, + { + "epoch": 0.2805, + "grad_norm": 1.3412765264511108, + "grad_norm_var": 0.12627894398200917, + "learning_rate": 2e-05, + "loss": 0.4533, + "loss/crossentropy": 2.3233593702316284, + "loss/hidden": 0.158203125, + "loss/logits": 0.038642819970846176, + "loss/reg": 0.02564125321805477, + "step": 561 + }, + { + "epoch": 0.281, + "grad_norm": 1.3613826036453247, + "grad_norm_var": 0.124592250727938, + "learning_rate": 2e-05, + "loss": 0.4388, + "loss/crossentropy": 2.6328701972961426, + "loss/hidden": 0.158203125, + "loss/logits": 0.02416001632809639, + "loss/reg": 0.025639118626713753, + "step": 562 + }, + { + "epoch": 0.2815, + "grad_norm": 1.4453518390655518, + "grad_norm_var": 0.12050377751008766, + "learning_rate": 2e-05, + "loss": 0.467, + "loss/crossentropy": 2.379120349884033, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03583723120391369, + "loss/reg": 0.02563699148595333, + "step": 563 + }, + { + "epoch": 0.282, + "grad_norm": 1.1511297225952148, + "grad_norm_var": 0.12293264284763053, + "learning_rate": 2e-05, + "loss": 0.4638, + "loss/crossentropy": 2.211379051208496, + "loss/hidden": 0.1806640625, + "loss/logits": 0.026775190606713295, + "loss/reg": 0.025634942576289177, + "step": 564 + }, + { + "epoch": 0.2825, + "grad_norm": 1.229429841041565, + "grad_norm_var": 0.11822487448682713, + "learning_rate": 2e-05, + "loss": 0.465, + "loss/crossentropy": 2.3449004888534546, + "loss/hidden": 0.162109375, + "loss/logits": 0.04653145559132099, + "loss/reg": 0.02563273347914219, + "step": 565 + }, + { + "epoch": 0.283, + "grad_norm": 1.7746120691299438, + "grad_norm_var": 0.05091479897566722, + "learning_rate": 2e-05, + "loss": 0.5206, + "loss/crossentropy": 2.4769328832626343, + "loss/hidden": 0.2275390625, + "loss/logits": 0.036771247163414955, + "loss/reg": 0.025630656629800797, + "step": 566 + }, + { + "epoch": 0.2835, + "grad_norm": 1.0254523754119873, + "grad_norm_var": 0.0574298221699075, + "learning_rate": 2e-05, + "loss": 0.4352, + "loss/crossentropy": 2.4587230682373047, + "loss/hidden": 0.15625, + "loss/logits": 0.022636396810412407, + "loss/reg": 0.025628428906202316, + "step": 567 + }, + { + "epoch": 0.284, + "grad_norm": 1.4086933135986328, + "grad_norm_var": 0.05584552607974088, + "learning_rate": 2e-05, + "loss": 0.5428, + "loss/crossentropy": 2.3397552967071533, + "loss/hidden": 0.25244140625, + "loss/logits": 0.03410719987004995, + "loss/reg": 0.025626273825764656, + "step": 568 + }, + { + "epoch": 0.2845, + "grad_norm": 1.1368968486785889, + "grad_norm_var": 0.058461728907536765, + "learning_rate": 2e-05, + "loss": 0.4463, + "loss/crossentropy": 2.4085217714309692, + "loss/hidden": 0.16259765625, + "loss/logits": 0.027470089495182037, + "loss/reg": 0.025624196976423264, + "step": 569 + }, + { + "epoch": 0.285, + "grad_norm": 1.3466085195541382, + "grad_norm_var": 0.0572190922863477, + "learning_rate": 2e-05, + "loss": 0.4488, + "loss/crossentropy": 2.454616904258728, + "loss/hidden": 0.16259765625, + "loss/logits": 0.029985230416059494, + "loss/reg": 0.025622138753533363, + "step": 570 + }, + { + "epoch": 0.2855, + "grad_norm": 1.1087514162063599, + "grad_norm_var": 0.05430530108740682, + "learning_rate": 2e-05, + "loss": 0.4377, + "loss/crossentropy": 2.381610155105591, + "loss/hidden": 0.1572265625, + "loss/logits": 0.024281597696244717, + "loss/reg": 0.025620009750127792, + "step": 571 + }, + { + "epoch": 0.286, + "grad_norm": 2.252387046813965, + "grad_norm_var": 0.10784971065445176, + "learning_rate": 2e-05, + "loss": 0.5819, + "loss/crossentropy": 2.284385323524475, + "loss/hidden": 0.26611328125, + "loss/logits": 0.05961132235825062, + "loss/reg": 0.025618063285946846, + "step": 572 + }, + { + "epoch": 0.2865, + "grad_norm": 1.2841176986694336, + "grad_norm_var": 0.09609813291731933, + "learning_rate": 2e-05, + "loss": 0.4665, + "loss/crossentropy": 2.3138378858566284, + "loss/hidden": 0.18310546875, + "loss/logits": 0.027231371961534023, + "loss/reg": 0.025616133585572243, + "step": 573 + }, + { + "epoch": 0.287, + "grad_norm": 0.9297242164611816, + "grad_norm_var": 0.10084539110608777, + "learning_rate": 2e-05, + "loss": 0.4129, + "loss/crossentropy": 2.41566002368927, + "loss/hidden": 0.13671875, + "loss/logits": 0.020072663202881813, + "loss/reg": 0.02561403624713421, + "step": 574 + }, + { + "epoch": 0.2875, + "grad_norm": 1.3601016998291016, + "grad_norm_var": 0.09832118521838087, + "learning_rate": 2e-05, + "loss": 0.4644, + "loss/crossentropy": 2.2336788177490234, + "loss/hidden": 0.1787109375, + "loss/logits": 0.029539520852267742, + "loss/reg": 0.025612102821469307, + "step": 575 + }, + { + "epoch": 0.288, + "grad_norm": 1.6692289113998413, + "grad_norm_var": 0.10334643999860299, + "learning_rate": 2e-05, + "loss": 0.4483, + "loss/crossentropy": 2.3380844593048096, + "loss/hidden": 0.16552734375, + "loss/logits": 0.02669445425271988, + "loss/reg": 0.02561003342270851, + "step": 576 + }, + { + "epoch": 0.2885, + "grad_norm": 2.4334895610809326, + "grad_norm_var": 0.17458492052786573, + "learning_rate": 2e-05, + "loss": 0.558, + "loss/crossentropy": 2.2851526737213135, + "loss/hidden": 0.2548828125, + "loss/logits": 0.0470340047031641, + "loss/reg": 0.02560798078775406, + "step": 577 + }, + { + "epoch": 0.289, + "grad_norm": 1.154819130897522, + "grad_norm_var": 0.17920585225899094, + "learning_rate": 2e-05, + "loss": 0.4925, + "loss/crossentropy": 2.4699753522872925, + "loss/hidden": 0.20703125, + "loss/logits": 0.02939967904239893, + "loss/reg": 0.025605909526348114, + "step": 578 + }, + { + "epoch": 0.2895, + "grad_norm": 1.2029677629470825, + "grad_norm_var": 0.18203981769593008, + "learning_rate": 2e-05, + "loss": 0.4405, + "loss/crossentropy": 2.388526439666748, + "loss/hidden": 0.15771484375, + "loss/logits": 0.026735836640000343, + "loss/reg": 0.025603823363780975, + "step": 579 + }, + { + "epoch": 0.29, + "grad_norm": 1.2292884588241577, + "grad_norm_var": 0.17978354168637148, + "learning_rate": 2e-05, + "loss": 0.4722, + "loss/crossentropy": 2.2643179893493652, + "loss/hidden": 0.1845703125, + "loss/logits": 0.03162453696131706, + "loss/reg": 0.025601672008633614, + "step": 580 + }, + { + "epoch": 0.2905, + "grad_norm": 1.381611704826355, + "grad_norm_var": 0.1775840985068174, + "learning_rate": 2e-05, + "loss": 0.5298, + "loss/crossentropy": 2.317778706550598, + "loss/hidden": 0.22802734375, + "loss/logits": 0.04581563360989094, + "loss/reg": 0.025599613785743713, + "step": 581 + }, + { + "epoch": 0.291, + "grad_norm": 1.9058457612991333, + "grad_norm_var": 0.18488866977521237, + "learning_rate": 2e-05, + "loss": 0.5568, + "loss/crossentropy": 2.329615592956543, + "loss/hidden": 0.265625, + "loss/logits": 0.03523416444659233, + "loss/reg": 0.02559736929833889, + "step": 582 + }, + { + "epoch": 0.2915, + "grad_norm": 2.325834035873413, + "grad_norm_var": 0.22097551825223125, + "learning_rate": 2e-05, + "loss": 0.5317, + "loss/crossentropy": 2.537761688232422, + "loss/hidden": 0.2373046875, + "loss/logits": 0.038454240188002586, + "loss/reg": 0.025595253333449364, + "step": 583 + }, + { + "epoch": 0.292, + "grad_norm": 1.53029203414917, + "grad_norm_var": 0.22028718572734055, + "learning_rate": 2e-05, + "loss": 0.5085, + "loss/crossentropy": 2.4244139194488525, + "loss/hidden": 0.208984375, + "loss/logits": 0.04360722564160824, + "loss/reg": 0.025593377649784088, + "step": 584 + }, + { + "epoch": 0.2925, + "grad_norm": 1.2639271020889282, + "grad_norm_var": 0.21487899090266935, + "learning_rate": 2e-05, + "loss": 0.4995, + "loss/crossentropy": 2.2803802490234375, + "loss/hidden": 0.19921875, + "loss/logits": 0.04435891658067703, + "loss/reg": 0.025591382756829262, + "step": 585 + }, + { + "epoch": 0.293, + "grad_norm": 1.27842116355896, + "grad_norm_var": 0.21677952247985388, + "learning_rate": 2e-05, + "loss": 0.4548, + "loss/crossentropy": 2.3663710355758667, + "loss/hidden": 0.16845703125, + "loss/logits": 0.030416646972298622, + "loss/reg": 0.02558933198451996, + "step": 586 + }, + { + "epoch": 0.2935, + "grad_norm": 1.7024108171463013, + "grad_norm_var": 0.20629975430104253, + "learning_rate": 2e-05, + "loss": 0.5827, + "loss/crossentropy": 2.059163510799408, + "loss/hidden": 0.29248046875, + "loss/logits": 0.03431258723139763, + "loss/reg": 0.02558741346001625, + "step": 587 + }, + { + "epoch": 0.294, + "grad_norm": 1.4503967761993408, + "grad_norm_var": 0.1720895319235313, + "learning_rate": 2e-05, + "loss": 0.4796, + "loss/crossentropy": 2.4850029945373535, + "loss/hidden": 0.19873046875, + "loss/logits": 0.02503114379942417, + "loss/reg": 0.025585299357771873, + "step": 588 + }, + { + "epoch": 0.2945, + "grad_norm": 1.933565378189087, + "grad_norm_var": 0.1792024124736713, + "learning_rate": 2e-05, + "loss": 0.4693, + "loss/crossentropy": 2.4364657402038574, + "loss/hidden": 0.18408203125, + "loss/logits": 0.029408352449536324, + "loss/reg": 0.025583306327462196, + "step": 589 + }, + { + "epoch": 0.295, + "grad_norm": 1.8611115217208862, + "grad_norm_var": 0.15676426573083635, + "learning_rate": 2e-05, + "loss": 0.4429, + "loss/crossentropy": 2.441853404045105, + "loss/hidden": 0.16259765625, + "loss/logits": 0.02446013130247593, + "loss/reg": 0.025581372901797295, + "step": 590 + }, + { + "epoch": 0.2955, + "grad_norm": 1.1267725229263306, + "grad_norm_var": 0.1677922843229851, + "learning_rate": 2e-05, + "loss": 0.441, + "loss/crossentropy": 2.560555934906006, + "loss/hidden": 0.1591796875, + "loss/logits": 0.026059484109282494, + "loss/reg": 0.025579283013939857, + "step": 591 + }, + { + "epoch": 0.296, + "grad_norm": 9.252140998840332, + "grad_norm_var": 3.841050987302196, + "learning_rate": 2e-05, + "loss": 0.5664, + "loss/crossentropy": 2.2949132919311523, + "loss/hidden": 0.2783203125, + "loss/logits": 0.032294947654008865, + "loss/reg": 0.0255771204829216, + "step": 592 + }, + { + "epoch": 0.2965, + "grad_norm": 1.3007549047470093, + "grad_norm_var": 3.8655234521870527, + "learning_rate": 2e-05, + "loss": 0.441, + "loss/crossentropy": 2.31974720954895, + "loss/hidden": 0.1611328125, + "loss/logits": 0.02411063387989998, + "loss/reg": 0.025574954226613045, + "step": 593 + }, + { + "epoch": 0.297, + "grad_norm": 1.7131131887435913, + "grad_norm_var": 3.822554124166938, + "learning_rate": 2e-05, + "loss": 0.4643, + "loss/crossentropy": 2.39312207698822, + "loss/hidden": 0.18212890625, + "loss/logits": 0.026430404745042324, + "loss/reg": 0.0255727581679821, + "step": 594 + }, + { + "epoch": 0.2975, + "grad_norm": 1.6008955240249634, + "grad_norm_var": 3.7886423499076054, + "learning_rate": 2e-05, + "loss": 0.512, + "loss/crossentropy": 2.3966288566589355, + "loss/hidden": 0.2109375, + "loss/logits": 0.045352160930633545, + "loss/reg": 0.025570496916770935, + "step": 595 + }, + { + "epoch": 0.298, + "grad_norm": 1.7445118427276611, + "grad_norm_var": 3.748611248289865, + "learning_rate": 2e-05, + "loss": 0.538, + "loss/crossentropy": 1.9948007464408875, + "loss/hidden": 0.23388671875, + "loss/logits": 0.0484439916908741, + "loss/reg": 0.025568410754203796, + "step": 596 + }, + { + "epoch": 0.2985, + "grad_norm": 1.8626893758773804, + "grad_norm_var": 3.7179115354234606, + "learning_rate": 2e-05, + "loss": 0.5166, + "loss/crossentropy": 2.4233922958374023, + "loss/hidden": 0.22119140625, + "loss/logits": 0.03974040970206261, + "loss/reg": 0.025566227734088898, + "step": 597 + }, + { + "epoch": 0.299, + "grad_norm": 1.2042471170425415, + "grad_norm_var": 3.7683163733932923, + "learning_rate": 2e-05, + "loss": 0.4547, + "loss/crossentropy": 2.1347005367279053, + "loss/hidden": 0.17041015625, + "loss/logits": 0.028620691038668156, + "loss/reg": 0.025564009323716164, + "step": 598 + }, + { + "epoch": 0.2995, + "grad_norm": 1.5764883756637573, + "grad_norm_var": 3.778044329930853, + "learning_rate": 2e-05, + "loss": 0.4981, + "loss/crossentropy": 2.4556859731674194, + "loss/hidden": 0.2021484375, + "loss/logits": 0.0403362512588501, + "loss/reg": 0.025561654940247536, + "step": 599 + }, + { + "epoch": 0.3, + "grad_norm": 1.096451997756958, + "grad_norm_var": 3.8184307388690244, + "learning_rate": 2e-05, + "loss": 0.5136, + "loss/crossentropy": 2.321175456047058, + "loss/hidden": 0.21875, + "loss/logits": 0.03927676286548376, + "loss/reg": 0.02555953338742256, + "step": 600 + }, + { + "epoch": 0.3005, + "grad_norm": 1.2970784902572632, + "grad_norm_var": 3.8152547172108693, + "learning_rate": 2e-05, + "loss": 0.4625, + "loss/crossentropy": 2.479397773742676, + "loss/hidden": 0.166015625, + "loss/logits": 0.040870534256100655, + "loss/reg": 0.02555713802576065, + "step": 601 + }, + { + "epoch": 0.301, + "grad_norm": 1.2297303676605225, + "grad_norm_var": 3.8200878842337733, + "learning_rate": 2e-05, + "loss": 0.4696, + "loss/crossentropy": 2.384745955467224, + "loss/hidden": 0.1826171875, + "loss/logits": 0.031467003747820854, + "loss/reg": 0.02555503323674202, + "step": 602 + }, + { + "epoch": 0.3015, + "grad_norm": 0.9617077112197876, + "grad_norm_var": 3.883473919292651, + "learning_rate": 2e-05, + "loss": 0.4283, + "loss/crossentropy": 2.3142151832580566, + "loss/hidden": 0.14990234375, + "loss/logits": 0.022820310667157173, + "loss/reg": 0.02555287443101406, + "step": 603 + }, + { + "epoch": 0.302, + "grad_norm": 1.0868256092071533, + "grad_norm_var": 3.9159895776620384, + "learning_rate": 2e-05, + "loss": 0.445, + "loss/crossentropy": 2.4487764835357666, + "loss/hidden": 0.16064453125, + "loss/logits": 0.02884063497185707, + "loss/reg": 0.02555077336728573, + "step": 604 + }, + { + "epoch": 0.3025, + "grad_norm": 1.2197123765945435, + "grad_norm_var": 3.94730949969078, + "learning_rate": 2e-05, + "loss": 0.4717, + "loss/crossentropy": 2.4810107946395874, + "loss/hidden": 0.1865234375, + "loss/logits": 0.02972548082470894, + "loss/reg": 0.02554868534207344, + "step": 605 + }, + { + "epoch": 0.303, + "grad_norm": 1.2248950004577637, + "grad_norm_var": 3.974497531375912, + "learning_rate": 2e-05, + "loss": 0.4324, + "loss/crossentropy": 2.4361231327056885, + "loss/hidden": 0.15380859375, + "loss/logits": 0.023081100545823574, + "loss/reg": 0.025546491146087646, + "step": 606 + }, + { + "epoch": 0.3035, + "grad_norm": 1.5527586936950684, + "grad_norm_var": 3.945123091404479, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.5113409757614136, + "loss/hidden": 0.18896484375, + "loss/logits": 0.02533858921378851, + "loss/reg": 0.02554413489997387, + "step": 607 + }, + { + "epoch": 0.304, + "grad_norm": 2.6096105575561523, + "grad_norm_var": 0.16489908848412535, + "learning_rate": 2e-05, + "loss": 0.442, + "loss/crossentropy": 2.345840811729431, + "loss/hidden": 0.15771484375, + "loss/logits": 0.028883887454867363, + "loss/reg": 0.02554202266037464, + "step": 608 + }, + { + "epoch": 0.3045, + "grad_norm": 1.2581703662872314, + "grad_norm_var": 0.1658887448879168, + "learning_rate": 2e-05, + "loss": 0.4535, + "loss/crossentropy": 2.4540570974349976, + "loss/hidden": 0.16845703125, + "loss/logits": 0.029606305062770844, + "loss/reg": 0.025539804250001907, + "step": 609 + }, + { + "epoch": 0.305, + "grad_norm": 1.0741705894470215, + "grad_norm_var": 0.16919604526549956, + "learning_rate": 2e-05, + "loss": 0.4519, + "loss/crossentropy": 2.4377275705337524, + "loss/hidden": 0.1650390625, + "loss/logits": 0.031461406499147415, + "loss/reg": 0.02553771249949932, + "step": 610 + }, + { + "epoch": 0.3055, + "grad_norm": 1.2582398653030396, + "grad_norm_var": 0.1679268859736533, + "learning_rate": 2e-05, + "loss": 0.4509, + "loss/crossentropy": 2.414643406867981, + "loss/hidden": 0.166015625, + "loss/logits": 0.029505026526749134, + "loss/reg": 0.025535589084029198, + "step": 611 + }, + { + "epoch": 0.306, + "grad_norm": 1.128620982170105, + "grad_norm_var": 0.16261113353333864, + "learning_rate": 2e-05, + "loss": 0.4456, + "loss/crossentropy": 2.4645986557006836, + "loss/hidden": 0.16357421875, + "loss/logits": 0.02667510323226452, + "loss/reg": 0.025533363223075867, + "step": 612 + }, + { + "epoch": 0.3065, + "grad_norm": 1.2573778629302979, + "grad_norm_var": 0.14434184243498857, + "learning_rate": 2e-05, + "loss": 0.4809, + "loss/crossentropy": 2.4240217208862305, + "loss/hidden": 0.1953125, + "loss/logits": 0.030296322889626026, + "loss/reg": 0.025531131774187088, + "step": 613 + }, + { + "epoch": 0.307, + "grad_norm": 7.996622562408447, + "grad_norm_var": 2.9277827960594167, + "learning_rate": 2e-05, + "loss": 0.9676, + "loss/crossentropy": 2.0657594203948975, + "loss/hidden": 0.5224609375, + "loss/logits": 0.1897994950413704, + "loss/reg": 0.02552902325987816, + "step": 614 + }, + { + "epoch": 0.3075, + "grad_norm": 0.9756619930267334, + "grad_norm_var": 2.963385991390479, + "learning_rate": 2e-05, + "loss": 0.4192, + "loss/crossentropy": 2.511542320251465, + "loss/hidden": 0.142578125, + "loss/logits": 0.02132318541407585, + "loss/reg": 0.025526810437440872, + "step": 615 + }, + { + "epoch": 0.308, + "grad_norm": 1.9326781034469604, + "grad_norm_var": 2.939604367144011, + "learning_rate": 2e-05, + "loss": 0.4602, + "loss/crossentropy": 2.2573784589767456, + "loss/hidden": 0.18212890625, + "loss/logits": 0.02284115180373192, + "loss/reg": 0.025524748489260674, + "step": 616 + }, + { + "epoch": 0.3085, + "grad_norm": 1.3322287797927856, + "grad_norm_var": 2.9375401728012682, + "learning_rate": 2e-05, + "loss": 0.4746, + "loss/crossentropy": 2.328918933868408, + "loss/hidden": 0.1884765625, + "loss/logits": 0.03090812638401985, + "loss/reg": 0.02552272193133831, + "step": 617 + }, + { + "epoch": 0.309, + "grad_norm": 1.368570327758789, + "grad_norm_var": 2.92899917136145, + "learning_rate": 2e-05, + "loss": 0.4906, + "loss/crossentropy": 2.449865460395813, + "loss/hidden": 0.189453125, + "loss/logits": 0.04598201438784599, + "loss/reg": 0.025520512834191322, + "step": 618 + }, + { + "epoch": 0.3095, + "grad_norm": 1.2598553895950317, + "grad_norm_var": 2.902626964663748, + "learning_rate": 2e-05, + "loss": 0.4644, + "loss/crossentropy": 2.4811675548553467, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03438819758594036, + "loss/reg": 0.025518309324979782, + "step": 619 + }, + { + "epoch": 0.31, + "grad_norm": 1.5200271606445312, + "grad_norm_var": 2.8741158851437234, + "learning_rate": 2e-05, + "loss": 0.4648, + "loss/crossentropy": 2.443149447441101, + "loss/hidden": 0.1796875, + "loss/logits": 0.02990366704761982, + "loss/reg": 0.025516200810670853, + "step": 620 + }, + { + "epoch": 0.3105, + "grad_norm": 1.1189664602279663, + "grad_norm_var": 2.882687177244717, + "learning_rate": 2e-05, + "loss": 0.4401, + "loss/crossentropy": 2.3359590768814087, + "loss/hidden": 0.1591796875, + "loss/logits": 0.025752616114914417, + "loss/reg": 0.02551414631307125, + "step": 621 + }, + { + "epoch": 0.311, + "grad_norm": 1.1328538656234741, + "grad_norm_var": 2.8903269313735427, + "learning_rate": 2e-05, + "loss": 0.4446, + "loss/crossentropy": 2.3448485136032104, + "loss/hidden": 0.16357421875, + "loss/logits": 0.025948218069970608, + "loss/reg": 0.025512101128697395, + "step": 622 + }, + { + "epoch": 0.3115, + "grad_norm": 1.3280351161956787, + "grad_norm_var": 2.9008471808041234, + "learning_rate": 2e-05, + "loss": 0.502, + "loss/crossentropy": 2.3840510845184326, + "loss/hidden": 0.203125, + "loss/logits": 0.04378024488687515, + "loss/reg": 0.025509938597679138, + "step": 623 + }, + { + "epoch": 0.312, + "grad_norm": 1.2119669914245605, + "grad_norm_var": 2.8691701461931562, + "learning_rate": 2e-05, + "loss": 0.4384, + "loss/crossentropy": 2.5495107173919678, + "loss/hidden": 0.158203125, + "loss/logits": 0.025127064436674118, + "loss/reg": 0.025507742539048195, + "step": 624 + }, + { + "epoch": 0.3125, + "grad_norm": 1.3906071186065674, + "grad_norm_var": 2.862515149821022, + "learning_rate": 2e-05, + "loss": 0.4877, + "loss/crossentropy": 2.418786406517029, + "loss/hidden": 0.1953125, + "loss/logits": 0.037290943786501884, + "loss/reg": 0.02550552599132061, + "step": 625 + }, + { + "epoch": 0.313, + "grad_norm": 1.1530770063400269, + "grad_norm_var": 2.8562631605775035, + "learning_rate": 2e-05, + "loss": 0.4523, + "loss/crossentropy": 2.5153443813323975, + "loss/hidden": 0.16796875, + "loss/logits": 0.029278968460857868, + "loss/reg": 0.02550341933965683, + "step": 626 + }, + { + "epoch": 0.3135, + "grad_norm": 1.9171541929244995, + "grad_norm_var": 2.843679575594864, + "learning_rate": 2e-05, + "loss": 0.4798, + "loss/crossentropy": 2.592397689819336, + "loss/hidden": 0.19482421875, + "loss/logits": 0.029941866174340248, + "loss/reg": 0.025501396507024765, + "step": 627 + }, + { + "epoch": 0.314, + "grad_norm": 1.4067423343658447, + "grad_norm_var": 2.8254152118389118, + "learning_rate": 2e-05, + "loss": 0.4421, + "loss/crossentropy": 2.360334277153015, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02205614186823368, + "loss/reg": 0.02549940161406994, + "step": 628 + }, + { + "epoch": 0.3145, + "grad_norm": 1.271565318107605, + "grad_norm_var": 2.8244601627756833, + "learning_rate": 2e-05, + "loss": 0.492, + "loss/crossentropy": 2.323120355606079, + "loss/hidden": 0.20556640625, + "loss/logits": 0.031410202383995056, + "loss/reg": 0.025497442111372948, + "step": 629 + }, + { + "epoch": 0.315, + "grad_norm": 1.192052960395813, + "grad_norm_var": 0.06888867322267149, + "learning_rate": 2e-05, + "loss": 0.5017, + "loss/crossentropy": 2.176342010498047, + "loss/hidden": 0.2138671875, + "loss/logits": 0.032856905832886696, + "loss/reg": 0.025495316833257675, + "step": 630 + }, + { + "epoch": 0.3155, + "grad_norm": 1.0660690069198608, + "grad_norm_var": 0.06495340762153295, + "learning_rate": 2e-05, + "loss": 0.4197, + "loss/crossentropy": 2.4988861083984375, + "loss/hidden": 0.1416015625, + "loss/logits": 0.023143235594034195, + "loss/reg": 0.025493212044239044, + "step": 631 + }, + { + "epoch": 0.316, + "grad_norm": 1.3349320888519287, + "grad_norm_var": 0.04085774566783152, + "learning_rate": 2e-05, + "loss": 0.4506, + "loss/crossentropy": 2.455591082572937, + "loss/hidden": 0.169921875, + "loss/logits": 0.0257937153801322, + "loss/reg": 0.025491099804639816, + "step": 632 + }, + { + "epoch": 0.3165, + "grad_norm": 3.0417232513427734, + "grad_norm_var": 0.22793577307115717, + "learning_rate": 2e-05, + "loss": 0.5239, + "loss/crossentropy": 2.37486732006073, + "loss/hidden": 0.2314453125, + "loss/logits": 0.03756898641586304, + "loss/reg": 0.025489188730716705, + "step": 633 + }, + { + "epoch": 0.317, + "grad_norm": 1.2681235074996948, + "grad_norm_var": 0.22925030763110257, + "learning_rate": 2e-05, + "loss": 0.5084, + "loss/crossentropy": 2.259597897529602, + "loss/hidden": 0.21337890625, + "loss/logits": 0.040193804539740086, + "loss/reg": 0.025486983358860016, + "step": 634 + }, + { + "epoch": 0.3175, + "grad_norm": 1.7327830791473389, + "grad_norm_var": 0.2335495834433952, + "learning_rate": 2e-05, + "loss": 0.464, + "loss/crossentropy": 2.754118800163269, + "loss/hidden": 0.18115234375, + "loss/logits": 0.027960547246038914, + "loss/reg": 0.025484783574938774, + "step": 635 + }, + { + "epoch": 0.318, + "grad_norm": 1.1379330158233643, + "grad_norm_var": 0.23874590770987894, + "learning_rate": 2e-05, + "loss": 0.4394, + "loss/crossentropy": 2.275562047958374, + "loss/hidden": 0.1630859375, + "loss/logits": 0.021530453115701675, + "loss/reg": 0.02548276260495186, + "step": 636 + }, + { + "epoch": 0.3185, + "grad_norm": 2.230278253555298, + "grad_norm_var": 0.2714714145474554, + "learning_rate": 2e-05, + "loss": 0.5173, + "loss/crossentropy": 2.338230013847351, + "loss/hidden": 0.23681640625, + "loss/logits": 0.025674378499388695, + "loss/reg": 0.025480857118964195, + "step": 637 + }, + { + "epoch": 0.319, + "grad_norm": 1.439009428024292, + "grad_norm_var": 0.26281213986055435, + "learning_rate": 2e-05, + "loss": 0.4592, + "loss/crossentropy": 2.2542585134506226, + "loss/hidden": 0.177734375, + "loss/logits": 0.026685651391744614, + "loss/reg": 0.025478988885879517, + "step": 638 + }, + { + "epoch": 0.3195, + "grad_norm": 1.2269906997680664, + "grad_norm_var": 0.26586984825830745, + "learning_rate": 2e-05, + "loss": 0.484, + "loss/crossentropy": 2.4006763696670532, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03735353797674179, + "loss/reg": 0.02547682449221611, + "step": 639 + }, + { + "epoch": 0.32, + "grad_norm": 1.4380499124526978, + "grad_norm_var": 0.2603422819560449, + "learning_rate": 2e-05, + "loss": 0.4904, + "loss/crossentropy": 2.2535301446914673, + "loss/hidden": 0.2001953125, + "loss/logits": 0.035440364852547646, + "loss/reg": 0.02547490783035755, + "step": 640 + }, + { + "epoch": 0.3205, + "grad_norm": 1.1311625242233276, + "grad_norm_var": 0.268867656697473, + "learning_rate": 2e-05, + "loss": 0.4509, + "loss/crossentropy": 2.3959745168685913, + "loss/hidden": 0.1689453125, + "loss/logits": 0.027200866490602493, + "loss/reg": 0.025472737848758698, + "step": 641 + }, + { + "epoch": 0.321, + "grad_norm": 1.5106732845306396, + "grad_norm_var": 0.2603555469624775, + "learning_rate": 2e-05, + "loss": 0.4666, + "loss/crossentropy": 2.3994356393814087, + "loss/hidden": 0.17919921875, + "loss/logits": 0.03274068981409073, + "loss/reg": 0.025470787659287453, + "step": 642 + }, + { + "epoch": 0.3215, + "grad_norm": 1.823555588722229, + "grad_norm_var": 0.25596636935256256, + "learning_rate": 2e-05, + "loss": 0.5589, + "loss/crossentropy": 2.1762577295303345, + "loss/hidden": 0.255859375, + "loss/logits": 0.04833154007792473, + "loss/reg": 0.025468602776527405, + "step": 643 + }, + { + "epoch": 0.322, + "grad_norm": 1.4282046556472778, + "grad_norm_var": 0.2556832814253122, + "learning_rate": 2e-05, + "loss": 0.4487, + "loss/crossentropy": 2.3608391284942627, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02895598392933607, + "loss/reg": 0.0254666730761528, + "step": 644 + }, + { + "epoch": 0.3225, + "grad_norm": 1.1915183067321777, + "grad_norm_var": 0.2587039981971661, + "learning_rate": 2e-05, + "loss": 0.4567, + "loss/crossentropy": 2.160835921764374, + "loss/hidden": 0.17529296875, + "loss/logits": 0.026723448187112808, + "loss/reg": 0.02546459622681141, + "step": 645 + }, + { + "epoch": 0.323, + "grad_norm": 1.0989381074905396, + "grad_norm_var": 0.2632189617332703, + "learning_rate": 2e-05, + "loss": 0.4694, + "loss/crossentropy": 2.429106831550598, + "loss/hidden": 0.181640625, + "loss/logits": 0.03316484112292528, + "loss/reg": 0.025462418794631958, + "step": 646 + }, + { + "epoch": 0.3235, + "grad_norm": 2.257662296295166, + "grad_norm_var": 0.28202735887923397, + "learning_rate": 2e-05, + "loss": 0.5176, + "loss/crossentropy": 2.431147336959839, + "loss/hidden": 0.22216796875, + "loss/logits": 0.04080248158425093, + "loss/reg": 0.02546020597219467, + "step": 647 + }, + { + "epoch": 0.324, + "grad_norm": 1.9271612167358398, + "grad_norm_var": 0.28453986075382054, + "learning_rate": 2e-05, + "loss": 0.4812, + "loss/crossentropy": 2.2328370809555054, + "loss/hidden": 0.19580078125, + "loss/logits": 0.03081681113690138, + "loss/reg": 0.02545810490846634, + "step": 648 + }, + { + "epoch": 0.3245, + "grad_norm": 1.57036554813385, + "grad_norm_var": 0.14048631360791, + "learning_rate": 2e-05, + "loss": 0.4572, + "loss/crossentropy": 2.3384969234466553, + "loss/hidden": 0.1748046875, + "loss/logits": 0.027865654788911343, + "loss/reg": 0.02545584924519062, + "step": 649 + }, + { + "epoch": 0.325, + "grad_norm": 3.972613573074341, + "grad_norm_var": 0.5047189714591601, + "learning_rate": 2e-05, + "loss": 0.8007, + "loss/crossentropy": 2.171482264995575, + "loss/hidden": 0.39306640625, + "loss/logits": 0.1530690910294652, + "loss/reg": 0.0254536010324955, + "step": 650 + }, + { + "epoch": 0.3255, + "grad_norm": 1.2306694984436035, + "grad_norm_var": 0.5179338564331883, + "learning_rate": 2e-05, + "loss": 0.4915, + "loss/crossentropy": 2.2794147729873657, + "loss/hidden": 0.203125, + "loss/logits": 0.033877959474921227, + "loss/reg": 0.025451431050896645, + "step": 651 + }, + { + "epoch": 0.326, + "grad_norm": 6.3861775398254395, + "grad_norm_var": 1.8717174937138472, + "learning_rate": 2e-05, + "loss": 0.6826, + "loss/crossentropy": 2.2695876359939575, + "loss/hidden": 0.35986328125, + "loss/logits": 0.06826404109597206, + "loss/reg": 0.02544919028878212, + "step": 652 + }, + { + "epoch": 0.3265, + "grad_norm": 0.9534096717834473, + "grad_norm_var": 1.932954969588551, + "learning_rate": 2e-05, + "loss": 0.4184, + "loss/crossentropy": 2.4821490049362183, + "loss/hidden": 0.14208984375, + "loss/logits": 0.02188246138393879, + "loss/reg": 0.02544700726866722, + "step": 653 + }, + { + "epoch": 0.327, + "grad_norm": 1.8388314247131348, + "grad_norm_var": 1.917750585249983, + "learning_rate": 2e-05, + "loss": 0.5184, + "loss/crossentropy": 2.622692823410034, + "loss/hidden": 0.21337890625, + "loss/logits": 0.05053009279072285, + "loss/reg": 0.025444859638810158, + "step": 654 + }, + { + "epoch": 0.3275, + "grad_norm": 1.0860503911972046, + "grad_norm_var": 1.9323275539076297, + "learning_rate": 2e-05, + "loss": 0.4528, + "loss/crossentropy": 2.4405031204223633, + "loss/hidden": 0.17041015625, + "loss/logits": 0.027923785150051117, + "loss/reg": 0.02544267661869526, + "step": 655 + }, + { + "epoch": 0.328, + "grad_norm": 1.8346869945526123, + "grad_norm_var": 1.9162589037726556, + "learning_rate": 2e-05, + "loss": 0.4467, + "loss/crossentropy": 2.561371684074402, + "loss/hidden": 0.16552734375, + "loss/logits": 0.02681200671941042, + "loss/reg": 0.025440504774451256, + "step": 656 + }, + { + "epoch": 0.3285, + "grad_norm": 1.7516165971755981, + "grad_norm_var": 1.8723634601240877, + "learning_rate": 2e-05, + "loss": 0.5473, + "loss/crossentropy": 2.1868897676467896, + "loss/hidden": 0.24951171875, + "loss/logits": 0.04336274042725563, + "loss/reg": 0.025438381358981133, + "step": 657 + }, + { + "epoch": 0.329, + "grad_norm": 1.1200268268585205, + "grad_norm_var": 1.906939612518704, + "learning_rate": 2e-05, + "loss": 0.4269, + "loss/crossentropy": 2.5193029642105103, + "loss/hidden": 0.14892578125, + "loss/logits": 0.023628379218280315, + "loss/reg": 0.025436177849769592, + "step": 658 + }, + { + "epoch": 0.3295, + "grad_norm": 1.2578015327453613, + "grad_norm_var": 1.9377626206597995, + "learning_rate": 2e-05, + "loss": 0.4864, + "loss/crossentropy": 2.4258992671966553, + "loss/hidden": 0.19140625, + "loss/logits": 0.040623242035508156, + "loss/reg": 0.02543400041759014, + "step": 659 + }, + { + "epoch": 0.33, + "grad_norm": 1.0507436990737915, + "grad_norm_var": 1.9720027861822638, + "learning_rate": 2e-05, + "loss": 0.4556, + "loss/crossentropy": 2.3461010456085205, + "loss/hidden": 0.16796875, + "loss/logits": 0.033318827860057354, + "loss/reg": 0.02543184906244278, + "step": 660 + }, + { + "epoch": 0.3305, + "grad_norm": 1.2176129817962646, + "grad_norm_var": 1.969552437425464, + "learning_rate": 2e-05, + "loss": 0.469, + "loss/crossentropy": 2.3208965063095093, + "loss/hidden": 0.1787109375, + "loss/logits": 0.03600373677909374, + "loss/reg": 0.025429651141166687, + "step": 661 + }, + { + "epoch": 0.331, + "grad_norm": 1.1113396883010864, + "grad_norm_var": 1.9682215053201066, + "learning_rate": 2e-05, + "loss": 0.4615, + "loss/crossentropy": 1.9688389897346497, + "loss/hidden": 0.1826171875, + "loss/logits": 0.024597243405878544, + "loss/reg": 0.025427548214793205, + "step": 662 + }, + { + "epoch": 0.3315, + "grad_norm": 2.15281343460083, + "grad_norm_var": 1.9640542341909926, + "learning_rate": 2e-05, + "loss": 0.5739, + "loss/crossentropy": 2.476504325866699, + "loss/hidden": 0.2392578125, + "loss/logits": 0.08039886690676212, + "loss/reg": 0.025425344705581665, + "step": 663 + }, + { + "epoch": 0.332, + "grad_norm": 1.142524003982544, + "grad_norm_var": 2.0000960230816474, + "learning_rate": 2e-05, + "loss": 0.4834, + "loss/crossentropy": 2.45276939868927, + "loss/hidden": 0.1865234375, + "loss/logits": 0.042642902582883835, + "loss/reg": 0.025423116981983185, + "step": 664 + }, + { + "epoch": 0.3325, + "grad_norm": 1.3945834636688232, + "grad_norm_var": 2.0086944041381836, + "learning_rate": 2e-05, + "loss": 0.5291, + "loss/crossentropy": 2.2395424842834473, + "loss/hidden": 0.22265625, + "loss/logits": 0.05220697447657585, + "loss/reg": 0.025420982390642166, + "step": 665 + }, + { + "epoch": 0.333, + "grad_norm": 1.2921602725982666, + "grad_norm_var": 1.6969372224034807, + "learning_rate": 2e-05, + "loss": 0.5112, + "loss/crossentropy": 2.4718152284622192, + "loss/hidden": 0.2158203125, + "loss/logits": 0.041200825944542885, + "loss/reg": 0.025418834760785103, + "step": 666 + }, + { + "epoch": 0.3335, + "grad_norm": 1.8263285160064697, + "grad_norm_var": 1.6837190851105295, + "learning_rate": 2e-05, + "loss": 0.6125, + "loss/crossentropy": 2.3179105520248413, + "loss/hidden": 0.30029296875, + "loss/logits": 0.058010220527648926, + "loss/reg": 0.02541666105389595, + "step": 667 + }, + { + "epoch": 0.334, + "grad_norm": 0.9793453216552734, + "grad_norm_var": 0.14228114450421098, + "learning_rate": 2e-05, + "loss": 0.421, + "loss/crossentropy": 2.375541925430298, + "loss/hidden": 0.1435546875, + "loss/logits": 0.023268045857548714, + "loss/reg": 0.025414319708943367, + "step": 668 + }, + { + "epoch": 0.3345, + "grad_norm": 1.0925960540771484, + "grad_norm_var": 0.13565654288369539, + "learning_rate": 2e-05, + "loss": 0.4561, + "loss/crossentropy": 2.314823031425476, + "loss/hidden": 0.173828125, + "loss/logits": 0.028134356252849102, + "loss/reg": 0.025411993265151978, + "step": 669 + }, + { + "epoch": 0.335, + "grad_norm": 1.2090929746627808, + "grad_norm_var": 0.12227878219406557, + "learning_rate": 2e-05, + "loss": 0.4747, + "loss/crossentropy": 2.471633553504944, + "loss/hidden": 0.19189453125, + "loss/logits": 0.02871632482856512, + "loss/reg": 0.025409623980522156, + "step": 670 + }, + { + "epoch": 0.3355, + "grad_norm": 1.2346880435943604, + "grad_norm_var": 0.11852848812049478, + "learning_rate": 2e-05, + "loss": 0.4636, + "loss/crossentropy": 2.414598226547241, + "loss/hidden": 0.1748046875, + "loss/logits": 0.034768104553222656, + "loss/reg": 0.02540736459195614, + "step": 671 + }, + { + "epoch": 0.336, + "grad_norm": 1.069199800491333, + "grad_norm_var": 0.10611561855972672, + "learning_rate": 2e-05, + "loss": 0.4507, + "loss/crossentropy": 2.2887717485427856, + "loss/hidden": 0.1689453125, + "loss/logits": 0.027695579454302788, + "loss/reg": 0.025404995307326317, + "step": 672 + }, + { + "epoch": 0.3365, + "grad_norm": 1.3578476905822754, + "grad_norm_var": 0.09243173709434505, + "learning_rate": 2e-05, + "loss": 0.4806, + "loss/crossentropy": 2.090680956840515, + "loss/hidden": 0.2001953125, + "loss/logits": 0.026400449685752392, + "loss/reg": 0.025402558967471123, + "step": 673 + }, + { + "epoch": 0.337, + "grad_norm": 1.222834587097168, + "grad_norm_var": 0.09087487045602523, + "learning_rate": 2e-05, + "loss": 0.4371, + "loss/crossentropy": 2.3926587104797363, + "loss/hidden": 0.15234375, + "loss/logits": 0.03075406327843666, + "loss/reg": 0.025400325655937195, + "step": 674 + }, + { + "epoch": 0.3375, + "grad_norm": 1.2310668230056763, + "grad_norm_var": 0.09102797075126906, + "learning_rate": 2e-05, + "loss": 0.487, + "loss/crossentropy": 2.3810113668441772, + "loss/hidden": 0.19970703125, + "loss/logits": 0.03329848870635033, + "loss/reg": 0.0253978930413723, + "step": 675 + }, + { + "epoch": 0.338, + "grad_norm": 1.1485071182250977, + "grad_norm_var": 0.08855158055118005, + "learning_rate": 2e-05, + "loss": 0.4242, + "loss/crossentropy": 2.513722538948059, + "loss/hidden": 0.1455078125, + "loss/logits": 0.02471320889890194, + "loss/reg": 0.025395726785063744, + "step": 676 + }, + { + "epoch": 0.3385, + "grad_norm": 1.302976369857788, + "grad_norm_var": 0.08815285694720097, + "learning_rate": 2e-05, + "loss": 0.408, + "loss/crossentropy": 2.3934881687164307, + "loss/hidden": 0.132568359375, + "loss/logits": 0.0214870385825634, + "loss/reg": 0.025393173098564148, + "step": 677 + }, + { + "epoch": 0.339, + "grad_norm": 1.2492976188659668, + "grad_norm_var": 0.08590898883028307, + "learning_rate": 2e-05, + "loss": 0.4409, + "loss/crossentropy": 2.4734569787979126, + "loss/hidden": 0.16064453125, + "loss/logits": 0.02638374548405409, + "loss/reg": 0.025390924885869026, + "step": 678 + }, + { + "epoch": 0.3395, + "grad_norm": 1.4173915386199951, + "grad_norm_var": 0.03673691192539176, + "learning_rate": 2e-05, + "loss": 0.423, + "loss/crossentropy": 2.4406436681747437, + "loss/hidden": 0.14453125, + "loss/logits": 0.024627392180263996, + "loss/reg": 0.025388652458786964, + "step": 679 + }, + { + "epoch": 0.34, + "grad_norm": 0.9957833290100098, + "grad_norm_var": 0.04039394780387108, + "learning_rate": 2e-05, + "loss": 0.4216, + "loss/crossentropy": 2.472551703453064, + "loss/hidden": 0.14453125, + "loss/logits": 0.023214499466121197, + "loss/reg": 0.025386210530996323, + "step": 680 + }, + { + "epoch": 0.3405, + "grad_norm": 1.3958747386932373, + "grad_norm_var": 0.040418689929556946, + "learning_rate": 2e-05, + "loss": 0.4807, + "loss/crossentropy": 2.636582612991333, + "loss/hidden": 0.18798828125, + "loss/logits": 0.038843123242259026, + "loss/reg": 0.02538374997675419, + "step": 681 + }, + { + "epoch": 0.341, + "grad_norm": 1.2559229135513306, + "grad_norm_var": 0.04030460464576505, + "learning_rate": 2e-05, + "loss": 0.4735, + "loss/crossentropy": 2.2572195529937744, + "loss/hidden": 0.1865234375, + "loss/logits": 0.03318110667169094, + "loss/reg": 0.02538151666522026, + "step": 682 + }, + { + "epoch": 0.3415, + "grad_norm": 1.437334418296814, + "grad_norm_var": 0.019833637621310865, + "learning_rate": 2e-05, + "loss": 0.4508, + "loss/crossentropy": 2.4471691846847534, + "loss/hidden": 0.17041015625, + "loss/logits": 0.026614676229655743, + "loss/reg": 0.0253791194409132, + "step": 683 + }, + { + "epoch": 0.342, + "grad_norm": 1.7899738550186157, + "grad_norm_var": 0.03435388481276878, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.3129884004592896, + "loss/hidden": 0.1865234375, + "loss/logits": 0.029374102130532265, + "loss/reg": 0.025376921519637108, + "step": 684 + }, + { + "epoch": 0.3425, + "grad_norm": 1.0163391828536987, + "grad_norm_var": 0.03657853766482973, + "learning_rate": 2e-05, + "loss": 0.4382, + "loss/crossentropy": 2.441192150115967, + "loss/hidden": 0.16015625, + "loss/logits": 0.024306317791342735, + "loss/reg": 0.025374585762619972, + "step": 685 + }, + { + "epoch": 0.343, + "grad_norm": 1.1746463775634766, + "grad_norm_var": 0.03693649317759663, + "learning_rate": 2e-05, + "loss": 0.4359, + "loss/crossentropy": 2.3794326782226562, + "loss/hidden": 0.15576171875, + "loss/logits": 0.026386510580778122, + "loss/reg": 0.025372277945280075, + "step": 686 + }, + { + "epoch": 0.3435, + "grad_norm": 1.0302844047546387, + "grad_norm_var": 0.04047557695632419, + "learning_rate": 2e-05, + "loss": 0.4222, + "loss/crossentropy": 2.421720266342163, + "loss/hidden": 0.146484375, + "loss/logits": 0.022049223072826862, + "loss/reg": 0.025370018556714058, + "step": 687 + }, + { + "epoch": 0.344, + "grad_norm": 1.4138187170028687, + "grad_norm_var": 0.039316962171863895, + "learning_rate": 2e-05, + "loss": 0.4613, + "loss/crossentropy": 2.4710036516189575, + "loss/hidden": 0.17626953125, + "loss/logits": 0.03138366714119911, + "loss/reg": 0.02536788582801819, + "step": 688 + }, + { + "epoch": 0.3445, + "grad_norm": 1.39634108543396, + "grad_norm_var": 0.03982198390903117, + "learning_rate": 2e-05, + "loss": 0.4409, + "loss/crossentropy": 2.50797963142395, + "loss/hidden": 0.158203125, + "loss/logits": 0.029052263125777245, + "loss/reg": 0.02536572329699993, + "step": 689 + }, + { + "epoch": 0.345, + "grad_norm": 1.4006764888763428, + "grad_norm_var": 0.040445578503683306, + "learning_rate": 2e-05, + "loss": 0.4874, + "loss/crossentropy": 2.327502489089966, + "loss/hidden": 0.20947265625, + "loss/logits": 0.0243146987631917, + "loss/reg": 0.025363536551594734, + "step": 690 + }, + { + "epoch": 0.3455, + "grad_norm": 1.3401939868927002, + "grad_norm_var": 0.04031761591638811, + "learning_rate": 2e-05, + "loss": 0.4645, + "loss/crossentropy": 2.4942984580993652, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03075546585023403, + "loss/reg": 0.02536129206418991, + "step": 691 + }, + { + "epoch": 0.346, + "grad_norm": 1.1368129253387451, + "grad_norm_var": 0.040558999133186016, + "learning_rate": 2e-05, + "loss": 0.4513, + "loss/crossentropy": 2.5324333906173706, + "loss/hidden": 0.1708984375, + "loss/logits": 0.02676941640675068, + "loss/reg": 0.025359032675623894, + "step": 692 + }, + { + "epoch": 0.3465, + "grad_norm": 10.904756546020508, + "grad_norm_var": 5.81021311974864, + "learning_rate": 2e-05, + "loss": 0.6749, + "loss/crossentropy": 2.5305880308151245, + "loss/hidden": 0.39306640625, + "loss/logits": 0.028291589580476284, + "loss/reg": 0.02535676583647728, + "step": 693 + }, + { + "epoch": 0.347, + "grad_norm": 1.0383360385894775, + "grad_norm_var": 5.831219439922715, + "learning_rate": 2e-05, + "loss": 0.4407, + "loss/crossentropy": 2.347463846206665, + "loss/hidden": 0.1640625, + "loss/logits": 0.02308377344161272, + "loss/reg": 0.025354566052556038, + "step": 694 + }, + { + "epoch": 0.3475, + "grad_norm": 1.1074702739715576, + "grad_norm_var": 5.856505480500767, + "learning_rate": 2e-05, + "loss": 0.4515, + "loss/crossentropy": 2.6167062520980835, + "loss/hidden": 0.1689453125, + "loss/logits": 0.029017897322773933, + "loss/reg": 0.025352245196700096, + "step": 695 + }, + { + "epoch": 0.348, + "grad_norm": 1.6335835456848145, + "grad_norm_var": 5.808040474999217, + "learning_rate": 2e-05, + "loss": 0.4665, + "loss/crossentropy": 2.2225993871688843, + "loss/hidden": 0.181640625, + "loss/logits": 0.031376788392663, + "loss/reg": 0.025349974632263184, + "step": 696 + }, + { + "epoch": 0.3485, + "grad_norm": 2.073458194732666, + "grad_norm_var": 5.790781894960122, + "learning_rate": 2e-05, + "loss": 0.513, + "loss/crossentropy": 2.1957470178604126, + "loss/hidden": 0.23193359375, + "loss/logits": 0.027595311403274536, + "loss/reg": 0.02534763514995575, + "step": 697 + }, + { + "epoch": 0.349, + "grad_norm": 4.3756818771362305, + "grad_norm_var": 6.1116753594536, + "learning_rate": 2e-05, + "loss": 0.6104, + "loss/crossentropy": 2.120497226715088, + "loss/hidden": 0.32080078125, + "loss/logits": 0.03617184329777956, + "loss/reg": 0.025345396250486374, + "step": 698 + }, + { + "epoch": 0.3495, + "grad_norm": 2.3373966217041016, + "grad_norm_var": 6.07775883522955, + "learning_rate": 2e-05, + "loss": 0.574, + "loss/crossentropy": 2.3245344161987305, + "loss/hidden": 0.2373046875, + "loss/logits": 0.08329359069466591, + "loss/reg": 0.025343157351017, + "step": 699 + }, + { + "epoch": 0.35, + "grad_norm": 1.0438388586044312, + "grad_norm_var": 6.153157025860973, + "learning_rate": 2e-05, + "loss": 0.4362, + "loss/crossentropy": 2.362974166870117, + "loss/hidden": 0.15576171875, + "loss/logits": 0.026995157822966576, + "loss/reg": 0.025340832769870758, + "step": 700 + }, + { + "epoch": 0.3505, + "grad_norm": 1.6430028676986694, + "grad_norm_var": 6.082854600769767, + "learning_rate": 2e-05, + "loss": 0.5277, + "loss/crossentropy": 2.102017641067505, + "loss/hidden": 0.2333984375, + "loss/logits": 0.04095187783241272, + "loss/reg": 0.025338461622595787, + "step": 701 + }, + { + "epoch": 0.351, + "grad_norm": 1.5996311902999878, + "grad_norm_var": 6.036571733599795, + "learning_rate": 2e-05, + "loss": 0.566, + "loss/crossentropy": 2.1797362565994263, + "loss/hidden": 0.271484375, + "loss/logits": 0.04117584228515625, + "loss/reg": 0.025335904210805893, + "step": 702 + }, + { + "epoch": 0.3515, + "grad_norm": 1.128166913986206, + "grad_norm_var": 6.021680040096112, + "learning_rate": 2e-05, + "loss": 0.4406, + "loss/crossentropy": 2.4110106229782104, + "loss/hidden": 0.1611328125, + "loss/logits": 0.026095453649759293, + "loss/reg": 0.025333648547530174, + "step": 703 + }, + { + "epoch": 0.352, + "grad_norm": 1.8349699974060059, + "grad_norm_var": 5.987309069676893, + "learning_rate": 2e-05, + "loss": 0.5493, + "loss/crossentropy": 2.0857229232788086, + "loss/hidden": 0.2529296875, + "loss/logits": 0.04306299611926079, + "loss/reg": 0.0253314059227705, + "step": 704 + }, + { + "epoch": 0.3525, + "grad_norm": 1.496748924255371, + "grad_norm_var": 5.9765153933005, + "learning_rate": 2e-05, + "loss": 0.4725, + "loss/crossentropy": 2.4090970754623413, + "loss/hidden": 0.18701171875, + "loss/logits": 0.0321922991424799, + "loss/reg": 0.0253291055560112, + "step": 705 + }, + { + "epoch": 0.353, + "grad_norm": 1.9091578722000122, + "grad_norm_var": 5.9346915662249025, + "learning_rate": 2e-05, + "loss": 0.4712, + "loss/crossentropy": 2.4798312187194824, + "loss/hidden": 0.18115234375, + "loss/logits": 0.03674683719873428, + "loss/reg": 0.025326747447252274, + "step": 706 + }, + { + "epoch": 0.3535, + "grad_norm": 1.3746347427368164, + "grad_norm_var": 5.930414656573596, + "learning_rate": 2e-05, + "loss": 0.4387, + "loss/crossentropy": 2.332283616065979, + "loss/hidden": 0.15673828125, + "loss/logits": 0.02867988497018814, + "loss/reg": 0.025324523448944092, + "step": 707 + }, + { + "epoch": 0.354, + "grad_norm": 1.2437435388565063, + "grad_norm_var": 5.914689920860994, + "learning_rate": 2e-05, + "loss": 0.4413, + "loss/crossentropy": 2.4120808839797974, + "loss/hidden": 0.15869140625, + "loss/logits": 0.029423246160149574, + "loss/reg": 0.025322169065475464, + "step": 708 + }, + { + "epoch": 0.3545, + "grad_norm": 1.5218226909637451, + "grad_norm_var": 0.6477736948298792, + "learning_rate": 2e-05, + "loss": 0.4785, + "loss/crossentropy": 2.4414559602737427, + "loss/hidden": 0.1923828125, + "loss/logits": 0.032884806394577026, + "loss/reg": 0.02531973458826542, + "step": 709 + }, + { + "epoch": 0.355, + "grad_norm": 1.0013576745986938, + "grad_norm_var": 0.651171268534646, + "learning_rate": 2e-05, + "loss": 0.4448, + "loss/crossentropy": 2.2079886198043823, + "loss/hidden": 0.158203125, + "loss/logits": 0.033450678922235966, + "loss/reg": 0.025317512452602386, + "step": 710 + }, + { + "epoch": 0.3555, + "grad_norm": 1.558840274810791, + "grad_norm_var": 0.6277757593685663, + "learning_rate": 2e-05, + "loss": 0.5398, + "loss/crossentropy": 2.2513808012008667, + "loss/hidden": 0.23974609375, + "loss/logits": 0.046865444630384445, + "loss/reg": 0.025314999744296074, + "step": 711 + }, + { + "epoch": 0.356, + "grad_norm": 1.1995527744293213, + "grad_norm_var": 0.6454767272231472, + "learning_rate": 2e-05, + "loss": 0.5245, + "loss/crossentropy": 2.1171988248825073, + "loss/hidden": 0.2294921875, + "loss/logits": 0.041836922988295555, + "loss/reg": 0.025312749668955803, + "step": 712 + }, + { + "epoch": 0.3565, + "grad_norm": 1.0955619812011719, + "grad_norm_var": 0.6577077274442764, + "learning_rate": 2e-05, + "loss": 0.4366, + "loss/crossentropy": 2.375182032585144, + "loss/hidden": 0.15576171875, + "loss/logits": 0.027697966434061527, + "loss/reg": 0.02531055547297001, + "step": 713 + }, + { + "epoch": 0.357, + "grad_norm": 1.287891149520874, + "grad_norm_var": 0.13050938322241734, + "learning_rate": 2e-05, + "loss": 0.4445, + "loss/crossentropy": 2.4279476404190063, + "loss/hidden": 0.1640625, + "loss/logits": 0.027320224791765213, + "loss/reg": 0.025308314710855484, + "step": 714 + }, + { + "epoch": 0.3575, + "grad_norm": 1.1665476560592651, + "grad_norm_var": 0.07840015841878997, + "learning_rate": 2e-05, + "loss": 0.4761, + "loss/crossentropy": 2.3419547080993652, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03259772714227438, + "loss/reg": 0.02530606836080551, + "step": 715 + }, + { + "epoch": 0.358, + "grad_norm": 1.0555628538131714, + "grad_norm_var": 0.07788077396049188, + "learning_rate": 2e-05, + "loss": 0.4214, + "loss/crossentropy": 2.2978007793426514, + "loss/hidden": 0.142578125, + "loss/logits": 0.025796832516789436, + "loss/reg": 0.025303872302174568, + "step": 716 + }, + { + "epoch": 0.3585, + "grad_norm": 0.9452884793281555, + "grad_norm_var": 0.084055576139029, + "learning_rate": 2e-05, + "loss": 0.4439, + "loss/crossentropy": 2.2497235536575317, + "loss/hidden": 0.1630859375, + "loss/logits": 0.027815911918878555, + "loss/reg": 0.025301622226834297, + "step": 717 + }, + { + "epoch": 0.359, + "grad_norm": 1.4938298463821411, + "grad_norm_var": 0.08107452606832148, + "learning_rate": 2e-05, + "loss": 0.4895, + "loss/crossentropy": 2.2590330839157104, + "loss/hidden": 0.203125, + "loss/logits": 0.0333606218919158, + "loss/reg": 0.025299306958913803, + "step": 718 + }, + { + "epoch": 0.3595, + "grad_norm": 1.3809986114501953, + "grad_norm_var": 0.07819483831579542, + "learning_rate": 2e-05, + "loss": 0.4289, + "loss/crossentropy": 2.3282105922698975, + "loss/hidden": 0.1474609375, + "loss/logits": 0.028477998450398445, + "loss/reg": 0.025296946987509727, + "step": 719 + }, + { + "epoch": 0.36, + "grad_norm": 1.4700795412063599, + "grad_norm_var": 0.062819776138249, + "learning_rate": 2e-05, + "loss": 0.4491, + "loss/crossentropy": 2.4214909076690674, + "loss/hidden": 0.171875, + "loss/logits": 0.024313151836395264, + "loss/reg": 0.025294575840234756, + "step": 720 + }, + { + "epoch": 0.3605, + "grad_norm": 1.8786181211471558, + "grad_norm_var": 0.08067338037888813, + "learning_rate": 2e-05, + "loss": 0.4645, + "loss/crossentropy": 2.574108123779297, + "loss/hidden": 0.177734375, + "loss/logits": 0.03380656335502863, + "loss/reg": 0.025292182341217995, + "step": 721 + }, + { + "epoch": 0.361, + "grad_norm": 1.4755817651748657, + "grad_norm_var": 0.06003798552636786, + "learning_rate": 2e-05, + "loss": 0.4538, + "loss/crossentropy": 2.327579617500305, + "loss/hidden": 0.171875, + "loss/logits": 0.029062069952487946, + "loss/reg": 0.025289788842201233, + "step": 722 + }, + { + "epoch": 0.3615, + "grad_norm": 1.5997651815414429, + "grad_norm_var": 0.06478959320761259, + "learning_rate": 2e-05, + "loss": 0.4824, + "loss/crossentropy": 2.248897910118103, + "loss/hidden": 0.19189453125, + "loss/logits": 0.037585300393402576, + "loss/reg": 0.025287389755249023, + "step": 723 + }, + { + "epoch": 0.362, + "grad_norm": 0.9326217770576477, + "grad_norm_var": 0.07466397239676793, + "learning_rate": 2e-05, + "loss": 0.4426, + "loss/crossentropy": 2.4139727354049683, + "loss/hidden": 0.16162109375, + "loss/logits": 0.028111821971833706, + "loss/reg": 0.025284940376877785, + "step": 724 + }, + { + "epoch": 0.3625, + "grad_norm": 1.4016598463058472, + "grad_norm_var": 0.07227671584546869, + "learning_rate": 2e-05, + "loss": 0.4875, + "loss/crossentropy": 2.2318572402000427, + "loss/hidden": 0.19140625, + "loss/logits": 0.04322698712348938, + "loss/reg": 0.02528252638876438, + "step": 725 + }, + { + "epoch": 0.363, + "grad_norm": 2.276989698410034, + "grad_norm_var": 0.12165648929605381, + "learning_rate": 2e-05, + "loss": 0.5242, + "loss/crossentropy": 2.2202149629592896, + "loss/hidden": 0.23095703125, + "loss/logits": 0.040427614003419876, + "loss/reg": 0.0252800602465868, + "step": 726 + }, + { + "epoch": 0.3635, + "grad_norm": 1.3967205286026, + "grad_norm_var": 0.11962167472225668, + "learning_rate": 2e-05, + "loss": 0.4749, + "loss/crossentropy": 2.4266481399536133, + "loss/hidden": 0.189453125, + "loss/logits": 0.03263301961123943, + "loss/reg": 0.025277448818087578, + "step": 727 + }, + { + "epoch": 0.364, + "grad_norm": 1.777940273284912, + "grad_norm_var": 0.12672369877618006, + "learning_rate": 2e-05, + "loss": 0.4827, + "loss/crossentropy": 2.5460067987442017, + "loss/hidden": 0.19384765625, + "loss/logits": 0.03612758591771126, + "loss/reg": 0.025274960324168205, + "step": 728 + }, + { + "epoch": 0.3645, + "grad_norm": 1.3306031227111816, + "grad_norm_var": 0.12017416562564118, + "learning_rate": 2e-05, + "loss": 0.4589, + "loss/crossentropy": 2.3280850648880005, + "loss/hidden": 0.17236328125, + "loss/logits": 0.033771621994674206, + "loss/reg": 0.025272710248827934, + "step": 729 + }, + { + "epoch": 0.365, + "grad_norm": 1.8370397090911865, + "grad_norm_var": 0.12865930776388695, + "learning_rate": 2e-05, + "loss": 0.477, + "loss/crossentropy": 2.3396809101104736, + "loss/hidden": 0.1923828125, + "loss/logits": 0.03194649703800678, + "loss/reg": 0.025270242244005203, + "step": 730 + }, + { + "epoch": 0.3655, + "grad_norm": 1.231892704963684, + "grad_norm_var": 0.12633683764280407, + "learning_rate": 2e-05, + "loss": 0.4522, + "loss/crossentropy": 2.5270928144454956, + "loss/hidden": 0.169921875, + "loss/logits": 0.029576458036899567, + "loss/reg": 0.0252678282558918, + "step": 731 + }, + { + "epoch": 0.366, + "grad_norm": 1.4518746137619019, + "grad_norm_var": 0.11436872382724275, + "learning_rate": 2e-05, + "loss": 0.4706, + "loss/crossentropy": 2.5901981592178345, + "loss/hidden": 0.1640625, + "loss/logits": 0.05386000592261553, + "loss/reg": 0.02526557259261608, + "step": 732 + }, + { + "epoch": 0.3665, + "grad_norm": 1.6061288118362427, + "grad_norm_var": 0.09343888808112574, + "learning_rate": 2e-05, + "loss": 0.4847, + "loss/crossentropy": 2.3777267932891846, + "loss/hidden": 0.19482421875, + "loss/logits": 0.03727641887962818, + "loss/reg": 0.02526322938501835, + "step": 733 + }, + { + "epoch": 0.367, + "grad_norm": 1.2036224603652954, + "grad_norm_var": 0.10025301072385275, + "learning_rate": 2e-05, + "loss": 0.4686, + "loss/crossentropy": 2.4308606386184692, + "loss/hidden": 0.18359375, + "loss/logits": 0.0324308592826128, + "loss/reg": 0.02526094578206539, + "step": 734 + }, + { + "epoch": 0.3675, + "grad_norm": 1.1550683975219727, + "grad_norm_var": 0.10750280174214168, + "learning_rate": 2e-05, + "loss": 0.4347, + "loss/crossentropy": 2.320576786994934, + "loss/hidden": 0.1533203125, + "loss/logits": 0.028841860592365265, + "loss/reg": 0.025258498266339302, + "step": 735 + }, + { + "epoch": 0.368, + "grad_norm": 1.1622178554534912, + "grad_norm_var": 0.1147218928368229, + "learning_rate": 2e-05, + "loss": 0.4227, + "loss/crossentropy": 2.37722384929657, + "loss/hidden": 0.14892578125, + "loss/logits": 0.021224712021648884, + "loss/reg": 0.025255965068936348, + "step": 736 + }, + { + "epoch": 0.3685, + "grad_norm": 1.2075239419937134, + "grad_norm_var": 0.1074162568700674, + "learning_rate": 2e-05, + "loss": 0.4463, + "loss/crossentropy": 2.341481566429138, + "loss/hidden": 0.16259765625, + "loss/logits": 0.031207844614982605, + "loss/reg": 0.025253457948565483, + "step": 737 + }, + { + "epoch": 0.369, + "grad_norm": 1.5584073066711426, + "grad_norm_var": 0.10823295060967611, + "learning_rate": 2e-05, + "loss": 0.4797, + "loss/crossentropy": 2.2698925733566284, + "loss/hidden": 0.18701171875, + "loss/logits": 0.04022688418626785, + "loss/reg": 0.025251101702451706, + "step": 738 + }, + { + "epoch": 0.3695, + "grad_norm": 1.0440956354141235, + "grad_norm_var": 0.11611120991532643, + "learning_rate": 2e-05, + "loss": 0.4302, + "loss/crossentropy": 2.3013094663619995, + "loss/hidden": 0.1533203125, + "loss/logits": 0.024398976005613804, + "loss/reg": 0.025248851627111435, + "step": 739 + }, + { + "epoch": 0.37, + "grad_norm": 1.4074509143829346, + "grad_norm_var": 0.09992254468936514, + "learning_rate": 2e-05, + "loss": 0.4871, + "loss/crossentropy": 2.5492948293685913, + "loss/hidden": 0.2041015625, + "loss/logits": 0.03054051846265793, + "loss/reg": 0.025246579200029373, + "step": 740 + }, + { + "epoch": 0.3705, + "grad_norm": 1.2850230932235718, + "grad_norm_var": 0.10137802938979425, + "learning_rate": 2e-05, + "loss": 0.463, + "loss/crossentropy": 2.3693546056747437, + "loss/hidden": 0.17919921875, + "loss/logits": 0.031334346160292625, + "loss/reg": 0.02524430677294731, + "step": 741 + }, + { + "epoch": 0.371, + "grad_norm": 1.5320541858673096, + "grad_norm_var": 0.05226058368684695, + "learning_rate": 2e-05, + "loss": 0.4494, + "loss/crossentropy": 2.404141068458557, + "loss/hidden": 0.16748046875, + "loss/logits": 0.029474626295268536, + "loss/reg": 0.025241872295737267, + "step": 742 + }, + { + "epoch": 0.3715, + "grad_norm": 1.2663581371307373, + "grad_norm_var": 0.05314906099788974, + "learning_rate": 2e-05, + "loss": 0.4418, + "loss/crossentropy": 2.3754160404205322, + "loss/hidden": 0.16455078125, + "loss/logits": 0.0248889597132802, + "loss/reg": 0.0252396073192358, + "step": 743 + }, + { + "epoch": 0.372, + "grad_norm": 1.8194047212600708, + "grad_norm_var": 0.05546441039958623, + "learning_rate": 2e-05, + "loss": 0.4775, + "loss/crossentropy": 2.3306996822357178, + "loss/hidden": 0.173828125, + "loss/logits": 0.051343479193747044, + "loss/reg": 0.02523711882531643, + "step": 744 + }, + { + "epoch": 0.3725, + "grad_norm": 1.1723297834396362, + "grad_norm_var": 0.05809724214908408, + "learning_rate": 2e-05, + "loss": 0.4121, + "loss/crossentropy": 2.492545485496521, + "loss/hidden": 0.13623046875, + "loss/logits": 0.02352056372910738, + "loss/reg": 0.025234658271074295, + "step": 745 + }, + { + "epoch": 0.373, + "grad_norm": 1.085463047027588, + "grad_norm_var": 0.04672765278327275, + "learning_rate": 2e-05, + "loss": 0.4431, + "loss/crossentropy": 2.5141403675079346, + "loss/hidden": 0.1611328125, + "loss/logits": 0.02963507827371359, + "loss/reg": 0.02523215487599373, + "step": 746 + }, + { + "epoch": 0.3735, + "grad_norm": 1.266335129737854, + "grad_norm_var": 0.04637739796541395, + "learning_rate": 2e-05, + "loss": 0.4577, + "loss/crossentropy": 2.527552366256714, + "loss/hidden": 0.177734375, + "loss/logits": 0.02770281210541725, + "loss/reg": 0.025229567661881447, + "step": 747 + }, + { + "epoch": 0.374, + "grad_norm": 1.704702377319336, + "grad_norm_var": 0.05460029232385371, + "learning_rate": 2e-05, + "loss": 0.448, + "loss/crossentropy": 2.5581319332122803, + "loss/hidden": 0.16943359375, + "loss/logits": 0.026291027665138245, + "loss/reg": 0.0252272579818964, + "step": 748 + }, + { + "epoch": 0.3745, + "grad_norm": 2.3668906688690186, + "grad_norm_var": 0.11753805177080157, + "learning_rate": 2e-05, + "loss": 0.5337, + "loss/crossentropy": 2.3304221630096436, + "loss/hidden": 0.24169921875, + "loss/logits": 0.03977209888398647, + "loss/reg": 0.025224953889846802, + "step": 749 + }, + { + "epoch": 0.375, + "grad_norm": 1.3969782590866089, + "grad_norm_var": 0.11507466699231461, + "learning_rate": 2e-05, + "loss": 0.4742, + "loss/crossentropy": 2.3295921087265015, + "loss/hidden": 0.19140625, + "loss/logits": 0.030590247362852097, + "loss/reg": 0.02522265538573265, + "step": 750 + }, + { + "epoch": 0.3755, + "grad_norm": 1.4511960744857788, + "grad_norm_var": 0.11080980776827473, + "learning_rate": 2e-05, + "loss": 0.5357, + "loss/crossentropy": 2.506491780281067, + "loss/hidden": 0.2373046875, + "loss/logits": 0.04617682471871376, + "loss/reg": 0.025220239534974098, + "step": 751 + }, + { + "epoch": 0.376, + "grad_norm": 0.9766618609428406, + "grad_norm_var": 0.1193494277132064, + "learning_rate": 2e-05, + "loss": 0.4495, + "loss/crossentropy": 2.248973250389099, + "loss/hidden": 0.169921875, + "loss/logits": 0.02738242596387863, + "loss/reg": 0.025217954069375992, + "step": 752 + }, + { + "epoch": 0.3765, + "grad_norm": 1.1288150548934937, + "grad_norm_var": 0.12184896532288716, + "learning_rate": 2e-05, + "loss": 0.4228, + "loss/crossentropy": 2.373740792274475, + "loss/hidden": 0.1474609375, + "loss/logits": 0.023226436227560043, + "loss/reg": 0.025215715169906616, + "step": 753 + }, + { + "epoch": 0.377, + "grad_norm": 1.3548938035964966, + "grad_norm_var": 0.12024460158514286, + "learning_rate": 2e-05, + "loss": 0.4838, + "loss/crossentropy": 2.304950475692749, + "loss/hidden": 0.1845703125, + "loss/logits": 0.04713786952197552, + "loss/reg": 0.02521336078643799, + "step": 754 + }, + { + "epoch": 0.3775, + "grad_norm": 1.3141090869903564, + "grad_norm_var": 0.1123061572966031, + "learning_rate": 2e-05, + "loss": 0.469, + "loss/crossentropy": 2.4102286100387573, + "loss/hidden": 0.185546875, + "loss/logits": 0.03135187551379204, + "loss/reg": 0.025211207568645477, + "step": 755 + }, + { + "epoch": 0.378, + "grad_norm": 2.194099187850952, + "grad_norm_var": 0.1509201675997546, + "learning_rate": 2e-05, + "loss": 0.5182, + "loss/crossentropy": 2.5629695653915405, + "loss/hidden": 0.21923828125, + "loss/logits": 0.04687961935997009, + "loss/reg": 0.02520875632762909, + "step": 756 + }, + { + "epoch": 0.3785, + "grad_norm": 1.8557016849517822, + "grad_norm_var": 0.15817322836116407, + "learning_rate": 2e-05, + "loss": 0.4778, + "loss/crossentropy": 2.405009627342224, + "loss/hidden": 0.19677734375, + "loss/logits": 0.028977664187550545, + "loss/reg": 0.025206197053194046, + "step": 757 + }, + { + "epoch": 0.379, + "grad_norm": 1.1612073183059692, + "grad_norm_var": 0.1648314055929359, + "learning_rate": 2e-05, + "loss": 0.4358, + "loss/crossentropy": 2.4666056632995605, + "loss/hidden": 0.1591796875, + "loss/logits": 0.024547006003558636, + "loss/reg": 0.02520374022424221, + "step": 758 + }, + { + "epoch": 0.3795, + "grad_norm": 1.2368805408477783, + "grad_norm_var": 0.16568490433094543, + "learning_rate": 2e-05, + "loss": 0.4821, + "loss/crossentropy": 2.499003052711487, + "loss/hidden": 0.193359375, + "loss/logits": 0.03675047680735588, + "loss/reg": 0.025201212614774704, + "step": 759 + }, + { + "epoch": 0.38, + "grad_norm": 1.1964080333709717, + "grad_norm_var": 0.16074074145114683, + "learning_rate": 2e-05, + "loss": 0.4833, + "loss/crossentropy": 2.2424347400665283, + "loss/hidden": 0.19921875, + "loss/logits": 0.032082391902804375, + "loss/reg": 0.025198953226208687, + "step": 760 + }, + { + "epoch": 0.3805, + "grad_norm": 1.2416514158248901, + "grad_norm_var": 0.15866947858677752, + "learning_rate": 2e-05, + "loss": 0.4837, + "loss/crossentropy": 2.1305224299430847, + "loss/hidden": 0.20068359375, + "loss/logits": 0.031024353578686714, + "loss/reg": 0.02519652061164379, + "step": 761 + }, + { + "epoch": 0.381, + "grad_norm": 1.4174950122833252, + "grad_norm_var": 0.15016297167369203, + "learning_rate": 2e-05, + "loss": 0.4513, + "loss/crossentropy": 2.610305905342102, + "loss/hidden": 0.16796875, + "loss/logits": 0.03143086936324835, + "loss/reg": 0.025194261223077774, + "step": 762 + }, + { + "epoch": 0.3815, + "grad_norm": 1.2875245809555054, + "grad_norm_var": 0.149660827140138, + "learning_rate": 2e-05, + "loss": 0.4411, + "loss/crossentropy": 2.418062686920166, + "loss/hidden": 0.16162109375, + "loss/logits": 0.027532209642231464, + "loss/reg": 0.02519218809902668, + "step": 763 + }, + { + "epoch": 0.382, + "grad_norm": 2.1845688819885254, + "grad_norm_var": 0.180008472094818, + "learning_rate": 2e-05, + "loss": 0.5501, + "loss/crossentropy": 2.4159024953842163, + "loss/hidden": 0.25244140625, + "loss/logits": 0.04572839289903641, + "loss/reg": 0.025189923122525215, + "step": 764 + }, + { + "epoch": 0.3825, + "grad_norm": 1.8935918807983398, + "grad_norm_var": 0.13837621014211632, + "learning_rate": 2e-05, + "loss": 0.4812, + "loss/crossentropy": 2.5697638988494873, + "loss/hidden": 0.1962890625, + "loss/logits": 0.03301386162638664, + "loss/reg": 0.025187674909830093, + "step": 765 + }, + { + "epoch": 0.383, + "grad_norm": 1.5216825008392334, + "grad_norm_var": 0.13837117134393406, + "learning_rate": 2e-05, + "loss": 0.4664, + "loss/crossentropy": 2.4129964113235474, + "loss/hidden": 0.18212890625, + "loss/logits": 0.032372357323765755, + "loss/reg": 0.025185411795973778, + "step": 766 + }, + { + "epoch": 0.3835, + "grad_norm": 1.0873256921768188, + "grad_norm_var": 0.14724468912793848, + "learning_rate": 2e-05, + "loss": 0.4436, + "loss/crossentropy": 2.3542500734329224, + "loss/hidden": 0.1650390625, + "loss/logits": 0.026738815940916538, + "loss/reg": 0.025183262303471565, + "step": 767 + }, + { + "epoch": 0.384, + "grad_norm": 1.8608894348144531, + "grad_norm_var": 0.14139169238716037, + "learning_rate": 2e-05, + "loss": 0.4817, + "loss/crossentropy": 2.070446014404297, + "loss/hidden": 0.19677734375, + "loss/logits": 0.03307647071778774, + "loss/reg": 0.025181252509355545, + "step": 768 + }, + { + "epoch": 0.3845, + "grad_norm": 1.2290267944335938, + "grad_norm_var": 0.1371124714077353, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.607566475868225, + "loss/hidden": 0.18359375, + "loss/logits": 0.03431819751858711, + "loss/reg": 0.025178972631692886, + "step": 769 + }, + { + "epoch": 0.385, + "grad_norm": 1.6517506837844849, + "grad_norm_var": 0.13678511646327815, + "learning_rate": 2e-05, + "loss": 0.5167, + "loss/crossentropy": 2.214709520339966, + "loss/hidden": 0.22412109375, + "loss/logits": 0.04081333614885807, + "loss/reg": 0.0251768808811903, + "step": 770 + }, + { + "epoch": 0.3855, + "grad_norm": 1.0256072282791138, + "grad_norm_var": 0.14994063600200108, + "learning_rate": 2e-05, + "loss": 0.4487, + "loss/crossentropy": 2.3049023151397705, + "loss/hidden": 0.1689453125, + "loss/logits": 0.02800673432648182, + "loss/reg": 0.02517460659146309, + "step": 771 + }, + { + "epoch": 0.386, + "grad_norm": 1.1300290822982788, + "grad_norm_var": 0.12263260379390548, + "learning_rate": 2e-05, + "loss": 0.4688, + "loss/crossentropy": 2.4035372734069824, + "loss/hidden": 0.18701171875, + "loss/logits": 0.030027078464627266, + "loss/reg": 0.025172380730509758, + "step": 772 + }, + { + "epoch": 0.3865, + "grad_norm": 1.5945173501968384, + "grad_norm_var": 0.11229187265839163, + "learning_rate": 2e-05, + "loss": 0.509, + "loss/crossentropy": 2.2906605005264282, + "loss/hidden": 0.2138671875, + "loss/logits": 0.04339625872671604, + "loss/reg": 0.02517029643058777, + "step": 773 + }, + { + "epoch": 0.387, + "grad_norm": 1.2455098628997803, + "grad_norm_var": 0.1098270276560114, + "learning_rate": 2e-05, + "loss": 0.506, + "loss/crossentropy": 2.1885640621185303, + "loss/hidden": 0.220703125, + "loss/logits": 0.033652519807219505, + "loss/reg": 0.025167938321828842, + "step": 774 + }, + { + "epoch": 0.3875, + "grad_norm": 1.943253755569458, + "grad_norm_var": 0.12326830210349768, + "learning_rate": 2e-05, + "loss": 0.4517, + "loss/crossentropy": 2.537282109260559, + "loss/hidden": 0.17041015625, + "loss/logits": 0.029665526933968067, + "loss/reg": 0.0251656174659729, + "step": 775 + }, + { + "epoch": 0.388, + "grad_norm": 1.6714816093444824, + "grad_norm_var": 0.12008035318969133, + "learning_rate": 2e-05, + "loss": 0.4953, + "loss/crossentropy": 2.1838293075561523, + "loss/hidden": 0.208984375, + "loss/logits": 0.034706905484199524, + "loss/reg": 0.025163283571600914, + "step": 776 + }, + { + "epoch": 0.3885, + "grad_norm": 1.2149651050567627, + "grad_norm_var": 0.12104097819330283, + "learning_rate": 2e-05, + "loss": 0.445, + "loss/crossentropy": 2.3864688873291016, + "loss/hidden": 0.16064453125, + "loss/logits": 0.03277465607970953, + "loss/reg": 0.025161121040582657, + "step": 777 + }, + { + "epoch": 0.389, + "grad_norm": 1.1848781108856201, + "grad_norm_var": 0.12690278069956282, + "learning_rate": 2e-05, + "loss": 0.4562, + "loss/crossentropy": 2.383737087249756, + "loss/hidden": 0.17138671875, + "loss/logits": 0.033218057826161385, + "loss/reg": 0.025158870965242386, + "step": 778 + }, + { + "epoch": 0.3895, + "grad_norm": 2.212529420852661, + "grad_norm_var": 0.1562819136879937, + "learning_rate": 2e-05, + "loss": 0.564, + "loss/crossentropy": 2.454702615737915, + "loss/hidden": 0.27734375, + "loss/logits": 0.03507992811501026, + "loss/reg": 0.025156671181321144, + "step": 779 + }, + { + "epoch": 0.39, + "grad_norm": 1.3409082889556885, + "grad_norm_var": 0.12834240393133164, + "learning_rate": 2e-05, + "loss": 0.4781, + "loss/crossentropy": 2.3675941228866577, + "loss/hidden": 0.1875, + "loss/logits": 0.03905305452644825, + "loss/reg": 0.02515433356165886, + "step": 780 + }, + { + "epoch": 0.3905, + "grad_norm": 1.649703025817871, + "grad_norm_var": 0.1188706614056916, + "learning_rate": 2e-05, + "loss": 0.5103, + "loss/crossentropy": 1.9899500608444214, + "loss/hidden": 0.20458984375, + "loss/logits": 0.05418789014220238, + "loss/reg": 0.025151889771223068, + "step": 781 + }, + { + "epoch": 0.391, + "grad_norm": 1.539289951324463, + "grad_norm_var": 0.11900490617594, + "learning_rate": 2e-05, + "loss": 0.4399, + "loss/crossentropy": 2.3202253580093384, + "loss/hidden": 0.16015625, + "loss/logits": 0.02825088147073984, + "loss/reg": 0.025149622932076454, + "step": 782 + }, + { + "epoch": 0.3915, + "grad_norm": 2.8615036010742188, + "grad_norm_var": 0.22430059081523435, + "learning_rate": 2e-05, + "loss": 0.5087, + "loss/crossentropy": 2.3681873083114624, + "loss/hidden": 0.1845703125, + "loss/logits": 0.07264281064271927, + "loss/reg": 0.025147197768092155, + "step": 783 + }, + { + "epoch": 0.392, + "grad_norm": 1.4059878587722778, + "grad_norm_var": 0.22048462683969675, + "learning_rate": 2e-05, + "loss": 0.4234, + "loss/crossentropy": 2.4366742372512817, + "loss/hidden": 0.14990234375, + "loss/logits": 0.022082606330513954, + "loss/reg": 0.025144780054688454, + "step": 784 + }, + { + "epoch": 0.3925, + "grad_norm": 1.011724591255188, + "grad_norm_var": 0.23291844077479976, + "learning_rate": 2e-05, + "loss": 0.4233, + "loss/crossentropy": 2.2704352140426636, + "loss/hidden": 0.1435546875, + "loss/logits": 0.028314979746937752, + "loss/reg": 0.02514229156076908, + "step": 785 + }, + { + "epoch": 0.393, + "grad_norm": 1.193475365638733, + "grad_norm_var": 0.23938277110281755, + "learning_rate": 2e-05, + "loss": 0.4774, + "loss/crossentropy": 2.372236728668213, + "loss/hidden": 0.19091796875, + "loss/logits": 0.0351157495751977, + "loss/reg": 0.0251397043466568, + "step": 786 + }, + { + "epoch": 0.3935, + "grad_norm": 1.1133381128311157, + "grad_norm_var": 0.23414986734980137, + "learning_rate": 2e-05, + "loss": 0.4323, + "loss/crossentropy": 2.2562466859817505, + "loss/hidden": 0.15283203125, + "loss/logits": 0.028081120923161507, + "loss/reg": 0.025137118995189667, + "step": 787 + }, + { + "epoch": 0.394, + "grad_norm": 1.4887886047363281, + "grad_norm_var": 0.22356068135045598, + "learning_rate": 2e-05, + "loss": 0.4772, + "loss/crossentropy": 2.218737244606018, + "loss/hidden": 0.1708984375, + "loss/logits": 0.0549413226544857, + "loss/reg": 0.025134827941656113, + "step": 788 + }, + { + "epoch": 0.3945, + "grad_norm": 1.6351099014282227, + "grad_norm_var": 0.22394795699470554, + "learning_rate": 2e-05, + "loss": 0.4897, + "loss/crossentropy": 2.6010366678237915, + "loss/hidden": 0.2099609375, + "loss/logits": 0.02844669111073017, + "loss/reg": 0.02513228729367256, + "step": 789 + }, + { + "epoch": 0.395, + "grad_norm": 1.6281384229660034, + "grad_norm_var": 0.21784319752439665, + "learning_rate": 2e-05, + "loss": 0.4864, + "loss/crossentropy": 2.3249675035476685, + "loss/hidden": 0.20703125, + "loss/logits": 0.028079986572265625, + "loss/reg": 0.02512998878955841, + "step": 790 + }, + { + "epoch": 0.3955, + "grad_norm": 1.526131510734558, + "grad_norm_var": 0.20787200314066634, + "learning_rate": 2e-05, + "loss": 0.53, + "loss/crossentropy": 2.134896695613861, + "loss/hidden": 0.2412109375, + "loss/logits": 0.037468770518898964, + "loss/reg": 0.025127559900283813, + "step": 791 + }, + { + "epoch": 0.396, + "grad_norm": 1.2636619806289673, + "grad_norm_var": 0.211246353547789, + "learning_rate": 2e-05, + "loss": 0.4303, + "loss/crossentropy": 2.412594199180603, + "loss/hidden": 0.154296875, + "loss/logits": 0.024777178652584553, + "loss/reg": 0.02512528747320175, + "step": 792 + }, + { + "epoch": 0.3965, + "grad_norm": 1.5792723894119263, + "grad_norm_var": 0.2048758713311332, + "learning_rate": 2e-05, + "loss": 0.4862, + "loss/crossentropy": 2.2064541578292847, + "loss/hidden": 0.20068359375, + "loss/logits": 0.034297335892915726, + "loss/reg": 0.02512306347489357, + "step": 793 + }, + { + "epoch": 0.397, + "grad_norm": 1.3188270330429077, + "grad_norm_var": 0.199661045066693, + "learning_rate": 2e-05, + "loss": 0.4927, + "loss/crossentropy": 2.26226544380188, + "loss/hidden": 0.19873046875, + "loss/logits": 0.04274392127990723, + "loss/reg": 0.025120839476585388, + "step": 794 + }, + { + "epoch": 0.3975, + "grad_norm": 1.6991007328033447, + "grad_norm_var": 0.1706464817422428, + "learning_rate": 2e-05, + "loss": 0.4816, + "loss/crossentropy": 2.36691677570343, + "loss/hidden": 0.193359375, + "loss/logits": 0.03702061250805855, + "loss/reg": 0.025118518620729446, + "step": 795 + }, + { + "epoch": 0.398, + "grad_norm": 1.7001802921295166, + "grad_norm_var": 0.1703294579580552, + "learning_rate": 2e-05, + "loss": 0.4837, + "loss/crossentropy": 2.2683218717575073, + "loss/hidden": 0.16943359375, + "loss/logits": 0.06313092540949583, + "loss/reg": 0.025116167962551117, + "step": 796 + }, + { + "epoch": 0.3985, + "grad_norm": 1.6777490377426147, + "grad_norm_var": 0.1707948722071741, + "learning_rate": 2e-05, + "loss": 0.4669, + "loss/crossentropy": 2.364277482032776, + "loss/hidden": 0.181640625, + "loss/logits": 0.03413047455251217, + "loss/reg": 0.02511376328766346, + "step": 797 + }, + { + "epoch": 0.399, + "grad_norm": 1.1140098571777344, + "grad_norm_var": 0.18214716036864212, + "learning_rate": 2e-05, + "loss": 0.4658, + "loss/crossentropy": 2.187538802623749, + "loss/hidden": 0.1826171875, + "loss/logits": 0.03205987066030502, + "loss/reg": 0.025110801681876183, + "step": 798 + }, + { + "epoch": 0.3995, + "grad_norm": 1.129773497581482, + "grad_norm_var": 0.058341697787046044, + "learning_rate": 2e-05, + "loss": 0.4369, + "loss/crossentropy": 2.3475732803344727, + "loss/hidden": 0.1611328125, + "loss/logits": 0.024723156355321407, + "loss/reg": 0.025107914581894875, + "step": 799 + }, + { + "epoch": 0.4, + "grad_norm": 1.3154513835906982, + "grad_norm_var": 0.05884605160209707, + "learning_rate": 2e-05, + "loss": 0.4572, + "loss/crossentropy": 2.398823618888855, + "loss/hidden": 0.17431640625, + "loss/logits": 0.0318829407915473, + "loss/reg": 0.025105012580752373, + "step": 800 + }, + { + "epoch": 0.4005, + "grad_norm": 1.3138872385025024, + "grad_norm_var": 0.048922729616520874, + "learning_rate": 2e-05, + "loss": 0.4692, + "loss/crossentropy": 2.3344963788986206, + "loss/hidden": 0.181640625, + "loss/logits": 0.036580765619874, + "loss/reg": 0.025102730840444565, + "step": 801 + }, + { + "epoch": 0.401, + "grad_norm": 1.0770680904388428, + "grad_norm_var": 0.05326311463356787, + "learning_rate": 2e-05, + "loss": 0.4599, + "loss/crossentropy": 2.337261915206909, + "loss/hidden": 0.17431640625, + "loss/logits": 0.03461520001292229, + "loss/reg": 0.025099987164139748, + "step": 802 + }, + { + "epoch": 0.4015, + "grad_norm": 0.9409591555595398, + "grad_norm_var": 0.06196813771724311, + "learning_rate": 2e-05, + "loss": 0.4278, + "loss/crossentropy": 2.4044833183288574, + "loss/hidden": 0.146484375, + "loss/logits": 0.030327575281262398, + "loss/reg": 0.025096973404288292, + "step": 803 + }, + { + "epoch": 0.402, + "grad_norm": 1.4571963548660278, + "grad_norm_var": 0.06165864774989793, + "learning_rate": 2e-05, + "loss": 0.4391, + "loss/crossentropy": 2.3775731325149536, + "loss/hidden": 0.1591796875, + "loss/logits": 0.028936855494976044, + "loss/reg": 0.025093907490372658, + "step": 804 + }, + { + "epoch": 0.4025, + "grad_norm": 1.3221757411956787, + "grad_norm_var": 0.05790803967387448, + "learning_rate": 2e-05, + "loss": 0.4374, + "loss/crossentropy": 2.535359501838684, + "loss/hidden": 0.15869140625, + "loss/logits": 0.027763372287154198, + "loss/reg": 0.02509160339832306, + "step": 805 + }, + { + "epoch": 0.403, + "grad_norm": 0.944514811038971, + "grad_norm_var": 0.064405569007729, + "learning_rate": 2e-05, + "loss": 0.4569, + "loss/crossentropy": 2.209794282913208, + "loss/hidden": 0.17578125, + "loss/logits": 0.030202921479940414, + "loss/reg": 0.0250887181609869, + "step": 806 + }, + { + "epoch": 0.4035, + "grad_norm": 1.182153344154358, + "grad_norm_var": 0.06309183378906127, + "learning_rate": 2e-05, + "loss": 0.4521, + "loss/crossentropy": 2.2558337450027466, + "loss/hidden": 0.17333984375, + "loss/logits": 0.027920391410589218, + "loss/reg": 0.02508593164384365, + "step": 807 + }, + { + "epoch": 0.404, + "grad_norm": 1.3775771856307983, + "grad_norm_var": 0.06312693371007896, + "learning_rate": 2e-05, + "loss": 0.4939, + "loss/crossentropy": 2.2632880210876465, + "loss/hidden": 0.208984375, + "loss/logits": 0.03405469283461571, + "loss/reg": 0.025083083659410477, + "step": 808 + }, + { + "epoch": 0.4045, + "grad_norm": 1.5316611528396606, + "grad_norm_var": 0.06163456830326434, + "learning_rate": 2e-05, + "loss": 0.5141, + "loss/crossentropy": 2.223612070083618, + "loss/hidden": 0.22705078125, + "loss/logits": 0.036264341324567795, + "loss/reg": 0.02508021518588066, + "step": 809 + }, + { + "epoch": 0.405, + "grad_norm": 1.149705171585083, + "grad_norm_var": 0.06342368922468508, + "learning_rate": 2e-05, + "loss": 0.4366, + "loss/crossentropy": 2.47035813331604, + "loss/hidden": 0.1591796875, + "loss/logits": 0.02665360551327467, + "loss/reg": 0.02507762797176838, + "step": 810 + }, + { + "epoch": 0.4055, + "grad_norm": 1.2824203968048096, + "grad_norm_var": 0.052564492158762674, + "learning_rate": 2e-05, + "loss": 0.4829, + "loss/crossentropy": 2.478409767150879, + "loss/hidden": 0.19775390625, + "loss/logits": 0.034352305345237255, + "loss/reg": 0.02507534809410572, + "step": 811 + }, + { + "epoch": 0.406, + "grad_norm": 2.6002371311187744, + "grad_norm_var": 0.15334706854063704, + "learning_rate": 2e-05, + "loss": 0.5122, + "loss/crossentropy": 2.588177442550659, + "loss/hidden": 0.2373046875, + "loss/logits": 0.024176809936761856, + "loss/reg": 0.025072963908314705, + "step": 812 + }, + { + "epoch": 0.4065, + "grad_norm": 1.5694222450256348, + "grad_norm_var": 0.149181005955631, + "learning_rate": 2e-05, + "loss": 0.5268, + "loss/crossentropy": 2.154956102371216, + "loss/hidden": 0.232421875, + "loss/logits": 0.04367602989077568, + "loss/reg": 0.025070277974009514, + "step": 813 + }, + { + "epoch": 0.407, + "grad_norm": 1.2066580057144165, + "grad_norm_var": 0.1470275588442864, + "learning_rate": 2e-05, + "loss": 0.4439, + "loss/crossentropy": 2.4592679738998413, + "loss/hidden": 0.16064453125, + "loss/logits": 0.03256369009613991, + "loss/reg": 0.025067761540412903, + "step": 814 + }, + { + "epoch": 0.4075, + "grad_norm": 1.298493504524231, + "grad_norm_var": 0.1441324853666197, + "learning_rate": 2e-05, + "loss": 0.4939, + "loss/crossentropy": 2.1921013593673706, + "loss/hidden": 0.20263671875, + "loss/logits": 0.040627798065543175, + "loss/reg": 0.02506544440984726, + "step": 815 + }, + { + "epoch": 0.408, + "grad_norm": 1.0179194211959839, + "grad_norm_var": 0.15096046825236584, + "learning_rate": 2e-05, + "loss": 0.4628, + "loss/crossentropy": 2.2625861167907715, + "loss/hidden": 0.18212890625, + "loss/logits": 0.03002795670181513, + "loss/reg": 0.025062717497348785, + "step": 816 + }, + { + "epoch": 0.4085, + "grad_norm": 1.279781460762024, + "grad_norm_var": 0.1511041804692482, + "learning_rate": 2e-05, + "loss": 0.4842, + "loss/crossentropy": 2.138561725616455, + "loss/hidden": 0.19775390625, + "loss/logits": 0.035851323045790195, + "loss/reg": 0.025060279294848442, + "step": 817 + }, + { + "epoch": 0.409, + "grad_norm": 1.0807183980941772, + "grad_norm_var": 0.15098318869743482, + "learning_rate": 2e-05, + "loss": 0.4273, + "loss/crossentropy": 2.3973305225372314, + "loss/hidden": 0.150390625, + "loss/logits": 0.026333114132285118, + "loss/reg": 0.02505759336054325, + "step": 818 + }, + { + "epoch": 0.4095, + "grad_norm": 1.2695621252059937, + "grad_norm_var": 0.1407917737407074, + "learning_rate": 2e-05, + "loss": 0.4683, + "loss/crossentropy": 2.253230392932892, + "loss/hidden": 0.169921875, + "loss/logits": 0.047788072377443314, + "loss/reg": 0.025055285543203354, + "step": 819 + }, + { + "epoch": 0.41, + "grad_norm": 1.209682822227478, + "grad_norm_var": 0.14102156172926023, + "learning_rate": 2e-05, + "loss": 0.4417, + "loss/crossentropy": 2.182424545288086, + "loss/hidden": 0.1611328125, + "loss/logits": 0.030013758689165115, + "loss/reg": 0.02505277469754219, + "step": 820 + }, + { + "epoch": 0.4105, + "grad_norm": 1.6707624197006226, + "grad_norm_var": 0.1481284569685306, + "learning_rate": 2e-05, + "loss": 0.5191, + "loss/crossentropy": 2.2835570573806763, + "loss/hidden": 0.22802734375, + "loss/logits": 0.04061476141214371, + "loss/reg": 0.025050263851881027, + "step": 821 + }, + { + "epoch": 0.411, + "grad_norm": 1.1448094844818115, + "grad_norm_var": 0.1396880017606003, + "learning_rate": 2e-05, + "loss": 0.4704, + "loss/crossentropy": 2.227652668952942, + "loss/hidden": 0.1787109375, + "loss/logits": 0.041171809658408165, + "loss/reg": 0.02504797838628292, + "step": 822 + }, + { + "epoch": 0.4115, + "grad_norm": 1.3101500272750854, + "grad_norm_var": 0.13755777894761198, + "learning_rate": 2e-05, + "loss": 0.4425, + "loss/crossentropy": 2.4227746725082397, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02695902157574892, + "loss/reg": 0.02504545822739601, + "step": 823 + }, + { + "epoch": 0.412, + "grad_norm": 1.7020496129989624, + "grad_norm_var": 0.14425061011981716, + "learning_rate": 2e-05, + "loss": 0.4481, + "loss/crossentropy": 2.5413230657577515, + "loss/hidden": 0.1572265625, + "loss/logits": 0.040394325740635395, + "loss/reg": 0.025042949244379997, + "step": 824 + }, + { + "epoch": 0.4125, + "grad_norm": 1.0356205701828003, + "grad_norm_var": 0.15060720196286131, + "learning_rate": 2e-05, + "loss": 0.4672, + "loss/crossentropy": 2.308974862098694, + "loss/hidden": 0.18310546875, + "loss/logits": 0.033710891380906105, + "loss/reg": 0.025040656328201294, + "step": 825 + }, + { + "epoch": 0.413, + "grad_norm": 1.0203226804733276, + "grad_norm_var": 0.1553545460901887, + "learning_rate": 2e-05, + "loss": 0.4106, + "loss/crossentropy": 2.477281332015991, + "loss/hidden": 0.13671875, + "loss/logits": 0.023489448241889477, + "loss/reg": 0.025038165971636772, + "step": 826 + }, + { + "epoch": 0.4135, + "grad_norm": 1.1076934337615967, + "grad_norm_var": 0.15898062007053398, + "learning_rate": 2e-05, + "loss": 0.4373, + "loss/crossentropy": 2.4935485124588013, + "loss/hidden": 0.1591796875, + "loss/logits": 0.027745064347982407, + "loss/reg": 0.02503584697842598, + "step": 827 + }, + { + "epoch": 0.414, + "grad_norm": 11.625944137573242, + "grad_norm_var": 6.76073723206649, + "learning_rate": 2e-05, + "loss": 0.5717, + "loss/crossentropy": 2.347122311592102, + "loss/hidden": 0.29052734375, + "loss/logits": 0.030792713165283203, + "loss/reg": 0.025033539161086082, + "step": 828 + }, + { + "epoch": 0.4145, + "grad_norm": 1.3114373683929443, + "grad_norm_var": 6.776589802928325, + "learning_rate": 2e-05, + "loss": 0.4495, + "loss/crossentropy": 2.408790349960327, + "loss/hidden": 0.1708984375, + "loss/logits": 0.028319708071649075, + "loss/reg": 0.02503122203052044, + "step": 829 + }, + { + "epoch": 0.415, + "grad_norm": 1.2084417343139648, + "grad_norm_var": 6.776426715144699, + "learning_rate": 2e-05, + "loss": 0.4981, + "loss/crossentropy": 2.4079878330230713, + "loss/hidden": 0.21044921875, + "loss/logits": 0.037383945658802986, + "loss/reg": 0.0250290185213089, + "step": 830 + }, + { + "epoch": 0.4155, + "grad_norm": 1.5227446556091309, + "grad_norm_var": 6.761783844737624, + "learning_rate": 2e-05, + "loss": 0.5666, + "loss/crossentropy": 2.316787838935852, + "loss/hidden": 0.2734375, + "loss/logits": 0.0429159477353096, + "loss/reg": 0.025026634335517883, + "step": 831 + }, + { + "epoch": 0.416, + "grad_norm": 1.3830286264419556, + "grad_norm_var": 6.7268166954643736, + "learning_rate": 2e-05, + "loss": 0.5173, + "loss/crossentropy": 2.6073665618896484, + "loss/hidden": 0.22900390625, + "loss/logits": 0.03809538949280977, + "loss/reg": 0.025024237111210823, + "step": 832 + }, + { + "epoch": 0.4165, + "grad_norm": 0.9845668077468872, + "grad_norm_var": 6.757864312480587, + "learning_rate": 2e-05, + "loss": 0.4286, + "loss/crossentropy": 2.1612448692321777, + "loss/hidden": 0.15234375, + "loss/logits": 0.026042289100587368, + "loss/reg": 0.025022020563483238, + "step": 833 + }, + { + "epoch": 0.417, + "grad_norm": 1.03498113155365, + "grad_norm_var": 6.763062760659847, + "learning_rate": 2e-05, + "loss": 0.4259, + "loss/crossentropy": 2.3577685356140137, + "loss/hidden": 0.1455078125, + "loss/logits": 0.030183385126292706, + "loss/reg": 0.025019681081175804, + "step": 834 + }, + { + "epoch": 0.4175, + "grad_norm": 1.4572676420211792, + "grad_norm_var": 6.749264821786343, + "learning_rate": 2e-05, + "loss": 0.443, + "loss/crossentropy": 2.2105389833450317, + "loss/hidden": 0.162109375, + "loss/logits": 0.03076254576444626, + "loss/reg": 0.025017455220222473, + "step": 835 + }, + { + "epoch": 0.418, + "grad_norm": 1.650352954864502, + "grad_norm_var": 6.719631400519071, + "learning_rate": 2e-05, + "loss": 0.4355, + "loss/crossentropy": 2.2510547637939453, + "loss/hidden": 0.1572265625, + "loss/logits": 0.028081734664738178, + "loss/reg": 0.025015119463205338, + "step": 836 + }, + { + "epoch": 0.4185, + "grad_norm": 1.5497808456420898, + "grad_norm_var": 6.725020460592711, + "learning_rate": 2e-05, + "loss": 0.4611, + "loss/crossentropy": 2.534460186958313, + "loss/hidden": 0.17578125, + "loss/logits": 0.03517400100827217, + "loss/reg": 0.02501281537115574, + "step": 837 + }, + { + "epoch": 0.419, + "grad_norm": 1.1171302795410156, + "grad_norm_var": 6.728005163235623, + "learning_rate": 2e-05, + "loss": 0.446, + "loss/crossentropy": 2.4289716482162476, + "loss/hidden": 0.1669921875, + "loss/logits": 0.028871508315205574, + "loss/reg": 0.025010673329234123, + "step": 838 + }, + { + "epoch": 0.4195, + "grad_norm": 1.721587061882019, + "grad_norm_var": 6.70409609664535, + "learning_rate": 2e-05, + "loss": 0.434, + "loss/crossentropy": 2.530004143714905, + "loss/hidden": 0.1591796875, + "loss/logits": 0.024708636105060577, + "loss/reg": 0.02500857040286064, + "step": 839 + }, + { + "epoch": 0.42, + "grad_norm": 1.422298550605774, + "grad_norm_var": 6.718779037944629, + "learning_rate": 2e-05, + "loss": 0.4845, + "loss/crossentropy": 2.1376953125, + "loss/hidden": 0.2001953125, + "loss/logits": 0.03421156480908394, + "loss/reg": 0.025006268173456192, + "step": 840 + }, + { + "epoch": 0.4205, + "grad_norm": 1.2492433786392212, + "grad_norm_var": 6.695670215657393, + "learning_rate": 2e-05, + "loss": 0.4417, + "loss/crossentropy": 2.392310380935669, + "loss/hidden": 0.15576171875, + "loss/logits": 0.035860566422343254, + "loss/reg": 0.02500392496585846, + "step": 841 + }, + { + "epoch": 0.421, + "grad_norm": 1.8433566093444824, + "grad_norm_var": 6.634841808570999, + "learning_rate": 2e-05, + "loss": 0.5097, + "loss/crossentropy": 2.4924964904785156, + "loss/hidden": 0.21728515625, + "loss/logits": 0.042372843250632286, + "loss/reg": 0.025001544505357742, + "step": 842 + }, + { + "epoch": 0.4215, + "grad_norm": 1.458393931388855, + "grad_norm_var": 6.600249569106912, + "learning_rate": 2e-05, + "loss": 0.4652, + "loss/crossentropy": 2.4191232919692993, + "loss/hidden": 0.18505859375, + "loss/logits": 0.030173558741807938, + "loss/reg": 0.02499937266111374, + "step": 843 + }, + { + "epoch": 0.422, + "grad_norm": 1.5181694030761719, + "grad_norm_var": 0.05830908442588125, + "learning_rate": 2e-05, + "loss": 0.4305, + "loss/crossentropy": 2.543475866317749, + "loss/hidden": 0.1552734375, + "loss/logits": 0.02529764547944069, + "loss/reg": 0.024997074156999588, + "step": 844 + }, + { + "epoch": 0.4225, + "grad_norm": 1.4709956645965576, + "grad_norm_var": 0.057972554883919496, + "learning_rate": 2e-05, + "loss": 0.4453, + "loss/crossentropy": 2.40644907951355, + "loss/hidden": 0.171875, + "loss/logits": 0.023440631106495857, + "loss/reg": 0.02499477192759514, + "step": 845 + }, + { + "epoch": 0.423, + "grad_norm": 1.2264574766159058, + "grad_norm_var": 0.0575038222824185, + "learning_rate": 2e-05, + "loss": 0.453, + "loss/crossentropy": 2.4405059814453125, + "loss/hidden": 0.17529296875, + "loss/logits": 0.027810130268335342, + "loss/reg": 0.024992434307932854, + "step": 846 + }, + { + "epoch": 0.4235, + "grad_norm": 1.2261029481887817, + "grad_norm_var": 0.05866876723294444, + "learning_rate": 2e-05, + "loss": 0.4206, + "loss/crossentropy": 2.530023455619812, + "loss/hidden": 0.14697265625, + "loss/logits": 0.023748058825731277, + "loss/reg": 0.024990031495690346, + "step": 847 + }, + { + "epoch": 0.424, + "grad_norm": 1.2650319337844849, + "grad_norm_var": 0.05972113104539645, + "learning_rate": 2e-05, + "loss": 0.4805, + "loss/crossentropy": 2.4802552461624146, + "loss/hidden": 0.1787109375, + "loss/logits": 0.05189700424671173, + "loss/reg": 0.02498767152428627, + "step": 848 + }, + { + "epoch": 0.4245, + "grad_norm": 1.5666638612747192, + "grad_norm_var": 0.049646390274153636, + "learning_rate": 2e-05, + "loss": 0.4839, + "loss/crossentropy": 2.2816847562789917, + "loss/hidden": 0.19189453125, + "loss/logits": 0.04210854321718216, + "loss/reg": 0.024985330179333687, + "step": 849 + }, + { + "epoch": 0.425, + "grad_norm": 1.3735864162445068, + "grad_norm_var": 0.03926651318212458, + "learning_rate": 2e-05, + "loss": 0.4633, + "loss/crossentropy": 2.288491129875183, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03332236781716347, + "loss/reg": 0.024982422590255737, + "step": 850 + }, + { + "epoch": 0.4255, + "grad_norm": 1.2534009218215942, + "grad_norm_var": 0.04152457000375349, + "learning_rate": 2e-05, + "loss": 0.505, + "loss/crossentropy": 2.300741195678711, + "loss/hidden": 0.20703125, + "loss/logits": 0.04814612679183483, + "loss/reg": 0.02498042583465576, + "step": 851 + }, + { + "epoch": 0.426, + "grad_norm": 1.2829217910766602, + "grad_norm_var": 0.03926682396235221, + "learning_rate": 2e-05, + "loss": 0.4858, + "loss/crossentropy": 2.076082229614258, + "loss/hidden": 0.19873046875, + "loss/logits": 0.037287235260009766, + "loss/reg": 0.02497800998389721, + "step": 852 + }, + { + "epoch": 0.4265, + "grad_norm": 1.0537066459655762, + "grad_norm_var": 0.04534035977341985, + "learning_rate": 2e-05, + "loss": 0.4146, + "loss/crossentropy": 2.2343058586120605, + "loss/hidden": 0.14208984375, + "loss/logits": 0.022780392318964005, + "loss/reg": 0.024975987151265144, + "step": 853 + }, + { + "epoch": 0.427, + "grad_norm": 1.25690758228302, + "grad_norm_var": 0.04169842414173767, + "learning_rate": 2e-05, + "loss": 0.4457, + "loss/crossentropy": 2.2924128770828247, + "loss/hidden": 0.1689453125, + "loss/logits": 0.027037952095270157, + "loss/reg": 0.024973342195153236, + "step": 854 + }, + { + "epoch": 0.4275, + "grad_norm": 1.1707593202590942, + "grad_norm_var": 0.03607373501481727, + "learning_rate": 2e-05, + "loss": 0.4284, + "loss/crossentropy": 2.1742767095565796, + "loss/hidden": 0.15087890625, + "loss/logits": 0.027766499668359756, + "loss/reg": 0.024970991536974907, + "step": 855 + }, + { + "epoch": 0.428, + "grad_norm": 1.244330883026123, + "grad_norm_var": 0.03639404290223063, + "learning_rate": 2e-05, + "loss": 0.4646, + "loss/crossentropy": 2.283990740776062, + "loss/hidden": 0.181640625, + "loss/logits": 0.03325035236775875, + "loss/reg": 0.024968596175312996, + "step": 856 + }, + { + "epoch": 0.4285, + "grad_norm": 1.376844048500061, + "grad_norm_var": 0.0358462854612099, + "learning_rate": 2e-05, + "loss": 0.4368, + "loss/crossentropy": 2.340665102005005, + "loss/hidden": 0.1591796875, + "loss/logits": 0.027914387173950672, + "loss/reg": 0.024966033175587654, + "step": 857 + }, + { + "epoch": 0.429, + "grad_norm": 1.1170494556427002, + "grad_norm_var": 0.020964417363066385, + "learning_rate": 2e-05, + "loss": 0.4231, + "loss/crossentropy": 2.44227135181427, + "loss/hidden": 0.146484375, + "loss/logits": 0.027010299265384674, + "loss/reg": 0.024963244795799255, + "step": 858 + }, + { + "epoch": 0.4295, + "grad_norm": 1.2896698713302612, + "grad_norm_var": 0.019266560970768804, + "learning_rate": 2e-05, + "loss": 0.4338, + "loss/crossentropy": 2.437178373336792, + "loss/hidden": 0.15673828125, + "loss/logits": 0.027423975989222527, + "loss/reg": 0.024961121380329132, + "step": 859 + }, + { + "epoch": 0.43, + "grad_norm": 1.5006057024002075, + "grad_norm_var": 0.018759206476876972, + "learning_rate": 2e-05, + "loss": 0.4984, + "loss/crossentropy": 2.36915385723114, + "loss/hidden": 0.20703125, + "loss/logits": 0.04180637001991272, + "loss/reg": 0.02495899423956871, + "step": 860 + }, + { + "epoch": 0.4305, + "grad_norm": 1.2376413345336914, + "grad_norm_var": 0.0165992425597094, + "learning_rate": 2e-05, + "loss": 0.4467, + "loss/crossentropy": 2.1004234552383423, + "loss/hidden": 0.17041015625, + "loss/logits": 0.026734575629234314, + "loss/reg": 0.0249563567340374, + "step": 861 + }, + { + "epoch": 0.431, + "grad_norm": 1.1335351467132568, + "grad_norm_var": 0.017772602276823986, + "learning_rate": 2e-05, + "loss": 0.4597, + "loss/crossentropy": 2.2476999759674072, + "loss/hidden": 0.1806640625, + "loss/logits": 0.029479091055691242, + "loss/reg": 0.02495376206934452, + "step": 862 + }, + { + "epoch": 0.4315, + "grad_norm": 1.1975138187408447, + "grad_norm_var": 0.017997867740444682, + "learning_rate": 2e-05, + "loss": 0.454, + "loss/crossentropy": 2.114013433456421, + "loss/hidden": 0.17529296875, + "loss/logits": 0.029184110462665558, + "loss/reg": 0.024951165542006493, + "step": 863 + }, + { + "epoch": 0.432, + "grad_norm": 1.684401512145996, + "grad_norm_var": 0.028711411651534922, + "learning_rate": 2e-05, + "loss": 0.4371, + "loss/crossentropy": 2.5192021131515503, + "loss/hidden": 0.16015625, + "loss/logits": 0.027418741025030613, + "loss/reg": 0.02494893968105316, + "step": 864 + }, + { + "epoch": 0.4325, + "grad_norm": 1.327570915222168, + "grad_norm_var": 0.023662792002424264, + "learning_rate": 2e-05, + "loss": 0.4681, + "loss/crossentropy": 2.387048840522766, + "loss/hidden": 0.18212890625, + "loss/logits": 0.036548664793372154, + "loss/reg": 0.024946413934230804, + "step": 865 + }, + { + "epoch": 0.433, + "grad_norm": 1.168529987335205, + "grad_norm_var": 0.02376700496530641, + "learning_rate": 2e-05, + "loss": 0.4656, + "loss/crossentropy": 2.4732731580734253, + "loss/hidden": 0.181640625, + "loss/logits": 0.034491341561079025, + "loss/reg": 0.02494383417069912, + "step": 866 + }, + { + "epoch": 0.4335, + "grad_norm": 1.0916374921798706, + "grad_norm_var": 0.02572730800574637, + "learning_rate": 2e-05, + "loss": 0.4188, + "loss/crossentropy": 2.4050283432006836, + "loss/hidden": 0.14404296875, + "loss/logits": 0.025381820276379585, + "loss/reg": 0.02494126372039318, + "step": 867 + }, + { + "epoch": 0.434, + "grad_norm": 1.173865795135498, + "grad_norm_var": 0.026113363341109638, + "learning_rate": 2e-05, + "loss": 0.4418, + "loss/crossentropy": 2.3439362049102783, + "loss/hidden": 0.16455078125, + "loss/logits": 0.02782224863767624, + "loss/reg": 0.024939002469182014, + "step": 868 + }, + { + "epoch": 0.4345, + "grad_norm": 1.1083062887191772, + "grad_norm_var": 0.02485949808100442, + "learning_rate": 2e-05, + "loss": 0.4429, + "loss/crossentropy": 2.08747261762619, + "loss/hidden": 0.16357421875, + "loss/logits": 0.029917718842625618, + "loss/reg": 0.024936381727457047, + "step": 869 + }, + { + "epoch": 0.435, + "grad_norm": 2.1887571811676025, + "grad_norm_var": 0.0793744402641759, + "learning_rate": 2e-05, + "loss": 0.5581, + "loss/crossentropy": 2.1568849086761475, + "loss/hidden": 0.26513671875, + "loss/logits": 0.04361843876540661, + "loss/reg": 0.024933794513344765, + "step": 870 + }, + { + "epoch": 0.4355, + "grad_norm": 1.2723170518875122, + "grad_norm_var": 0.07809042331594848, + "learning_rate": 2e-05, + "loss": 0.4427, + "loss/crossentropy": 2.4057594537734985, + "loss/hidden": 0.16162109375, + "loss/logits": 0.03172140009701252, + "loss/reg": 0.024931542575359344, + "step": 871 + }, + { + "epoch": 0.436, + "grad_norm": 1.2788238525390625, + "grad_norm_var": 0.07781891044481218, + "learning_rate": 2e-05, + "loss": 0.4932, + "loss/crossentropy": 2.2258142232894897, + "loss/hidden": 0.2109375, + "loss/logits": 0.032939719036221504, + "loss/reg": 0.024928996339440346, + "step": 872 + }, + { + "epoch": 0.4365, + "grad_norm": 1.833802342414856, + "grad_norm_var": 0.09422989175293613, + "learning_rate": 2e-05, + "loss": 0.4469, + "loss/crossentropy": 2.3266918659210205, + "loss/hidden": 0.17236328125, + "loss/logits": 0.02525283396244049, + "loss/reg": 0.024926558136940002, + "step": 873 + }, + { + "epoch": 0.437, + "grad_norm": 1.3627578020095825, + "grad_norm_var": 0.09036321255378343, + "learning_rate": 2e-05, + "loss": 0.4308, + "loss/crossentropy": 2.6098272800445557, + "loss/hidden": 0.15478515625, + "loss/logits": 0.026725860312581062, + "loss/reg": 0.02492396906018257, + "step": 874 + }, + { + "epoch": 0.4375, + "grad_norm": 1.3417320251464844, + "grad_norm_var": 0.09000547961185816, + "learning_rate": 2e-05, + "loss": 0.4348, + "loss/crossentropy": 2.27658474445343, + "loss/hidden": 0.15673828125, + "loss/logits": 0.028818843886256218, + "loss/reg": 0.02492145262658596, + "step": 875 + }, + { + "epoch": 0.438, + "grad_norm": 1.6484942436218262, + "grad_norm_var": 0.09397019522889086, + "learning_rate": 2e-05, + "loss": 0.4471, + "loss/crossentropy": 2.509611129760742, + "loss/hidden": 0.1689453125, + "loss/logits": 0.028927761130034924, + "loss/reg": 0.024918843060731888, + "step": 876 + }, + { + "epoch": 0.4385, + "grad_norm": 1.3224067687988281, + "grad_norm_var": 0.09283173563057918, + "learning_rate": 2e-05, + "loss": 0.4616, + "loss/crossentropy": 2.326986074447632, + "loss/hidden": 0.17822265625, + "loss/logits": 0.034166223369538784, + "loss/reg": 0.02491624280810356, + "step": 877 + }, + { + "epoch": 0.439, + "grad_norm": 1.520644187927246, + "grad_norm_var": 0.08930074610143818, + "learning_rate": 2e-05, + "loss": 0.4886, + "loss/crossentropy": 2.3941385746002197, + "loss/hidden": 0.19970703125, + "loss/logits": 0.039785370230674744, + "loss/reg": 0.0249137245118618, + "step": 878 + }, + { + "epoch": 0.4395, + "grad_norm": 1.2307255268096924, + "grad_norm_var": 0.0884393859627858, + "learning_rate": 2e-05, + "loss": 0.4364, + "loss/crossentropy": 2.3979439735412598, + "loss/hidden": 0.158203125, + "loss/logits": 0.029046453535556793, + "loss/reg": 0.024911358952522278, + "step": 879 + }, + { + "epoch": 0.44, + "grad_norm": 1.3595565557479858, + "grad_norm_var": 0.08313544190789533, + "learning_rate": 2e-05, + "loss": 0.4471, + "loss/crossentropy": 2.423276662826538, + "loss/hidden": 0.16796875, + "loss/logits": 0.03002047911286354, + "loss/reg": 0.024909034371376038, + "step": 880 + }, + { + "epoch": 0.4405, + "grad_norm": 1.374289870262146, + "grad_norm_var": 0.08288689659587992, + "learning_rate": 2e-05, + "loss": 0.4752, + "loss/crossentropy": 2.37721049785614, + "loss/hidden": 0.193359375, + "loss/logits": 0.032731397077441216, + "loss/reg": 0.024906881153583527, + "step": 881 + }, + { + "epoch": 0.441, + "grad_norm": 1.8934530019760132, + "grad_norm_var": 0.0941036028269572, + "learning_rate": 2e-05, + "loss": 0.4598, + "loss/crossentropy": 2.418339967727661, + "loss/hidden": 0.18212890625, + "loss/logits": 0.028664090670645237, + "loss/reg": 0.02490459941327572, + "step": 882 + }, + { + "epoch": 0.4415, + "grad_norm": 1.4956854581832886, + "grad_norm_var": 0.08566906663214482, + "learning_rate": 2e-05, + "loss": 0.4619, + "loss/crossentropy": 2.519649028778076, + "loss/hidden": 0.1826171875, + "loss/logits": 0.030213934369385242, + "loss/reg": 0.024902526289224625, + "step": 883 + }, + { + "epoch": 0.442, + "grad_norm": 1.841424822807312, + "grad_norm_var": 0.0877992890859374, + "learning_rate": 2e-05, + "loss": 0.455, + "loss/crossentropy": 2.4970178604125977, + "loss/hidden": 0.17626953125, + "loss/logits": 0.02974709589034319, + "loss/reg": 0.024900225922465324, + "step": 884 + }, + { + "epoch": 0.4425, + "grad_norm": 1.140735387802124, + "grad_norm_var": 0.0861516049042431, + "learning_rate": 2e-05, + "loss": 0.4212, + "loss/crossentropy": 2.385036587715149, + "loss/hidden": 0.146484375, + "loss/logits": 0.0257627060636878, + "loss/reg": 0.024898122996091843, + "step": 885 + }, + { + "epoch": 0.443, + "grad_norm": 1.5280400514602661, + "grad_norm_var": 0.05334077575196729, + "learning_rate": 2e-05, + "loss": 0.4512, + "loss/crossentropy": 2.461831569671631, + "loss/hidden": 0.1669921875, + "loss/logits": 0.0352974608540535, + "loss/reg": 0.024895787239074707, + "step": 886 + }, + { + "epoch": 0.4435, + "grad_norm": 1.0629712343215942, + "grad_norm_var": 0.06146672512662115, + "learning_rate": 2e-05, + "loss": 0.4218, + "loss/crossentropy": 2.366239547729492, + "loss/hidden": 0.14794921875, + "loss/logits": 0.02493153791874647, + "loss/reg": 0.024893587455153465, + "step": 887 + }, + { + "epoch": 0.444, + "grad_norm": 1.745954155921936, + "grad_norm_var": 0.06430499243878576, + "learning_rate": 2e-05, + "loss": 0.4751, + "loss/crossentropy": 2.4183106422424316, + "loss/hidden": 0.1826171875, + "loss/logits": 0.043571919202804565, + "loss/reg": 0.024891452863812447, + "step": 888 + }, + { + "epoch": 0.4445, + "grad_norm": 1.5373462438583374, + "grad_norm_var": 0.055868980125863034, + "learning_rate": 2e-05, + "loss": 0.4825, + "loss/crossentropy": 2.448971748352051, + "loss/hidden": 0.1875, + "loss/logits": 0.046090008690953255, + "loss/reg": 0.024889154359698296, + "step": 889 + }, + { + "epoch": 0.445, + "grad_norm": 1.2213661670684814, + "grad_norm_var": 0.05900614209897312, + "learning_rate": 2e-05, + "loss": 0.4548, + "loss/crossentropy": 2.500189185142517, + "loss/hidden": 0.17578125, + "loss/logits": 0.030193179845809937, + "loss/reg": 0.02488705888390541, + "step": 890 + }, + { + "epoch": 0.4455, + "grad_norm": 1.2715861797332764, + "grad_norm_var": 0.06036416983983243, + "learning_rate": 2e-05, + "loss": 0.4079, + "loss/crossentropy": 2.4891607761383057, + "loss/hidden": 0.13671875, + "loss/logits": 0.022357992827892303, + "loss/reg": 0.02488500438630581, + "step": 891 + }, + { + "epoch": 0.446, + "grad_norm": 1.2065671682357788, + "grad_norm_var": 0.06085480104910346, + "learning_rate": 2e-05, + "loss": 0.4132, + "loss/crossentropy": 2.3212687969207764, + "loss/hidden": 0.1416015625, + "loss/logits": 0.02274497877806425, + "loss/reg": 0.024882985278964043, + "step": 892 + }, + { + "epoch": 0.4465, + "grad_norm": 2.286463975906372, + "grad_norm_var": 0.10613483736873922, + "learning_rate": 2e-05, + "loss": 0.6098, + "loss/crossentropy": 1.9856956601142883, + "loss/hidden": 0.29248046875, + "loss/logits": 0.06851914338767529, + "loss/reg": 0.02488100528717041, + "step": 893 + }, + { + "epoch": 0.447, + "grad_norm": 1.3317387104034424, + "grad_norm_var": 0.10739939277282436, + "learning_rate": 2e-05, + "loss": 0.4361, + "loss/crossentropy": 2.180716037750244, + "loss/hidden": 0.1611328125, + "loss/logits": 0.026132527738809586, + "loss/reg": 0.024878744035959244, + "step": 894 + }, + { + "epoch": 0.4475, + "grad_norm": 1.1505863666534424, + "grad_norm_var": 0.11036276513544672, + "learning_rate": 2e-05, + "loss": 0.4076, + "loss/crossentropy": 2.4193174839019775, + "loss/hidden": 0.134765625, + "loss/logits": 0.02407541684806347, + "loss/reg": 0.024876724928617477, + "step": 895 + }, + { + "epoch": 0.448, + "grad_norm": 1.2850412130355835, + "grad_norm_var": 0.11176224122004706, + "learning_rate": 2e-05, + "loss": 0.4134, + "loss/crossentropy": 2.3620327711105347, + "loss/hidden": 0.13916015625, + "loss/logits": 0.025503816083073616, + "loss/reg": 0.024874389171600342, + "step": 896 + }, + { + "epoch": 0.4485, + "grad_norm": 2.1535191535949707, + "grad_norm_var": 0.1407210477913499, + "learning_rate": 2e-05, + "loss": 0.523, + "loss/crossentropy": 2.0216793417930603, + "loss/hidden": 0.2353515625, + "loss/logits": 0.03891510330140591, + "loss/reg": 0.024871978908777237, + "step": 897 + }, + { + "epoch": 0.449, + "grad_norm": 1.4914774894714355, + "grad_norm_var": 0.1302430455595032, + "learning_rate": 2e-05, + "loss": 0.439, + "loss/crossentropy": 2.452531099319458, + "loss/hidden": 0.16455078125, + "loss/logits": 0.025756201706826687, + "loss/reg": 0.024869605898857117, + "step": 898 + }, + { + "epoch": 0.4495, + "grad_norm": 1.766234278678894, + "grad_norm_var": 0.13522470542034715, + "learning_rate": 2e-05, + "loss": 0.4881, + "loss/crossentropy": 2.4874242544174194, + "loss/hidden": 0.203125, + "loss/logits": 0.03627724573016167, + "loss/reg": 0.02486717328429222, + "step": 899 + }, + { + "epoch": 0.45, + "grad_norm": 1.790714979171753, + "grad_norm_var": 0.13308583996840462, + "learning_rate": 2e-05, + "loss": 0.4733, + "loss/crossentropy": 2.4922057390213013, + "loss/hidden": 0.19140625, + "loss/logits": 0.033293405547738075, + "loss/reg": 0.02486467733979225, + "step": 900 + }, + { + "epoch": 0.4505, + "grad_norm": 1.8885260820388794, + "grad_norm_var": 0.13239945321149568, + "learning_rate": 2e-05, + "loss": 0.4617, + "loss/crossentropy": 2.575096845626831, + "loss/hidden": 0.18603515625, + "loss/logits": 0.02701327670365572, + "loss/reg": 0.024862412363290787, + "step": 901 + }, + { + "epoch": 0.451, + "grad_norm": 1.5798112154006958, + "grad_norm_var": 0.13245070282555294, + "learning_rate": 2e-05, + "loss": 0.4422, + "loss/crossentropy": 2.324281692504883, + "loss/hidden": 0.166015625, + "loss/logits": 0.027567077428102493, + "loss/reg": 0.024859966710209846, + "step": 902 + }, + { + "epoch": 0.4515, + "grad_norm": 1.364610195159912, + "grad_norm_var": 0.11862540114961077, + "learning_rate": 2e-05, + "loss": 0.4362, + "loss/crossentropy": 2.336674928665161, + "loss/hidden": 0.1591796875, + "loss/logits": 0.028398605063557625, + "loss/reg": 0.024857668206095695, + "step": 903 + }, + { + "epoch": 0.452, + "grad_norm": 1.5987074375152588, + "grad_norm_var": 0.11646655255089418, + "learning_rate": 2e-05, + "loss": 0.4476, + "loss/crossentropy": 2.581295609474182, + "loss/hidden": 0.1640625, + "loss/logits": 0.03499746974557638, + "loss/reg": 0.024855423718690872, + "step": 904 + }, + { + "epoch": 0.4525, + "grad_norm": 1.2477660179138184, + "grad_norm_var": 0.12249611635972564, + "learning_rate": 2e-05, + "loss": 0.471, + "loss/crossentropy": 2.3965861797332764, + "loss/hidden": 0.1923828125, + "loss/logits": 0.03011870291084051, + "loss/reg": 0.024853060021996498, + "step": 905 + }, + { + "epoch": 0.453, + "grad_norm": 1.091818928718567, + "grad_norm_var": 0.1290430691585063, + "learning_rate": 2e-05, + "loss": 0.4421, + "loss/crossentropy": 2.3112945556640625, + "loss/hidden": 0.16552734375, + "loss/logits": 0.02809662837535143, + "loss/reg": 0.024850843474268913, + "step": 906 + }, + { + "epoch": 0.4535, + "grad_norm": 1.2797828912734985, + "grad_norm_var": 0.1287631299307894, + "learning_rate": 2e-05, + "loss": 0.4194, + "loss/crossentropy": 2.327611804008484, + "loss/hidden": 0.146484375, + "loss/logits": 0.024378618225455284, + "loss/reg": 0.024848705157637596, + "step": 907 + }, + { + "epoch": 0.454, + "grad_norm": 1.0900261402130127, + "grad_norm_var": 0.13467015675929944, + "learning_rate": 2e-05, + "loss": 0.4585, + "loss/crossentropy": 2.3121442794799805, + "loss/hidden": 0.17529296875, + "loss/logits": 0.03474980313330889, + "loss/reg": 0.024846620857715607, + "step": 908 + }, + { + "epoch": 0.4545, + "grad_norm": 1.530750036239624, + "grad_norm_var": 0.09361760922795549, + "learning_rate": 2e-05, + "loss": 0.4755, + "loss/crossentropy": 2.254515528678894, + "loss/hidden": 0.19189453125, + "loss/logits": 0.035164170898497105, + "loss/reg": 0.024844245985150337, + "step": 909 + }, + { + "epoch": 0.455, + "grad_norm": 1.4343830347061157, + "grad_norm_var": 0.09228027400132052, + "learning_rate": 2e-05, + "loss": 0.4977, + "loss/crossentropy": 2.3030155897140503, + "loss/hidden": 0.220703125, + "loss/logits": 0.028591503389179707, + "loss/reg": 0.02484210580587387, + "step": 910 + }, + { + "epoch": 0.4555, + "grad_norm": 1.2215298414230347, + "grad_norm_var": 0.0894411767193444, + "learning_rate": 2e-05, + "loss": 0.4869, + "loss/crossentropy": 2.1451609134674072, + "loss/hidden": 0.203125, + "loss/logits": 0.03534893877804279, + "loss/reg": 0.024839749559760094, + "step": 911 + }, + { + "epoch": 0.456, + "grad_norm": 1.1733628511428833, + "grad_norm_var": 0.09324906194983575, + "learning_rate": 2e-05, + "loss": 0.4387, + "loss/crossentropy": 2.298704981803894, + "loss/hidden": 0.16015625, + "loss/logits": 0.030184932053089142, + "loss/reg": 0.024837518110871315, + "step": 912 + }, + { + "epoch": 0.4565, + "grad_norm": 1.3525742292404175, + "grad_norm_var": 0.06157036227700316, + "learning_rate": 2e-05, + "loss": 0.4353, + "loss/crossentropy": 2.3784111738204956, + "loss/hidden": 0.15576171875, + "loss/logits": 0.031202757731080055, + "loss/reg": 0.02483524940907955, + "step": 913 + }, + { + "epoch": 0.457, + "grad_norm": 1.6027723550796509, + "grad_norm_var": 0.06323633110931534, + "learning_rate": 2e-05, + "loss": 0.5053, + "loss/crossentropy": 2.2770267724990845, + "loss/hidden": 0.21728515625, + "loss/logits": 0.03967934101819992, + "loss/reg": 0.024832794442772865, + "step": 914 + }, + { + "epoch": 0.4575, + "grad_norm": 1.939664602279663, + "grad_norm_var": 0.07269855280353182, + "learning_rate": 2e-05, + "loss": 0.5217, + "loss/crossentropy": 2.3569631576538086, + "loss/hidden": 0.2294921875, + "loss/logits": 0.04393378458917141, + "loss/reg": 0.024830317124724388, + "step": 915 + }, + { + "epoch": 0.458, + "grad_norm": 1.4609216451644897, + "grad_norm_var": 0.06447793501211076, + "learning_rate": 2e-05, + "loss": 0.457, + "loss/crossentropy": 2.481472373008728, + "loss/hidden": 0.17138671875, + "loss/logits": 0.03733105957508087, + "loss/reg": 0.02482791244983673, + "step": 916 + }, + { + "epoch": 0.4585, + "grad_norm": 2.2184019088745117, + "grad_norm_var": 0.09150982546446039, + "learning_rate": 2e-05, + "loss": 0.4721, + "loss/crossentropy": 2.2963072061538696, + "loss/hidden": 0.189453125, + "loss/logits": 0.034421585500240326, + "loss/reg": 0.024825412780046463, + "step": 917 + }, + { + "epoch": 0.459, + "grad_norm": 1.441645622253418, + "grad_norm_var": 0.0902964389133101, + "learning_rate": 2e-05, + "loss": 0.4577, + "loss/crossentropy": 2.3010048866271973, + "loss/hidden": 0.17236328125, + "loss/logits": 0.03706255368888378, + "loss/reg": 0.024822838604450226, + "step": 918 + }, + { + "epoch": 0.4595, + "grad_norm": 2.116910219192505, + "grad_norm_var": 0.11805189358334474, + "learning_rate": 2e-05, + "loss": 0.5238, + "loss/crossentropy": 2.357789158821106, + "loss/hidden": 0.23095703125, + "loss/logits": 0.04468147084116936, + "loss/reg": 0.024820242077112198, + "step": 919 + }, + { + "epoch": 0.46, + "grad_norm": 1.6940172910690308, + "grad_norm_var": 0.12003205518374636, + "learning_rate": 2e-05, + "loss": 0.462, + "loss/crossentropy": 2.6455941200256348, + "loss/hidden": 0.18115234375, + "loss/logits": 0.03263464197516441, + "loss/reg": 0.024817565456032753, + "step": 920 + }, + { + "epoch": 0.4605, + "grad_norm": 1.2647062540054321, + "grad_norm_var": 0.11949490577010594, + "learning_rate": 2e-05, + "loss": 0.4935, + "loss/crossentropy": 2.5739123821258545, + "loss/hidden": 0.21044921875, + "loss/logits": 0.03494640905410051, + "loss/reg": 0.02481519803404808, + "step": 921 + }, + { + "epoch": 0.461, + "grad_norm": 1.8925144672393799, + "grad_norm_var": 0.11656603854064347, + "learning_rate": 2e-05, + "loss": 0.4917, + "loss/crossentropy": 2.2864513397216797, + "loss/hidden": 0.2021484375, + "loss/logits": 0.04141218215227127, + "loss/reg": 0.024812612682580948, + "step": 922 + }, + { + "epoch": 0.4615, + "grad_norm": 1.816635251045227, + "grad_norm_var": 0.11562187436851393, + "learning_rate": 2e-05, + "loss": 0.4829, + "loss/crossentropy": 2.3441028594970703, + "loss/hidden": 0.18994140625, + "loss/logits": 0.044871050864458084, + "loss/reg": 0.02481023781001568, + "step": 923 + }, + { + "epoch": 0.462, + "grad_norm": 1.319472074508667, + "grad_norm_var": 0.10397834789190098, + "learning_rate": 2e-05, + "loss": 0.4607, + "loss/crossentropy": 2.5088049173355103, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03244396485388279, + "loss/reg": 0.02480742521584034, + "step": 924 + }, + { + "epoch": 0.4625, + "grad_norm": 1.5024747848510742, + "grad_norm_var": 0.10426117709982438, + "learning_rate": 2e-05, + "loss": 0.4199, + "loss/crossentropy": 2.44161593914032, + "loss/hidden": 0.14404296875, + "loss/logits": 0.027814405038952827, + "loss/reg": 0.024804776534438133, + "step": 925 + }, + { + "epoch": 0.463, + "grad_norm": 1.0549204349517822, + "grad_norm_var": 0.12117201442257676, + "learning_rate": 2e-05, + "loss": 0.4289, + "loss/crossentropy": 2.441314697265625, + "loss/hidden": 0.15185546875, + "loss/logits": 0.029008976183831692, + "loss/reg": 0.024802392348647118, + "step": 926 + }, + { + "epoch": 0.4635, + "grad_norm": 1.2234230041503906, + "grad_norm_var": 0.12108502599879684, + "learning_rate": 2e-05, + "loss": 0.4649, + "loss/crossentropy": 2.360079288482666, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03184010460972786, + "loss/reg": 0.024800008162856102, + "step": 927 + }, + { + "epoch": 0.464, + "grad_norm": 1.3127866983413696, + "grad_norm_var": 0.11497950175635048, + "learning_rate": 2e-05, + "loss": 0.417, + "loss/crossentropy": 2.2916054725646973, + "loss/hidden": 0.1484375, + "loss/logits": 0.02055790089070797, + "loss/reg": 0.024797627702355385, + "step": 928 + }, + { + "epoch": 0.4645, + "grad_norm": 1.418331503868103, + "grad_norm_var": 0.11329202015476666, + "learning_rate": 2e-05, + "loss": 0.4367, + "loss/crossentropy": 2.228062152862549, + "loss/hidden": 0.15966796875, + "loss/logits": 0.02912633679807186, + "loss/reg": 0.024795077741146088, + "step": 929 + }, + { + "epoch": 0.465, + "grad_norm": 1.2717900276184082, + "grad_norm_var": 0.11913277672642243, + "learning_rate": 2e-05, + "loss": 0.4317, + "loss/crossentropy": 2.266680121421814, + "loss/hidden": 0.1552734375, + "loss/logits": 0.028470346704125404, + "loss/reg": 0.02479269914329052, + "step": 930 + }, + { + "epoch": 0.4655, + "grad_norm": 1.396073341369629, + "grad_norm_var": 0.11003177528165793, + "learning_rate": 2e-05, + "loss": 0.483, + "loss/crossentropy": 2.5918630361557007, + "loss/hidden": 0.1953125, + "loss/logits": 0.03977473732084036, + "loss/reg": 0.02479018084704876, + "step": 931 + }, + { + "epoch": 0.466, + "grad_norm": 1.1711387634277344, + "grad_norm_var": 0.11776813258662025, + "learning_rate": 2e-05, + "loss": 0.4211, + "loss/crossentropy": 2.1843584775924683, + "loss/hidden": 0.1494140625, + "loss/logits": 0.023802118375897408, + "loss/reg": 0.024787776172161102, + "step": 932 + }, + { + "epoch": 0.4665, + "grad_norm": 1.4844838380813599, + "grad_norm_var": 0.08183792965829349, + "learning_rate": 2e-05, + "loss": 0.4591, + "loss/crossentropy": 2.2599565982818604, + "loss/hidden": 0.17041015625, + "loss/logits": 0.040790168568491936, + "loss/reg": 0.024785393849015236, + "step": 933 + }, + { + "epoch": 0.467, + "grad_norm": 1.6613248586654663, + "grad_norm_var": 0.08427746877438451, + "learning_rate": 2e-05, + "loss": 0.4759, + "loss/crossentropy": 2.3885433673858643, + "loss/hidden": 0.1943359375, + "loss/logits": 0.033717614598572254, + "loss/reg": 0.02478303201496601, + "step": 934 + }, + { + "epoch": 0.4675, + "grad_norm": 2.4827864170074463, + "grad_norm_var": 0.12395562095072604, + "learning_rate": 2e-05, + "loss": 0.5939, + "loss/crossentropy": 2.383415699005127, + "loss/hidden": 0.302734375, + "loss/logits": 0.04333702102303505, + "loss/reg": 0.02478056028485298, + "step": 935 + }, + { + "epoch": 0.468, + "grad_norm": 1.4659557342529297, + "grad_norm_var": 0.12124371062594505, + "learning_rate": 2e-05, + "loss": 0.485, + "loss/crossentropy": 2.1564711332321167, + "loss/hidden": 0.19921875, + "loss/logits": 0.03798619005829096, + "loss/reg": 0.024778055027127266, + "step": 936 + }, + { + "epoch": 0.4685, + "grad_norm": 1.528003454208374, + "grad_norm_var": 0.11788932977424474, + "learning_rate": 2e-05, + "loss": 0.4102, + "loss/crossentropy": 2.308253049850464, + "loss/hidden": 0.13623046875, + "loss/logits": 0.026196792721748352, + "loss/reg": 0.024775685742497444, + "step": 937 + }, + { + "epoch": 0.469, + "grad_norm": 1.1551241874694824, + "grad_norm_var": 0.11329483698475963, + "learning_rate": 2e-05, + "loss": 0.4444, + "loss/crossentropy": 2.2042760848999023, + "loss/hidden": 0.1689453125, + "loss/logits": 0.027680596336722374, + "loss/reg": 0.024773309007287025, + "step": 938 + }, + { + "epoch": 0.4695, + "grad_norm": 2.032935857772827, + "grad_norm_var": 0.1266760833029648, + "learning_rate": 2e-05, + "loss": 0.4807, + "loss/crossentropy": 2.7006815671920776, + "loss/hidden": 0.19091796875, + "loss/logits": 0.042035577818751335, + "loss/reg": 0.02477095276117325, + "step": 939 + }, + { + "epoch": 0.47, + "grad_norm": 1.141130805015564, + "grad_norm_var": 0.1321853888846779, + "learning_rate": 2e-05, + "loss": 0.4271, + "loss/crossentropy": 2.4339696168899536, + "loss/hidden": 0.15673828125, + "loss/logits": 0.02268486563116312, + "loss/reg": 0.024768613278865814, + "step": 940 + }, + { + "epoch": 0.4705, + "grad_norm": 1.7656772136688232, + "grad_norm_var": 0.13813141921850866, + "learning_rate": 2e-05, + "loss": 0.4583, + "loss/crossentropy": 2.327541947364807, + "loss/hidden": 0.1767578125, + "loss/logits": 0.03389530163258314, + "loss/reg": 0.024766255170106888, + "step": 941 + }, + { + "epoch": 0.471, + "grad_norm": 1.3216570615768433, + "grad_norm_var": 0.12771394362122404, + "learning_rate": 2e-05, + "loss": 0.4448, + "loss/crossentropy": 2.4096368551254272, + "loss/hidden": 0.1650390625, + "loss/logits": 0.03215141408145428, + "loss/reg": 0.024764133617281914, + "step": 942 + }, + { + "epoch": 0.4715, + "grad_norm": 1.3881388902664185, + "grad_norm_var": 0.12356518206842436, + "learning_rate": 2e-05, + "loss": 0.416, + "loss/crossentropy": 2.585834264755249, + "loss/hidden": 0.1416015625, + "loss/logits": 0.02676891814917326, + "loss/reg": 0.024761632084846497, + "step": 943 + }, + { + "epoch": 0.472, + "grad_norm": 1.2373863458633423, + "grad_norm_var": 0.1258009621939289, + "learning_rate": 2e-05, + "loss": 0.4321, + "loss/crossentropy": 2.3134829998016357, + "loss/hidden": 0.15576171875, + "loss/logits": 0.02871276345103979, + "loss/reg": 0.024759074673056602, + "step": 944 + }, + { + "epoch": 0.4725, + "grad_norm": 1.1650878190994263, + "grad_norm_var": 0.1324021004505103, + "learning_rate": 2e-05, + "loss": 0.4674, + "loss/crossentropy": 2.1889017820358276, + "loss/hidden": 0.19140625, + "loss/logits": 0.028469436801970005, + "loss/reg": 0.02475649118423462, + "step": 945 + }, + { + "epoch": 0.473, + "grad_norm": 3.083178997039795, + "grad_norm_var": 0.28735681279515096, + "learning_rate": 2e-05, + "loss": 0.4476, + "loss/crossentropy": 2.484034538269043, + "loss/hidden": 0.17529296875, + "loss/logits": 0.024776030331850052, + "loss/reg": 0.02475435845553875, + "step": 946 + }, + { + "epoch": 0.4735, + "grad_norm": 2.8552777767181396, + "grad_norm_var": 0.38221875854398485, + "learning_rate": 2e-05, + "loss": 0.6484, + "loss/crossentropy": 2.2809172868728638, + "loss/hidden": 0.328125, + "loss/logits": 0.07271300628781319, + "loss/reg": 0.024751881137490273, + "step": 947 + }, + { + "epoch": 0.474, + "grad_norm": 1.3637315034866333, + "grad_norm_var": 0.3713747885972831, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.368937849998474, + "loss/hidden": 0.17724609375, + "loss/logits": 0.02893682010471821, + "loss/reg": 0.02474971115589142, + "step": 948 + }, + { + "epoch": 0.4745, + "grad_norm": 1.571547269821167, + "grad_norm_var": 0.36939615340519977, + "learning_rate": 2e-05, + "loss": 0.435, + "loss/crossentropy": 2.5506834983825684, + "loss/hidden": 0.15869140625, + "loss/logits": 0.028869743458926678, + "loss/reg": 0.024747245013713837, + "step": 949 + }, + { + "epoch": 0.475, + "grad_norm": 1.5900770425796509, + "grad_norm_var": 0.37009206946137085, + "learning_rate": 2e-05, + "loss": 0.4676, + "loss/crossentropy": 2.4405782222747803, + "loss/hidden": 0.18701171875, + "loss/logits": 0.033153336495161057, + "loss/reg": 0.02474481612443924, + "step": 950 + }, + { + "epoch": 0.4755, + "grad_norm": 1.2171446084976196, + "grad_norm_var": 0.3375590343649016, + "learning_rate": 2e-05, + "loss": 0.4351, + "loss/crossentropy": 2.6292362213134766, + "loss/hidden": 0.1572265625, + "loss/logits": 0.030438624322414398, + "loss/reg": 0.02474270388484001, + "step": 951 + }, + { + "epoch": 0.476, + "grad_norm": 1.5484012365341187, + "grad_norm_var": 0.3363165658381873, + "learning_rate": 2e-05, + "loss": 0.4397, + "loss/crossentropy": 2.4467194080352783, + "loss/hidden": 0.16259765625, + "loss/logits": 0.029697156511247158, + "loss/reg": 0.02474055252969265, + "step": 952 + }, + { + "epoch": 0.4765, + "grad_norm": 1.3582558631896973, + "grad_norm_var": 0.34026256323006543, + "learning_rate": 2e-05, + "loss": 0.449, + "loss/crossentropy": 2.439231514930725, + "loss/hidden": 0.173828125, + "loss/logits": 0.027837133966386318, + "loss/reg": 0.024738363921642303, + "step": 953 + }, + { + "epoch": 0.477, + "grad_norm": 1.969158411026001, + "grad_norm_var": 0.33207128414330966, + "learning_rate": 2e-05, + "loss": 0.4533, + "loss/crossentropy": 2.452765464782715, + "loss/hidden": 0.17138671875, + "loss/logits": 0.03453033231198788, + "loss/reg": 0.024736056104302406, + "step": 954 + }, + { + "epoch": 0.4775, + "grad_norm": 1.4187953472137451, + "grad_norm_var": 0.32535599956763966, + "learning_rate": 2e-05, + "loss": 0.4805, + "loss/crossentropy": 2.3570865392684937, + "loss/hidden": 0.1953125, + "loss/logits": 0.0378948412835598, + "loss/reg": 0.024733752012252808, + "step": 955 + }, + { + "epoch": 0.478, + "grad_norm": 1.6787301301956177, + "grad_norm_var": 0.30875959889143295, + "learning_rate": 2e-05, + "loss": 0.4978, + "loss/crossentropy": 2.316788911819458, + "loss/hidden": 0.197265625, + "loss/logits": 0.0531964972615242, + "loss/reg": 0.02473163791000843, + "step": 956 + }, + { + "epoch": 0.4785, + "grad_norm": 1.398138403892517, + "grad_norm_var": 0.311938660042613, + "learning_rate": 2e-05, + "loss": 0.464, + "loss/crossentropy": 2.5198220014572144, + "loss/hidden": 0.177734375, + "loss/logits": 0.03894750215113163, + "loss/reg": 0.024729417636990547, + "step": 957 + }, + { + "epoch": 0.479, + "grad_norm": 1.1664695739746094, + "grad_norm_var": 0.3199335312784062, + "learning_rate": 2e-05, + "loss": 0.4793, + "loss/crossentropy": 2.3437399864196777, + "loss/hidden": 0.20068359375, + "loss/logits": 0.031378373503685, + "loss/reg": 0.024727249518036842, + "step": 958 + }, + { + "epoch": 0.4795, + "grad_norm": 1.1036415100097656, + "grad_norm_var": 0.33399962070790534, + "learning_rate": 2e-05, + "loss": 0.4159, + "loss/crossentropy": 2.387402892112732, + "loss/hidden": 0.14306640625, + "loss/logits": 0.025623535737395287, + "loss/reg": 0.02472485415637493, + "step": 959 + }, + { + "epoch": 0.48, + "grad_norm": 1.7331931591033936, + "grad_norm_var": 0.32487558042496256, + "learning_rate": 2e-05, + "loss": 0.5184, + "loss/crossentropy": 2.3020901679992676, + "loss/hidden": 0.220703125, + "loss/logits": 0.05047208443284035, + "loss/reg": 0.024722406640648842, + "step": 960 + }, + { + "epoch": 0.4805, + "grad_norm": 1.6267915964126587, + "grad_norm_var": 0.3090366583706251, + "learning_rate": 2e-05, + "loss": 0.4403, + "loss/crossentropy": 2.4366101026535034, + "loss/hidden": 0.15576171875, + "loss/logits": 0.03732542134821415, + "loss/reg": 0.024719906970858574, + "step": 961 + }, + { + "epoch": 0.481, + "grad_norm": 1.4028695821762085, + "grad_norm_var": 0.16836660240098808, + "learning_rate": 2e-05, + "loss": 0.4703, + "loss/crossentropy": 2.271396040916443, + "loss/hidden": 0.19287109375, + "loss/logits": 0.030256666243076324, + "loss/reg": 0.024717407301068306, + "step": 962 + }, + { + "epoch": 0.4815, + "grad_norm": 1.1878552436828613, + "grad_norm_var": 0.054751871241501014, + "learning_rate": 2e-05, + "loss": 0.4214, + "loss/crossentropy": 2.3253756761550903, + "loss/hidden": 0.14697265625, + "loss/logits": 0.02732379548251629, + "loss/reg": 0.024714868515729904, + "step": 963 + }, + { + "epoch": 0.482, + "grad_norm": 1.1242592334747314, + "grad_norm_var": 0.06135958658490489, + "learning_rate": 2e-05, + "loss": 0.4465, + "loss/crossentropy": 2.239442467689514, + "loss/hidden": 0.16650390625, + "loss/logits": 0.03284657001495361, + "loss/reg": 0.024712176993489265, + "step": 964 + }, + { + "epoch": 0.4825, + "grad_norm": 1.3463644981384277, + "grad_norm_var": 0.06068299245025669, + "learning_rate": 2e-05, + "loss": 0.4428, + "loss/crossentropy": 2.316848874092102, + "loss/hidden": 0.16357421875, + "loss/logits": 0.03217571787536144, + "loss/reg": 0.024709584191441536, + "step": 965 + }, + { + "epoch": 0.483, + "grad_norm": 1.765031099319458, + "grad_norm_var": 0.0663445679323234, + "learning_rate": 2e-05, + "loss": 0.4263, + "loss/crossentropy": 2.5315778255462646, + "loss/hidden": 0.14892578125, + "loss/logits": 0.030336866155266762, + "loss/reg": 0.024706894531846046, + "step": 966 + }, + { + "epoch": 0.4835, + "grad_norm": 1.2559092044830322, + "grad_norm_var": 0.06528498573976828, + "learning_rate": 2e-05, + "loss": 0.4667, + "loss/crossentropy": 2.440574526786804, + "loss/hidden": 0.18359375, + "loss/logits": 0.036081746220588684, + "loss/reg": 0.024704458191990852, + "step": 967 + }, + { + "epoch": 0.484, + "grad_norm": 1.1820833683013916, + "grad_norm_var": 0.06851111155043531, + "learning_rate": 2e-05, + "loss": 0.413, + "loss/crossentropy": 2.4466443061828613, + "loss/hidden": 0.14453125, + "loss/logits": 0.02141994796693325, + "loss/reg": 0.024701889604330063, + "step": 968 + }, + { + "epoch": 0.4845, + "grad_norm": 2.0894601345062256, + "grad_norm_var": 0.09592261683346047, + "learning_rate": 2e-05, + "loss": 0.4239, + "loss/crossentropy": 2.4341933727264404, + "loss/hidden": 0.156005859375, + "loss/logits": 0.02092854119837284, + "loss/reg": 0.02469906210899353, + "step": 969 + }, + { + "epoch": 0.485, + "grad_norm": 1.430828332901001, + "grad_norm_var": 0.07788717528373278, + "learning_rate": 2e-05, + "loss": 0.4388, + "loss/crossentropy": 2.356964588165283, + "loss/hidden": 0.1591796875, + "loss/logits": 0.03270021267235279, + "loss/reg": 0.02469666488468647, + "step": 970 + }, + { + "epoch": 0.4855, + "grad_norm": 1.776854395866394, + "grad_norm_var": 0.08527437507114347, + "learning_rate": 2e-05, + "loss": 0.5232, + "loss/crossentropy": 2.171905517578125, + "loss/hidden": 0.2265625, + "loss/logits": 0.049691107124090195, + "loss/reg": 0.02469424158334732, + "step": 971 + }, + { + "epoch": 0.486, + "grad_norm": 1.2763676643371582, + "grad_norm_var": 0.08335147102312987, + "learning_rate": 2e-05, + "loss": 0.4638, + "loss/crossentropy": 2.0069618225097656, + "loss/hidden": 0.1845703125, + "loss/logits": 0.032350869849324226, + "loss/reg": 0.02469182200729847, + "step": 972 + }, + { + "epoch": 0.4865, + "grad_norm": 1.5993913412094116, + "grad_norm_var": 0.08505121055133316, + "learning_rate": 2e-05, + "loss": 0.4726, + "loss/crossentropy": 2.4825299978256226, + "loss/hidden": 0.18701171875, + "loss/logits": 0.03873500041663647, + "loss/reg": 0.024689404293894768, + "step": 973 + }, + { + "epoch": 0.487, + "grad_norm": 1.3979259729385376, + "grad_norm_var": 0.07990529104096797, + "learning_rate": 2e-05, + "loss": 0.4285, + "loss/crossentropy": 2.3328219652175903, + "loss/hidden": 0.15283203125, + "loss/logits": 0.028818014077842236, + "loss/reg": 0.024686843156814575, + "step": 974 + }, + { + "epoch": 0.4875, + "grad_norm": 2.5152621269226074, + "grad_norm_var": 0.138094556758349, + "learning_rate": 2e-05, + "loss": 0.5242, + "loss/crossentropy": 2.279319643974304, + "loss/hidden": 0.23974609375, + "loss/logits": 0.037576699629426, + "loss/reg": 0.024684444069862366, + "step": 975 + }, + { + "epoch": 0.488, + "grad_norm": 1.4693434238433838, + "grad_norm_var": 0.13580396599954264, + "learning_rate": 2e-05, + "loss": 0.4357, + "loss/crossentropy": 2.2661033868789673, + "loss/hidden": 0.162109375, + "loss/logits": 0.026757996529340744, + "loss/reg": 0.024681907147169113, + "step": 976 + }, + { + "epoch": 0.4885, + "grad_norm": 2.0209670066833496, + "grad_norm_var": 0.15071162713445574, + "learning_rate": 2e-05, + "loss": 0.4782, + "loss/crossentropy": 2.4691094160079956, + "loss/hidden": 0.18505859375, + "loss/logits": 0.04630833398550749, + "loss/reg": 0.024679280817508698, + "step": 977 + }, + { + "epoch": 0.489, + "grad_norm": 1.5368741750717163, + "grad_norm_var": 0.1491596028383583, + "learning_rate": 2e-05, + "loss": 0.4838, + "loss/crossentropy": 2.272148370742798, + "loss/hidden": 0.2041015625, + "loss/logits": 0.032892788760364056, + "loss/reg": 0.02467675693333149, + "step": 978 + }, + { + "epoch": 0.4895, + "grad_norm": 1.4713010787963867, + "grad_norm_var": 0.14008166049740395, + "learning_rate": 2e-05, + "loss": 0.4665, + "loss/crossentropy": 2.202664375305176, + "loss/hidden": 0.18798828125, + "loss/logits": 0.03179653640836477, + "loss/reg": 0.024674372747540474, + "step": 979 + }, + { + "epoch": 0.49, + "grad_norm": 1.870495080947876, + "grad_norm_var": 0.12967598326321478, + "learning_rate": 2e-05, + "loss": 0.462, + "loss/crossentropy": 2.598837971687317, + "loss/hidden": 0.171875, + "loss/logits": 0.04344309400767088, + "loss/reg": 0.024671973660588264, + "step": 980 + }, + { + "epoch": 0.4905, + "grad_norm": 1.2552647590637207, + "grad_norm_var": 0.1335825488275977, + "learning_rate": 2e-05, + "loss": 0.4263, + "loss/crossentropy": 2.2683433294296265, + "loss/hidden": 0.15234375, + "loss/logits": 0.027212919667363167, + "loss/reg": 0.024669544771313667, + "step": 981 + }, + { + "epoch": 0.491, + "grad_norm": 1.5247058868408203, + "grad_norm_var": 0.13253172817718994, + "learning_rate": 2e-05, + "loss": 0.4795, + "loss/crossentropy": 2.3193823099136353, + "loss/hidden": 0.20166015625, + "loss/logits": 0.031176569871604443, + "loss/reg": 0.02466718479990959, + "step": 982 + }, + { + "epoch": 0.4915, + "grad_norm": 1.1023645401000977, + "grad_norm_var": 0.14114311646802283, + "learning_rate": 2e-05, + "loss": 0.4124, + "loss/crossentropy": 2.534896492958069, + "loss/hidden": 0.14111328125, + "loss/logits": 0.024660163559019566, + "loss/reg": 0.02466486021876335, + "step": 983 + }, + { + "epoch": 0.492, + "grad_norm": 1.2959052324295044, + "grad_norm_var": 0.13568678899981698, + "learning_rate": 2e-05, + "loss": 0.4442, + "loss/crossentropy": 2.2339383363723755, + "loss/hidden": 0.16748046875, + "loss/logits": 0.030090173706412315, + "loss/reg": 0.024662485346198082, + "step": 984 + }, + { + "epoch": 0.4925, + "grad_norm": 1.5475845336914062, + "grad_norm_var": 0.11882549883375754, + "learning_rate": 2e-05, + "loss": 0.4666, + "loss/crossentropy": 2.548925042152405, + "loss/hidden": 0.1875, + "loss/logits": 0.03245330601930618, + "loss/reg": 0.024660129100084305, + "step": 985 + }, + { + "epoch": 0.493, + "grad_norm": 1.518269658088684, + "grad_norm_var": 0.11770160652835292, + "learning_rate": 2e-05, + "loss": 0.4389, + "loss/crossentropy": 2.379398465156555, + "loss/hidden": 0.16552734375, + "loss/logits": 0.026743890717625618, + "loss/reg": 0.024657921865582466, + "step": 986 + }, + { + "epoch": 0.4935, + "grad_norm": 1.570279836654663, + "grad_norm_var": 0.11477257851482622, + "learning_rate": 2e-05, + "loss": 0.4243, + "loss/crossentropy": 2.4684702157974243, + "loss/hidden": 0.150390625, + "loss/logits": 0.027326886542141438, + "loss/reg": 0.02465582638978958, + "step": 987 + }, + { + "epoch": 0.494, + "grad_norm": 1.4916634559631348, + "grad_norm_var": 0.109505544141344, + "learning_rate": 2e-05, + "loss": 0.4294, + "loss/crossentropy": 2.4013638496398926, + "loss/hidden": 0.15673828125, + "loss/logits": 0.026141656562685966, + "loss/reg": 0.02465374581515789, + "step": 988 + }, + { + "epoch": 0.4945, + "grad_norm": 1.7180440425872803, + "grad_norm_var": 0.11078359056482606, + "learning_rate": 2e-05, + "loss": 0.5258, + "loss/crossentropy": 2.3340543508529663, + "loss/hidden": 0.22998046875, + "loss/logits": 0.049306683242321014, + "loss/reg": 0.024651547893881798, + "step": 989 + }, + { + "epoch": 0.495, + "grad_norm": 1.192015290260315, + "grad_norm_var": 0.11847738378988476, + "learning_rate": 2e-05, + "loss": 0.4235, + "loss/crossentropy": 2.4830812215805054, + "loss/hidden": 0.1513671875, + "loss/logits": 0.02562696486711502, + "loss/reg": 0.024649281054735184, + "step": 990 + }, + { + "epoch": 0.4955, + "grad_norm": 2.068011522293091, + "grad_norm_var": 0.07453697096159431, + "learning_rate": 2e-05, + "loss": 0.4613, + "loss/crossentropy": 2.6523276567459106, + "loss/hidden": 0.18115234375, + "loss/logits": 0.033670464530587196, + "loss/reg": 0.02464720420539379, + "step": 991 + }, + { + "epoch": 0.496, + "grad_norm": 1.2752137184143066, + "grad_norm_var": 0.07874241495605147, + "learning_rate": 2e-05, + "loss": 0.4984, + "loss/crossentropy": 2.0650646686553955, + "loss/hidden": 0.2119140625, + "loss/logits": 0.04002711549401283, + "loss/reg": 0.024645155295729637, + "step": 992 + }, + { + "epoch": 0.4965, + "grad_norm": 1.5579633712768555, + "grad_norm_var": 0.06175023932142167, + "learning_rate": 2e-05, + "loss": 0.4986, + "loss/crossentropy": 2.3349034786224365, + "loss/hidden": 0.197265625, + "loss/logits": 0.054928943514823914, + "loss/reg": 0.02464275248348713, + "step": 993 + }, + { + "epoch": 0.497, + "grad_norm": 1.2338091135025024, + "grad_norm_var": 0.06599051690941451, + "learning_rate": 2e-05, + "loss": 0.4429, + "loss/crossentropy": 2.4117237329483032, + "loss/hidden": 0.1650390625, + "loss/logits": 0.031464939936995506, + "loss/reg": 0.024640321731567383, + "step": 994 + }, + { + "epoch": 0.4975, + "grad_norm": 1.5982106924057007, + "grad_norm_var": 0.0668363147457848, + "learning_rate": 2e-05, + "loss": 0.4563, + "loss/crossentropy": 2.387059211730957, + "loss/hidden": 0.17236328125, + "loss/logits": 0.03751287795603275, + "loss/reg": 0.024637887254357338, + "step": 995 + }, + { + "epoch": 0.498, + "grad_norm": 1.4983510971069336, + "grad_norm_var": 0.056549508629934485, + "learning_rate": 2e-05, + "loss": 0.4948, + "loss/crossentropy": 2.186620593070984, + "loss/hidden": 0.21337890625, + "loss/logits": 0.03502054139971733, + "loss/reg": 0.024635281413793564, + "step": 996 + }, + { + "epoch": 0.4985, + "grad_norm": 1.8561230897903442, + "grad_norm_var": 0.062272768724757795, + "learning_rate": 2e-05, + "loss": 0.5759, + "loss/crossentropy": 2.4618980884552, + "loss/hidden": 0.27197265625, + "loss/logits": 0.05761981941759586, + "loss/reg": 0.024632660672068596, + "step": 997 + }, + { + "epoch": 0.499, + "grad_norm": 1.5297044515609741, + "grad_norm_var": 0.06228877530962974, + "learning_rate": 2e-05, + "loss": 0.4652, + "loss/crossentropy": 2.2573466300964355, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03383258357644081, + "loss/reg": 0.024630188941955566, + "step": 998 + }, + { + "epoch": 0.4995, + "grad_norm": 1.9509611129760742, + "grad_norm_var": 0.06192666000230999, + "learning_rate": 2e-05, + "loss": 0.4608, + "loss/crossentropy": 2.4582537412643433, + "loss/hidden": 0.18115234375, + "loss/logits": 0.03336348757147789, + "loss/reg": 0.024627676233649254, + "step": 999 + }, + { + "epoch": 0.5, + "grad_norm": 1.263331413269043, + "grad_norm_var": 0.06312427179109174, + "learning_rate": 2e-05, + "loss": 0.4127, + "loss/crossentropy": 2.5180909633636475, + "loss/hidden": 0.1435546875, + "loss/logits": 0.022916819900274277, + "loss/reg": 0.024625113233923912, + "step": 1000 + }, + { + "epoch": 0.5005, + "grad_norm": 2.315190553665161, + "grad_norm_var": 0.09925843788652339, + "learning_rate": 2e-05, + "loss": 0.5383, + "loss/crossentropy": 2.3459049463272095, + "loss/hidden": 0.244140625, + "loss/logits": 0.04792695306241512, + "loss/reg": 0.024622488766908646, + "step": 1001 + }, + { + "epoch": 0.501, + "grad_norm": 1.533280372619629, + "grad_norm_var": 0.09910429692047741, + "learning_rate": 2e-05, + "loss": 0.4648, + "loss/crossentropy": 2.3168352842330933, + "loss/hidden": 0.18896484375, + "loss/logits": 0.029631631448864937, + "loss/reg": 0.02461997978389263, + "step": 1002 + }, + { + "epoch": 0.5015, + "grad_norm": 2.5686206817626953, + "grad_norm_var": 0.1570070725080583, + "learning_rate": 2e-05, + "loss": 0.5058, + "loss/crossentropy": 2.1777498722076416, + "loss/hidden": 0.2099609375, + "loss/logits": 0.0496145635843277, + "loss/reg": 0.024617573246359825, + "step": 1003 + }, + { + "epoch": 0.502, + "grad_norm": 1.3942785263061523, + "grad_norm_var": 0.15985904345598664, + "learning_rate": 2e-05, + "loss": 0.4515, + "loss/crossentropy": 2.30439692735672, + "loss/hidden": 0.18017578125, + "loss/logits": 0.025181924924254417, + "loss/reg": 0.024615149945020676, + "step": 1004 + }, + { + "epoch": 0.5025, + "grad_norm": 1.6636312007904053, + "grad_norm_var": 0.15961985398144515, + "learning_rate": 2e-05, + "loss": 0.4799, + "loss/crossentropy": 2.387961268424988, + "loss/hidden": 0.193359375, + "loss/logits": 0.04039803333580494, + "loss/reg": 0.02461281418800354, + "step": 1005 + }, + { + "epoch": 0.503, + "grad_norm": 1.175167202949524, + "grad_norm_var": 0.16068027431229595, + "learning_rate": 2e-05, + "loss": 0.4205, + "loss/crossentropy": 2.2776483297348022, + "loss/hidden": 0.14599609375, + "loss/logits": 0.028357837349176407, + "loss/reg": 0.024610213935375214, + "step": 1006 + }, + { + "epoch": 0.5035, + "grad_norm": 1.2402100563049316, + "grad_norm_var": 0.15793593833079214, + "learning_rate": 2e-05, + "loss": 0.4417, + "loss/crossentropy": 2.4232317209243774, + "loss/hidden": 0.16845703125, + "loss/logits": 0.027130945585668087, + "loss/reg": 0.024607809260487556, + "step": 1007 + }, + { + "epoch": 0.504, + "grad_norm": 1.4888067245483398, + "grad_norm_var": 0.15144150127088754, + "learning_rate": 2e-05, + "loss": 0.4199, + "loss/crossentropy": 2.17998468875885, + "loss/hidden": 0.15234375, + "loss/logits": 0.021496030502021313, + "loss/reg": 0.02460542693734169, + "step": 1008 + }, + { + "epoch": 0.5045, + "grad_norm": 1.750985026359558, + "grad_norm_var": 0.15225772018986655, + "learning_rate": 2e-05, + "loss": 0.4642, + "loss/crossentropy": 2.200040578842163, + "loss/hidden": 0.18701171875, + "loss/logits": 0.031146997585892677, + "loss/reg": 0.02460303343832493, + "step": 1009 + }, + { + "epoch": 0.505, + "grad_norm": 1.1058796644210815, + "grad_norm_var": 0.16001790603834795, + "learning_rate": 2e-05, + "loss": 0.4448, + "loss/crossentropy": 2.094850778579712, + "loss/hidden": 0.16796875, + "loss/logits": 0.030812044627964497, + "loss/reg": 0.024600572884082794, + "step": 1010 + }, + { + "epoch": 0.5055, + "grad_norm": 1.4799710512161255, + "grad_norm_var": 0.16124775408475406, + "learning_rate": 2e-05, + "loss": 0.4294, + "loss/crossentropy": 2.5933122634887695, + "loss/hidden": 0.15185546875, + "loss/logits": 0.03153660800307989, + "loss/reg": 0.024597788229584694, + "step": 1011 + }, + { + "epoch": 0.506, + "grad_norm": 2.6447713375091553, + "grad_norm_var": 0.22580341469373647, + "learning_rate": 2e-05, + "loss": 0.4916, + "loss/crossentropy": 2.423816442489624, + "loss/hidden": 0.212890625, + "loss/logits": 0.03276214189827442, + "loss/reg": 0.024594949558377266, + "step": 1012 + }, + { + "epoch": 0.5065, + "grad_norm": 1.3123342990875244, + "grad_norm_var": 0.23188188108190289, + "learning_rate": 2e-05, + "loss": 0.4473, + "loss/crossentropy": 2.268904685974121, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03294616658240557, + "loss/reg": 0.024592256173491478, + "step": 1013 + }, + { + "epoch": 0.507, + "grad_norm": 1.6381093263626099, + "grad_norm_var": 0.23086213820556947, + "learning_rate": 2e-05, + "loss": 0.483, + "loss/crossentropy": 2.5634379386901855, + "loss/hidden": 0.1953125, + "loss/logits": 0.04177115485072136, + "loss/reg": 0.024589471518993378, + "step": 1014 + }, + { + "epoch": 0.5075, + "grad_norm": 1.9902760982513428, + "grad_norm_var": 0.2324952537472744, + "learning_rate": 2e-05, + "loss": 0.5418, + "loss/crossentropy": 2.2787784934043884, + "loss/hidden": 0.24169921875, + "loss/logits": 0.05426573008298874, + "loss/reg": 0.024587033316493034, + "step": 1015 + }, + { + "epoch": 0.508, + "grad_norm": 1.2062731981277466, + "grad_norm_var": 0.2357187944792192, + "learning_rate": 2e-05, + "loss": 0.4281, + "loss/crossentropy": 2.5111724138259888, + "loss/hidden": 0.15234375, + "loss/logits": 0.02992274332791567, + "loss/reg": 0.024584423750638962, + "step": 1016 + }, + { + "epoch": 0.5085, + "grad_norm": 1.755979299545288, + "grad_norm_var": 0.20616830501869762, + "learning_rate": 2e-05, + "loss": 0.475, + "loss/crossentropy": 2.331393003463745, + "loss/hidden": 0.19921875, + "loss/logits": 0.029994547367095947, + "loss/reg": 0.024581963196396828, + "step": 1017 + }, + { + "epoch": 0.509, + "grad_norm": 1.800107479095459, + "grad_norm_var": 0.2074693433041612, + "learning_rate": 2e-05, + "loss": 0.4413, + "loss/crossentropy": 2.3108561038970947, + "loss/hidden": 0.16650390625, + "loss/logits": 0.029039999470114708, + "loss/reg": 0.024579644203186035, + "step": 1018 + }, + { + "epoch": 0.5095, + "grad_norm": 1.8739287853240967, + "grad_norm_var": 0.15147520519188878, + "learning_rate": 2e-05, + "loss": 0.4338, + "loss/crossentropy": 2.204409599304199, + "loss/hidden": 0.16259765625, + "loss/logits": 0.025386362336575985, + "loss/reg": 0.024577105417847633, + "step": 1019 + }, + { + "epoch": 0.51, + "grad_norm": 2.2447474002838135, + "grad_norm_var": 0.17391527788569666, + "learning_rate": 2e-05, + "loss": 0.4625, + "loss/crossentropy": 2.5994725227355957, + "loss/hidden": 0.189453125, + "loss/logits": 0.0273160170763731, + "loss/reg": 0.024574514478445053, + "step": 1020 + }, + { + "epoch": 0.5105, + "grad_norm": 1.2325525283813477, + "grad_norm_var": 0.18464255921690906, + "learning_rate": 2e-05, + "loss": 0.4429, + "loss/crossentropy": 2.3073863983154297, + "loss/hidden": 0.16796875, + "loss/logits": 0.029254252091050148, + "loss/reg": 0.024571970105171204, + "step": 1021 + }, + { + "epoch": 0.511, + "grad_norm": 1.2389066219329834, + "grad_norm_var": 0.18110535153355295, + "learning_rate": 2e-05, + "loss": 0.4033, + "loss/crossentropy": 2.50894033908844, + "loss/hidden": 0.13671875, + "loss/logits": 0.020873015746474266, + "loss/reg": 0.024569377303123474, + "step": 1022 + }, + { + "epoch": 0.5115, + "grad_norm": 1.7655569314956665, + "grad_norm_var": 0.17138478636465398, + "learning_rate": 2e-05, + "loss": 0.5674, + "loss/crossentropy": 1.9970663189888, + "loss/hidden": 0.2744140625, + "loss/logits": 0.04734223149716854, + "loss/reg": 0.02456682361662388, + "step": 1023 + }, + { + "epoch": 0.512, + "grad_norm": 1.2552883625030518, + "grad_norm_var": 0.18006323532261087, + "learning_rate": 2e-05, + "loss": 0.4555, + "loss/crossentropy": 2.412826180458069, + "loss/hidden": 0.177734375, + "loss/logits": 0.03207558020949364, + "loss/reg": 0.024564214050769806, + "step": 1024 + }, + { + "epoch": 0.5125, + "grad_norm": 1.5279215574264526, + "grad_norm_var": 0.1799756513199552, + "learning_rate": 2e-05, + "loss": 0.4414, + "loss/crossentropy": 2.2349936962127686, + "loss/hidden": 0.166015625, + "loss/logits": 0.029770507477223873, + "loss/reg": 0.024561790749430656, + "step": 1025 + }, + { + "epoch": 0.513, + "grad_norm": 1.204811930656433, + "grad_norm_var": 0.17367981846485894, + "learning_rate": 2e-05, + "loss": 0.4652, + "loss/crossentropy": 2.3194793462753296, + "loss/hidden": 0.1865234375, + "loss/logits": 0.03311028238385916, + "loss/reg": 0.02455941028892994, + "step": 1026 + }, + { + "epoch": 0.5135, + "grad_norm": 1.3728058338165283, + "grad_norm_var": 0.17662305625485236, + "learning_rate": 2e-05, + "loss": 0.4484, + "loss/crossentropy": 2.6432000398635864, + "loss/hidden": 0.171875, + "loss/logits": 0.030997256748378277, + "loss/reg": 0.02455691620707512, + "step": 1027 + }, + { + "epoch": 0.514, + "grad_norm": 1.755271553993225, + "grad_norm_var": 0.10560597146194155, + "learning_rate": 2e-05, + "loss": 0.5098, + "loss/crossentropy": 2.364680051803589, + "loss/hidden": 0.23779296875, + "loss/logits": 0.026502804830670357, + "loss/reg": 0.024554504081606865, + "step": 1028 + }, + { + "epoch": 0.5145, + "grad_norm": 1.0920544862747192, + "grad_norm_var": 0.11630720334852303, + "learning_rate": 2e-05, + "loss": 0.4003, + "loss/crossentropy": 2.2761380672454834, + "loss/hidden": 0.1337890625, + "loss/logits": 0.020970601588487625, + "loss/reg": 0.024552173912525177, + "step": 1029 + }, + { + "epoch": 0.515, + "grad_norm": 6.667808532714844, + "grad_norm_var": 1.750033221105651, + "learning_rate": 2e-05, + "loss": 0.8699, + "loss/crossentropy": 2.0262590050697327, + "loss/hidden": 0.54443359375, + "loss/logits": 0.0799819864332676, + "loss/reg": 0.02454986795783043, + "step": 1030 + }, + { + "epoch": 0.5155, + "grad_norm": 1.4751737117767334, + "grad_norm_var": 1.7586317433690974, + "learning_rate": 2e-05, + "loss": 0.4788, + "loss/crossentropy": 2.3618550300598145, + "loss/hidden": 0.19677734375, + "loss/logits": 0.03654679283499718, + "loss/reg": 0.024547545239329338, + "step": 1031 + }, + { + "epoch": 0.516, + "grad_norm": 1.1540484428405762, + "grad_norm_var": 1.763227740616036, + "learning_rate": 2e-05, + "loss": 0.4253, + "loss/crossentropy": 2.479053497314453, + "loss/hidden": 0.1533203125, + "loss/logits": 0.026503758504986763, + "loss/reg": 0.024545062333345413, + "step": 1032 + }, + { + "epoch": 0.5165, + "grad_norm": 1.0233707427978516, + "grad_norm_var": 1.8048390448531781, + "learning_rate": 2e-05, + "loss": 0.4207, + "loss/crossentropy": 2.4981950521469116, + "loss/hidden": 0.14697265625, + "loss/logits": 0.028342297300696373, + "loss/reg": 0.024542683735489845, + "step": 1033 + }, + { + "epoch": 0.517, + "grad_norm": 1.6611769199371338, + "grad_norm_var": 1.8059095215172836, + "learning_rate": 2e-05, + "loss": 0.5209, + "loss/crossentropy": 2.3910595178604126, + "loss/hidden": 0.232421875, + "loss/logits": 0.04302603006362915, + "loss/reg": 0.024540260434150696, + "step": 1034 + }, + { + "epoch": 0.5175, + "grad_norm": 1.7571359872817993, + "grad_norm_var": 1.805363038051151, + "learning_rate": 2e-05, + "loss": 0.4273, + "loss/crossentropy": 2.403917074203491, + "loss/hidden": 0.15478515625, + "loss/logits": 0.02708614058792591, + "loss/reg": 0.024537930265069008, + "step": 1035 + }, + { + "epoch": 0.518, + "grad_norm": 1.377044677734375, + "grad_norm_var": 1.798280006459376, + "learning_rate": 2e-05, + "loss": 0.4691, + "loss/crossentropy": 2.070719838142395, + "loss/hidden": 0.19091796875, + "loss/logits": 0.03284468129277229, + "loss/reg": 0.02453547529876232, + "step": 1036 + }, + { + "epoch": 0.5185, + "grad_norm": 1.4872187376022339, + "grad_norm_var": 1.78569505647099, + "learning_rate": 2e-05, + "loss": 0.4843, + "loss/crossentropy": 2.3953222036361694, + "loss/hidden": 0.1982421875, + "loss/logits": 0.04068641737103462, + "loss/reg": 0.02453303523361683, + "step": 1037 + }, + { + "epoch": 0.519, + "grad_norm": 2.055389881134033, + "grad_norm_var": 1.7729751683139976, + "learning_rate": 2e-05, + "loss": 0.533, + "loss/crossentropy": 2.5933210849761963, + "loss/hidden": 0.23583984375, + "loss/logits": 0.05187349207699299, + "loss/reg": 0.024530693888664246, + "step": 1038 + }, + { + "epoch": 0.5195, + "grad_norm": 1.2173277139663696, + "grad_norm_var": 1.7935104026338773, + "learning_rate": 2e-05, + "loss": 0.4539, + "loss/crossentropy": 2.4251633882522583, + "loss/hidden": 0.17724609375, + "loss/logits": 0.03137340396642685, + "loss/reg": 0.024528371170163155, + "step": 1039 + }, + { + "epoch": 0.52, + "grad_norm": 1.4174593687057495, + "grad_norm_var": 1.7843437503956898, + "learning_rate": 2e-05, + "loss": 0.4481, + "loss/crossentropy": 2.562455415725708, + "loss/hidden": 0.16650390625, + "loss/logits": 0.036311980336904526, + "loss/reg": 0.024526001885533333, + "step": 1040 + }, + { + "epoch": 0.5205, + "grad_norm": 1.6313014030456543, + "grad_norm_var": 1.7817386417632997, + "learning_rate": 2e-05, + "loss": 0.4698, + "loss/crossentropy": 2.3674226999282837, + "loss/hidden": 0.193359375, + "loss/logits": 0.03119245171546936, + "loss/reg": 0.02452370524406433, + "step": 1041 + }, + { + "epoch": 0.521, + "grad_norm": 1.5067142248153687, + "grad_norm_var": 1.7646103614574096, + "learning_rate": 2e-05, + "loss": 0.4274, + "loss/crossentropy": 2.3603265285491943, + "loss/hidden": 0.1591796875, + "loss/logits": 0.02301643881946802, + "loss/reg": 0.024521449580788612, + "step": 1042 + }, + { + "epoch": 0.5215, + "grad_norm": 1.2168174982070923, + "grad_norm_var": 1.7748228156101766, + "learning_rate": 2e-05, + "loss": 0.4298, + "loss/crossentropy": 2.5000079870224, + "loss/hidden": 0.1572265625, + "loss/logits": 0.027365175541490316, + "loss/reg": 0.024519138038158417, + "step": 1043 + }, + { + "epoch": 0.522, + "grad_norm": 1.4697620868682861, + "grad_norm_var": 1.780895340312144, + "learning_rate": 2e-05, + "loss": 0.4368, + "loss/crossentropy": 2.4227681159973145, + "loss/hidden": 0.16455078125, + "loss/logits": 0.02705656923353672, + "loss/reg": 0.02451668120920658, + "step": 1044 + }, + { + "epoch": 0.5225, + "grad_norm": 1.3264552354812622, + "grad_norm_var": 1.7633564468147955, + "learning_rate": 2e-05, + "loss": 0.4162, + "loss/crossentropy": 2.35932993888855, + "loss/hidden": 0.1474609375, + "loss/logits": 0.023632820695638657, + "loss/reg": 0.02451416663825512, + "step": 1045 + }, + { + "epoch": 0.523, + "grad_norm": 2.0581815242767334, + "grad_norm_var": 0.08589286553724325, + "learning_rate": 2e-05, + "loss": 0.4888, + "loss/crossentropy": 2.554602861404419, + "loss/hidden": 0.19384765625, + "loss/logits": 0.04979093559086323, + "loss/reg": 0.024511631578207016, + "step": 1046 + }, + { + "epoch": 0.5235, + "grad_norm": 3.9287054538726807, + "grad_norm_var": 0.45739211083615405, + "learning_rate": 2e-05, + "loss": 0.589, + "loss/crossentropy": 2.350398898124695, + "loss/hidden": 0.302734375, + "loss/logits": 0.041149744763970375, + "loss/reg": 0.024509234353899956, + "step": 1047 + }, + { + "epoch": 0.524, + "grad_norm": 1.8000636100769043, + "grad_norm_var": 0.44135897770784704, + "learning_rate": 2e-05, + "loss": 0.4479, + "loss/crossentropy": 2.401803970336914, + "loss/hidden": 0.173828125, + "loss/logits": 0.029028436169028282, + "loss/reg": 0.024506855756044388, + "step": 1048 + }, + { + "epoch": 0.5245, + "grad_norm": 1.3291693925857544, + "grad_norm_var": 0.4202927551272635, + "learning_rate": 2e-05, + "loss": 0.481, + "loss/crossentropy": 2.194493293762207, + "loss/hidden": 0.19921875, + "loss/logits": 0.03670147806406021, + "loss/reg": 0.02450430393218994, + "step": 1049 + }, + { + "epoch": 0.525, + "grad_norm": 1.5405762195587158, + "grad_norm_var": 0.42186619050553964, + "learning_rate": 2e-05, + "loss": 0.4794, + "loss/crossentropy": 2.147883892059326, + "loss/hidden": 0.20458984375, + "loss/logits": 0.02983129769563675, + "loss/reg": 0.024501901119947433, + "step": 1050 + }, + { + "epoch": 0.5255, + "grad_norm": 1.8504911661148071, + "grad_norm_var": 0.42318484533822676, + "learning_rate": 2e-05, + "loss": 0.5108, + "loss/crossentropy": 2.149976372718811, + "loss/hidden": 0.2236328125, + "loss/logits": 0.04220755770802498, + "loss/reg": 0.0244994405657053, + "step": 1051 + }, + { + "epoch": 0.526, + "grad_norm": 1.2600603103637695, + "grad_norm_var": 0.42908996868910637, + "learning_rate": 2e-05, + "loss": 0.4532, + "loss/crossentropy": 2.4726301431655884, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03347236476838589, + "loss/reg": 0.024497076869010925, + "step": 1052 + }, + { + "epoch": 0.5265, + "grad_norm": 1.9423667192459106, + "grad_norm_var": 0.42952014360100654, + "learning_rate": 2e-05, + "loss": 0.4753, + "loss/crossentropy": 2.3846495151519775, + "loss/hidden": 0.193359375, + "loss/logits": 0.03703247010707855, + "loss/reg": 0.024494826793670654, + "step": 1053 + }, + { + "epoch": 0.527, + "grad_norm": 1.3845815658569336, + "grad_norm_var": 0.4278188958703671, + "learning_rate": 2e-05, + "loss": 0.4507, + "loss/crossentropy": 2.388631224632263, + "loss/hidden": 0.17138671875, + "loss/logits": 0.03437050245702267, + "loss/reg": 0.024492528289556503, + "step": 1054 + }, + { + "epoch": 0.5275, + "grad_norm": 1.567394733428955, + "grad_norm_var": 0.41388247279120466, + "learning_rate": 2e-05, + "loss": 0.4495, + "loss/crossentropy": 2.368739128112793, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03617890737950802, + "loss/reg": 0.024490313604474068, + "step": 1055 + }, + { + "epoch": 0.528, + "grad_norm": 2.2830867767333984, + "grad_norm_var": 0.42788727790428427, + "learning_rate": 2e-05, + "loss": 0.5231, + "loss/crossentropy": 2.152646243572235, + "loss/hidden": 0.234375, + "loss/logits": 0.043883830308914185, + "loss/reg": 0.02448788657784462, + "step": 1056 + }, + { + "epoch": 0.5285, + "grad_norm": 1.6639432907104492, + "grad_norm_var": 0.427411225536909, + "learning_rate": 2e-05, + "loss": 0.5485, + "loss/crossentropy": 2.2220189571380615, + "loss/hidden": 0.2431640625, + "loss/logits": 0.06049743480980396, + "loss/reg": 0.02448536455631256, + "step": 1057 + }, + { + "epoch": 0.529, + "grad_norm": 1.5924744606018066, + "grad_norm_var": 0.4249972603969434, + "learning_rate": 2e-05, + "loss": 0.4682, + "loss/crossentropy": 2.237455129623413, + "loss/hidden": 0.1796875, + "loss/logits": 0.04367717728018761, + "loss/reg": 0.02448287233710289, + "step": 1058 + }, + { + "epoch": 0.5295, + "grad_norm": 2.2733936309814453, + "grad_norm_var": 0.4177709041128878, + "learning_rate": 2e-05, + "loss": 0.4764, + "loss/crossentropy": 2.3304353952407837, + "loss/hidden": 0.1923828125, + "loss/logits": 0.03922894597053528, + "loss/reg": 0.02448027953505516, + "step": 1059 + }, + { + "epoch": 0.53, + "grad_norm": 1.09116530418396, + "grad_norm_var": 0.44488470791259543, + "learning_rate": 2e-05, + "loss": 0.4262, + "loss/crossentropy": 2.195927619934082, + "loss/hidden": 0.15234375, + "loss/logits": 0.029037375934422016, + "loss/reg": 0.02447788044810295, + "step": 1060 + }, + { + "epoch": 0.5305, + "grad_norm": 1.77937650680542, + "grad_norm_var": 0.42876102735321137, + "learning_rate": 2e-05, + "loss": 0.4547, + "loss/crossentropy": 2.4653072357177734, + "loss/hidden": 0.177734375, + "loss/logits": 0.0321984738111496, + "loss/reg": 0.02447550557553768, + "step": 1061 + }, + { + "epoch": 0.531, + "grad_norm": 1.846907138824463, + "grad_norm_var": 0.42523747091545416, + "learning_rate": 2e-05, + "loss": 0.5053, + "loss/crossentropy": 2.340217351913452, + "loss/hidden": 0.21435546875, + "loss/logits": 0.046232474967837334, + "loss/reg": 0.02447315864264965, + "step": 1062 + }, + { + "epoch": 0.5315, + "grad_norm": 1.659089207649231, + "grad_norm_var": 0.10931806474138266, + "learning_rate": 2e-05, + "loss": 0.4724, + "loss/crossentropy": 2.350934624671936, + "loss/hidden": 0.1865234375, + "loss/logits": 0.04117522016167641, + "loss/reg": 0.024470685049891472, + "step": 1063 + }, + { + "epoch": 0.532, + "grad_norm": 1.4331797361373901, + "grad_norm_var": 0.11180905743439092, + "learning_rate": 2e-05, + "loss": 0.4513, + "loss/crossentropy": 2.3989791870117188, + "loss/hidden": 0.17578125, + "loss/logits": 0.03085092268884182, + "loss/reg": 0.02446819841861725, + "step": 1064 + }, + { + "epoch": 0.5325, + "grad_norm": 1.4166337251663208, + "grad_norm_var": 0.10847479799077456, + "learning_rate": 2e-05, + "loss": 0.4312, + "loss/crossentropy": 2.507182240486145, + "loss/hidden": 0.15771484375, + "loss/logits": 0.02880854159593582, + "loss/reg": 0.02446584217250347, + "step": 1065 + }, + { + "epoch": 0.533, + "grad_norm": 1.7280267477035522, + "grad_norm_var": 0.10764748193198746, + "learning_rate": 2e-05, + "loss": 0.4635, + "loss/crossentropy": 2.478935956954956, + "loss/hidden": 0.1865234375, + "loss/logits": 0.03238129895180464, + "loss/reg": 0.0244633499532938, + "step": 1066 + }, + { + "epoch": 0.5335, + "grad_norm": 1.2200838327407837, + "grad_norm_var": 0.11758883412413562, + "learning_rate": 2e-05, + "loss": 0.4332, + "loss/crossentropy": 2.245327651500702, + "loss/hidden": 0.15673828125, + "loss/logits": 0.03184494376182556, + "loss/reg": 0.024460740387439728, + "step": 1067 + }, + { + "epoch": 0.534, + "grad_norm": 1.8010295629501343, + "grad_norm_var": 0.10891741560488928, + "learning_rate": 2e-05, + "loss": 0.5088, + "loss/crossentropy": 2.45032274723053, + "loss/hidden": 0.22021484375, + "loss/logits": 0.04397309757769108, + "loss/reg": 0.024458307772874832, + "step": 1068 + }, + { + "epoch": 0.5345, + "grad_norm": 3.2257442474365234, + "grad_norm_var": 0.2588636742482642, + "learning_rate": 2e-05, + "loss": 0.5897, + "loss/crossentropy": 2.250162959098816, + "loss/hidden": 0.2841796875, + "loss/logits": 0.06098415516316891, + "loss/reg": 0.024455880746245384, + "step": 1069 + }, + { + "epoch": 0.535, + "grad_norm": 1.5339529514312744, + "grad_norm_var": 0.2530226057684303, + "learning_rate": 2e-05, + "loss": 0.4264, + "loss/crossentropy": 2.2610918283462524, + "loss/hidden": 0.154296875, + "loss/logits": 0.02759288903325796, + "loss/reg": 0.024453405290842056, + "step": 1070 + }, + { + "epoch": 0.5355, + "grad_norm": 1.972740888595581, + "grad_norm_var": 0.2530325031228223, + "learning_rate": 2e-05, + "loss": 0.521, + "loss/crossentropy": 2.1504000425338745, + "loss/hidden": 0.23681640625, + "loss/logits": 0.03967903181910515, + "loss/reg": 0.02445101924240589, + "step": 1071 + }, + { + "epoch": 0.536, + "grad_norm": 1.547863245010376, + "grad_norm_var": 0.23774975509498403, + "learning_rate": 2e-05, + "loss": 0.4831, + "loss/crossentropy": 2.2970356941223145, + "loss/hidden": 0.19873046875, + "loss/logits": 0.03990238159894943, + "loss/reg": 0.024448538199067116, + "step": 1072 + }, + { + "epoch": 0.5365, + "grad_norm": 1.686294674873352, + "grad_norm_var": 0.23756444788163353, + "learning_rate": 2e-05, + "loss": 0.436, + "loss/crossentropy": 2.4303818941116333, + "loss/hidden": 0.16357421875, + "loss/logits": 0.02797577064484358, + "loss/reg": 0.024446075782179832, + "step": 1073 + }, + { + "epoch": 0.537, + "grad_norm": 2.996263027191162, + "grad_norm_var": 0.3334905820123376, + "learning_rate": 2e-05, + "loss": 0.4512, + "loss/crossentropy": 2.3784351348876953, + "loss/hidden": 0.17724609375, + "loss/logits": 0.02949346974492073, + "loss/reg": 0.02444363757967949, + "step": 1074 + }, + { + "epoch": 0.5375, + "grad_norm": 1.6075915098190308, + "grad_norm_var": 0.32145599917045425, + "learning_rate": 2e-05, + "loss": 0.4564, + "loss/crossentropy": 2.3008534908294678, + "loss/hidden": 0.1806640625, + "loss/logits": 0.03131491877138615, + "loss/reg": 0.02444116212427616, + "step": 1075 + }, + { + "epoch": 0.538, + "grad_norm": 1.6602723598480225, + "grad_norm_var": 0.28911651671163174, + "learning_rate": 2e-05, + "loss": 0.5333, + "loss/crossentropy": 2.380239248275757, + "loss/hidden": 0.24072265625, + "loss/logits": 0.04818672500550747, + "loss/reg": 0.02443861961364746, + "step": 1076 + }, + { + "epoch": 0.5385, + "grad_norm": 1.4191992282867432, + "grad_norm_var": 0.2991605248784346, + "learning_rate": 2e-05, + "loss": 0.5074, + "loss/crossentropy": 1.901290237903595, + "loss/hidden": 0.22509765625, + "loss/logits": 0.03790563438087702, + "loss/reg": 0.02443600259721279, + "step": 1077 + }, + { + "epoch": 0.539, + "grad_norm": 3.096097230911255, + "grad_norm_var": 0.4049728367226398, + "learning_rate": 2e-05, + "loss": 0.5158, + "loss/crossentropy": 2.3517009019851685, + "loss/hidden": 0.23388671875, + "loss/logits": 0.037621984258294106, + "loss/reg": 0.024433549493551254, + "step": 1078 + }, + { + "epoch": 0.5395, + "grad_norm": 2.404075860977173, + "grad_norm_var": 0.4181886829541852, + "learning_rate": 2e-05, + "loss": 0.481, + "loss/crossentropy": 2.174505352973938, + "loss/hidden": 0.19873046875, + "loss/logits": 0.03792595863342285, + "loss/reg": 0.02443109266459942, + "step": 1079 + }, + { + "epoch": 0.54, + "grad_norm": 2.0042619705200195, + "grad_norm_var": 0.40136528423351076, + "learning_rate": 2e-05, + "loss": 0.4407, + "loss/crossentropy": 2.4950019121170044, + "loss/hidden": 0.165283203125, + "loss/logits": 0.031170199625194073, + "loss/reg": 0.024428587406873703, + "step": 1080 + }, + { + "epoch": 0.5405, + "grad_norm": 1.8745791912078857, + "grad_norm_var": 0.38144694441163024, + "learning_rate": 2e-05, + "loss": 0.5041, + "loss/crossentropy": 2.4403117895126343, + "loss/hidden": 0.22119140625, + "loss/logits": 0.03862900286912918, + "loss/reg": 0.024426110088825226, + "step": 1081 + }, + { + "epoch": 0.541, + "grad_norm": 1.9030897617340088, + "grad_norm_var": 0.3773378128842729, + "learning_rate": 2e-05, + "loss": 0.5108, + "loss/crossentropy": 2.2553837299346924, + "loss/hidden": 0.22802734375, + "loss/logits": 0.03854364529252052, + "loss/reg": 0.024423446506261826, + "step": 1082 + }, + { + "epoch": 0.5415, + "grad_norm": 1.5374236106872559, + "grad_norm_var": 0.350755978913394, + "learning_rate": 2e-05, + "loss": 0.426, + "loss/crossentropy": 2.549424886703491, + "loss/hidden": 0.1533203125, + "loss/logits": 0.02845953404903412, + "loss/reg": 0.024420736357569695, + "step": 1083 + }, + { + "epoch": 0.542, + "grad_norm": 1.7184265851974487, + "grad_norm_var": 0.3535600255487106, + "learning_rate": 2e-05, + "loss": 0.4644, + "loss/crossentropy": 2.3825089931488037, + "loss/hidden": 0.1884765625, + "loss/logits": 0.031747978180646896, + "loss/reg": 0.02441803179681301, + "step": 1084 + }, + { + "epoch": 0.5425, + "grad_norm": 1.4469131231307983, + "grad_norm_var": 0.26339110279265016, + "learning_rate": 2e-05, + "loss": 0.4765, + "loss/crossentropy": 2.2089916467666626, + "loss/hidden": 0.1962890625, + "loss/logits": 0.03606886602938175, + "loss/reg": 0.02441529557108879, + "step": 1085 + }, + { + "epoch": 0.543, + "grad_norm": 1.1277079582214355, + "grad_norm_var": 0.293563715509962, + "learning_rate": 2e-05, + "loss": 0.4419, + "loss/crossentropy": 2.1233601570129395, + "loss/hidden": 0.166015625, + "loss/logits": 0.03171114809811115, + "loss/reg": 0.02441273257136345, + "step": 1086 + }, + { + "epoch": 0.5435, + "grad_norm": 1.922126054763794, + "grad_norm_var": 0.2930653944445924, + "learning_rate": 2e-05, + "loss": 0.4729, + "loss/crossentropy": 2.306940197944641, + "loss/hidden": 0.193359375, + "loss/logits": 0.035489412024617195, + "loss/reg": 0.02441009320318699, + "step": 1087 + }, + { + "epoch": 0.544, + "grad_norm": 1.4455621242523193, + "grad_norm_var": 0.29814092122534225, + "learning_rate": 2e-05, + "loss": 0.4431, + "loss/crossentropy": 2.4753963947296143, + "loss/hidden": 0.17041015625, + "loss/logits": 0.028606380335986614, + "loss/reg": 0.02440747246146202, + "step": 1088 + }, + { + "epoch": 0.5445, + "grad_norm": 1.202568769454956, + "grad_norm_var": 0.3243311065439721, + "learning_rate": 2e-05, + "loss": 0.4296, + "loss/crossentropy": 2.5858423709869385, + "loss/hidden": 0.15673828125, + "loss/logits": 0.028857764787971973, + "loss/reg": 0.024404924362897873, + "step": 1089 + }, + { + "epoch": 0.545, + "grad_norm": 1.676564335823059, + "grad_norm_var": 0.228913483216607, + "learning_rate": 2e-05, + "loss": 0.4523, + "loss/crossentropy": 2.392747402191162, + "loss/hidden": 0.18017578125, + "loss/logits": 0.028122087940573692, + "loss/reg": 0.024402471259236336, + "step": 1090 + }, + { + "epoch": 0.5455, + "grad_norm": 1.7542563676834106, + "grad_norm_var": 0.2274162683570199, + "learning_rate": 2e-05, + "loss": 0.455, + "loss/crossentropy": 2.146073818206787, + "loss/hidden": 0.18115234375, + "loss/logits": 0.02988947369158268, + "loss/reg": 0.024400051683187485, + "step": 1091 + }, + { + "epoch": 0.546, + "grad_norm": 1.9361008405685425, + "grad_norm_var": 0.22842751723861923, + "learning_rate": 2e-05, + "loss": 0.482, + "loss/crossentropy": 2.789412260055542, + "loss/hidden": 0.208740234375, + "loss/logits": 0.029331857338547707, + "loss/reg": 0.02439761720597744, + "step": 1092 + }, + { + "epoch": 0.5465, + "grad_norm": 1.5437133312225342, + "grad_norm_var": 0.2234179936427338, + "learning_rate": 2e-05, + "loss": 0.4217, + "loss/crossentropy": 2.3370405435562134, + "loss/hidden": 0.14794921875, + "loss/logits": 0.029778199270367622, + "loss/reg": 0.024395201355218887, + "step": 1093 + }, + { + "epoch": 0.547, + "grad_norm": 1.5581791400909424, + "grad_norm_var": 0.1028233910206414, + "learning_rate": 2e-05, + "loss": 0.4635, + "loss/crossentropy": 2.2354471683502197, + "loss/hidden": 0.185546875, + "loss/logits": 0.0340447872877121, + "loss/reg": 0.024392733350396156, + "step": 1094 + }, + { + "epoch": 0.5475, + "grad_norm": 1.944296956062317, + "grad_norm_var": 0.07231965473971678, + "learning_rate": 2e-05, + "loss": 0.4836, + "loss/crossentropy": 2.2027004957199097, + "loss/hidden": 0.2060546875, + "loss/logits": 0.03367648273706436, + "loss/reg": 0.024390380829572678, + "step": 1095 + }, + { + "epoch": 0.548, + "grad_norm": 1.5446677207946777, + "grad_norm_var": 0.0645622226297327, + "learning_rate": 2e-05, + "loss": 0.4729, + "loss/crossentropy": 2.347012758255005, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03708443604409695, + "loss/reg": 0.024387938901782036, + "step": 1096 + }, + { + "epoch": 0.5485, + "grad_norm": 1.898376703262329, + "grad_norm_var": 0.06536252751224628, + "learning_rate": 2e-05, + "loss": 0.4712, + "loss/crossentropy": 2.2295031547546387, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03696603327989578, + "loss/reg": 0.02438538894057274, + "step": 1097 + }, + { + "epoch": 0.549, + "grad_norm": 2.1913044452667236, + "grad_norm_var": 0.08085664370673058, + "learning_rate": 2e-05, + "loss": 0.5301, + "loss/crossentropy": 2.167301833629608, + "loss/hidden": 0.2392578125, + "loss/logits": 0.04704119265079498, + "loss/reg": 0.024382859468460083, + "step": 1098 + }, + { + "epoch": 0.5495, + "grad_norm": 1.253875970840454, + "grad_norm_var": 0.0902515637472618, + "learning_rate": 2e-05, + "loss": 0.4156, + "loss/crossentropy": 2.4487051963806152, + "loss/hidden": 0.1435546875, + "loss/logits": 0.028273213654756546, + "loss/reg": 0.024380315095186234, + "step": 1099 + }, + { + "epoch": 0.55, + "grad_norm": 1.5682874917984009, + "grad_norm_var": 0.0899961499541573, + "learning_rate": 2e-05, + "loss": 0.4756, + "loss/crossentropy": 2.167448401451111, + "loss/hidden": 0.19580078125, + "loss/logits": 0.03600800037384033, + "loss/reg": 0.024377938359975815, + "step": 1100 + }, + { + "epoch": 0.5505, + "grad_norm": 1.4174175262451172, + "grad_norm_var": 0.09075445922031561, + "learning_rate": 2e-05, + "loss": 0.4288, + "loss/crossentropy": 2.5550715923309326, + "loss/hidden": 0.1572265625, + "loss/logits": 0.02786921989172697, + "loss/reg": 0.02437533624470234, + "step": 1101 + }, + { + "epoch": 0.551, + "grad_norm": 1.3593388795852661, + "grad_norm_var": 0.07877827873621565, + "learning_rate": 2e-05, + "loss": 0.4321, + "loss/crossentropy": 2.7613970041275024, + "loss/hidden": 0.15625, + "loss/logits": 0.032078905031085014, + "loss/reg": 0.024372844025492668, + "step": 1102 + }, + { + "epoch": 0.5515, + "grad_norm": 1.4599738121032715, + "grad_norm_var": 0.07465265183359193, + "learning_rate": 2e-05, + "loss": 0.4598, + "loss/crossentropy": 2.228444457054138, + "loss/hidden": 0.19140625, + "loss/logits": 0.024730762466788292, + "loss/reg": 0.024370355531573296, + "step": 1103 + }, + { + "epoch": 0.552, + "grad_norm": 1.7234889268875122, + "grad_norm_var": 0.07339957389561243, + "learning_rate": 2e-05, + "loss": 0.4523, + "loss/crossentropy": 2.0966050028800964, + "loss/hidden": 0.179443359375, + "loss/logits": 0.029166480526328087, + "loss/reg": 0.024367934092879295, + "step": 1104 + }, + { + "epoch": 0.5525, + "grad_norm": 1.4158674478530884, + "grad_norm_var": 0.06417161394244413, + "learning_rate": 2e-05, + "loss": 0.4406, + "loss/crossentropy": 2.503642201423645, + "loss/hidden": 0.166015625, + "loss/logits": 0.030960144475102425, + "loss/reg": 0.024365652352571487, + "step": 1105 + }, + { + "epoch": 0.553, + "grad_norm": 1.1149797439575195, + "grad_norm_var": 0.08117155153877267, + "learning_rate": 2e-05, + "loss": 0.4326, + "loss/crossentropy": 2.4113690853118896, + "loss/hidden": 0.1611328125, + "loss/logits": 0.02782568149268627, + "loss/reg": 0.02436315082013607, + "step": 1106 + }, + { + "epoch": 0.5535, + "grad_norm": 1.5629899501800537, + "grad_norm_var": 0.07965819036262284, + "learning_rate": 2e-05, + "loss": 0.469, + "loss/crossentropy": 2.565138816833496, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03499189019203186, + "loss/reg": 0.02436099573969841, + "step": 1107 + }, + { + "epoch": 0.554, + "grad_norm": 1.3787304162979126, + "grad_norm_var": 0.07359921908290872, + "learning_rate": 2e-05, + "loss": 0.4357, + "loss/crossentropy": 2.3302866220474243, + "loss/hidden": 0.16015625, + "loss/logits": 0.031937687657773495, + "loss/reg": 0.024358561262488365, + "step": 1108 + }, + { + "epoch": 0.5545, + "grad_norm": 1.5682357549667358, + "grad_norm_var": 0.07358856061888677, + "learning_rate": 2e-05, + "loss": 0.4596, + "loss/crossentropy": 2.4188989400863647, + "loss/hidden": 0.18505859375, + "loss/logits": 0.031012317165732384, + "loss/reg": 0.024356119334697723, + "step": 1109 + }, + { + "epoch": 0.555, + "grad_norm": 1.7214398384094238, + "grad_norm_var": 0.0752147876360846, + "learning_rate": 2e-05, + "loss": 0.4862, + "loss/crossentropy": 1.7631773948669434, + "loss/hidden": 0.21533203125, + "loss/logits": 0.027333957143127918, + "loss/reg": 0.024353839457035065, + "step": 1110 + }, + { + "epoch": 0.5555, + "grad_norm": 1.6694806814193726, + "grad_norm_var": 0.06622747638215376, + "learning_rate": 2e-05, + "loss": 0.4914, + "loss/crossentropy": 2.0741612911224365, + "loss/hidden": 0.21142578125, + "loss/logits": 0.03643079940229654, + "loss/reg": 0.02435164712369442, + "step": 1111 + }, + { + "epoch": 0.556, + "grad_norm": 1.240812063217163, + "grad_norm_var": 0.0723367202665381, + "learning_rate": 2e-05, + "loss": 0.4424, + "loss/crossentropy": 2.454365372657776, + "loss/hidden": 0.16455078125, + "loss/logits": 0.034358324483036995, + "loss/reg": 0.024349192157387733, + "step": 1112 + }, + { + "epoch": 0.5565, + "grad_norm": 1.4514116048812866, + "grad_norm_var": 0.0631099103755652, + "learning_rate": 2e-05, + "loss": 0.4955, + "loss/crossentropy": 2.4008172750473022, + "loss/hidden": 0.2060546875, + "loss/logits": 0.04597476311028004, + "loss/reg": 0.024346981197595596, + "step": 1113 + }, + { + "epoch": 0.557, + "grad_norm": 1.3681151866912842, + "grad_norm_var": 0.030255623557352607, + "learning_rate": 2e-05, + "loss": 0.4851, + "loss/crossentropy": 2.3843729496002197, + "loss/hidden": 0.20068359375, + "loss/logits": 0.040942758321762085, + "loss/reg": 0.02434452809393406, + "step": 1114 + }, + { + "epoch": 0.5575, + "grad_norm": 1.3606462478637695, + "grad_norm_var": 0.028109850014208366, + "learning_rate": 2e-05, + "loss": 0.469, + "loss/crossentropy": 2.2223979234695435, + "loss/hidden": 0.18994140625, + "loss/logits": 0.035602279007434845, + "loss/reg": 0.024342484772205353, + "step": 1115 + }, + { + "epoch": 0.558, + "grad_norm": 1.6001321077346802, + "grad_norm_var": 0.02862738311728966, + "learning_rate": 2e-05, + "loss": 0.4578, + "loss/crossentropy": 2.2725006341934204, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03423896711319685, + "loss/reg": 0.024340493604540825, + "step": 1116 + }, + { + "epoch": 0.5585, + "grad_norm": 1.2636990547180176, + "grad_norm_var": 0.031044949777116432, + "learning_rate": 2e-05, + "loss": 0.451, + "loss/crossentropy": 2.3660874366760254, + "loss/hidden": 0.17333984375, + "loss/logits": 0.03424760699272156, + "loss/reg": 0.02433803491294384, + "step": 1117 + }, + { + "epoch": 0.559, + "grad_norm": 1.3616007566452026, + "grad_norm_var": 0.03101680909424142, + "learning_rate": 2e-05, + "loss": 0.4927, + "loss/crossentropy": 2.1571322679519653, + "loss/hidden": 0.205078125, + "loss/logits": 0.044297466054558754, + "loss/reg": 0.024335812777280807, + "step": 1118 + }, + { + "epoch": 0.5595, + "grad_norm": 2.230315685272217, + "grad_norm_var": 0.06873493913916656, + "learning_rate": 2e-05, + "loss": 0.4905, + "loss/crossentropy": 2.5267653465270996, + "loss/hidden": 0.201171875, + "loss/logits": 0.045963168144226074, + "loss/reg": 0.024333106353878975, + "step": 1119 + }, + { + "epoch": 0.56, + "grad_norm": 1.7945393323898315, + "grad_norm_var": 0.071148731844346, + "learning_rate": 2e-05, + "loss": 0.4996, + "loss/crossentropy": 2.512783646583557, + "loss/hidden": 0.2177734375, + "loss/logits": 0.03850918263196945, + "loss/reg": 0.024330556392669678, + "step": 1120 + }, + { + "epoch": 0.5605, + "grad_norm": 1.9248079061508179, + "grad_norm_var": 0.08119155521753, + "learning_rate": 2e-05, + "loss": 0.5227, + "loss/crossentropy": 2.2634390592575073, + "loss/hidden": 0.23681640625, + "loss/logits": 0.04262538440525532, + "loss/reg": 0.024327831342816353, + "step": 1121 + }, + { + "epoch": 0.561, + "grad_norm": 6.669369697570801, + "grad_norm_var": 1.695929746004041, + "learning_rate": 2e-05, + "loss": 0.9722, + "loss/crossentropy": 2.4415574073791504, + "loss/hidden": 0.544921875, + "loss/logits": 0.1840246431529522, + "loss/reg": 0.024325016885995865, + "step": 1122 + }, + { + "epoch": 0.5615, + "grad_norm": 1.315581202507019, + "grad_norm_var": 1.7103908959366814, + "learning_rate": 2e-05, + "loss": 0.4449, + "loss/crossentropy": 2.355056047439575, + "loss/hidden": 0.171875, + "loss/logits": 0.02983579970896244, + "loss/reg": 0.024322576820850372, + "step": 1123 + }, + { + "epoch": 0.562, + "grad_norm": 2.665045976638794, + "grad_norm_var": 1.729558453751204, + "learning_rate": 2e-05, + "loss": 0.5175, + "loss/crossentropy": 2.4953508377075195, + "loss/hidden": 0.2314453125, + "loss/logits": 0.042819553054869175, + "loss/reg": 0.024320153519511223, + "step": 1124 + }, + { + "epoch": 0.5625, + "grad_norm": 1.9144386053085327, + "grad_norm_var": 1.71941199935234, + "learning_rate": 2e-05, + "loss": 0.5167, + "loss/crossentropy": 2.513652205467224, + "loss/hidden": 0.21826171875, + "loss/logits": 0.0552450567483902, + "loss/reg": 0.024317733943462372, + "step": 1125 + }, + { + "epoch": 0.563, + "grad_norm": 2.0490505695343018, + "grad_norm_var": 1.715176762349163, + "learning_rate": 2e-05, + "loss": 0.4345, + "loss/crossentropy": 2.4084017276763916, + "loss/hidden": 0.1591796875, + "loss/logits": 0.032121747732162476, + "loss/reg": 0.02431519515812397, + "step": 1126 + }, + { + "epoch": 0.5635, + "grad_norm": 1.4523192644119263, + "grad_norm_var": 1.7274754574344684, + "learning_rate": 2e-05, + "loss": 0.4548, + "loss/crossentropy": 2.2959285974502563, + "loss/hidden": 0.17724609375, + "loss/logits": 0.03444240428507328, + "loss/reg": 0.0243125818669796, + "step": 1127 + }, + { + "epoch": 0.564, + "grad_norm": 1.5287126302719116, + "grad_norm_var": 1.704324322007363, + "learning_rate": 2e-05, + "loss": 0.4667, + "loss/crossentropy": 2.4687604904174805, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03319636359810829, + "loss/reg": 0.024309968575835228, + "step": 1128 + }, + { + "epoch": 0.5645, + "grad_norm": 1.7322133779525757, + "grad_norm_var": 1.6888306469406487, + "learning_rate": 2e-05, + "loss": 0.4384, + "loss/crossentropy": 2.46881103515625, + "loss/hidden": 0.16552734375, + "loss/logits": 0.029783966951072216, + "loss/reg": 0.02430731989443302, + "step": 1129 + }, + { + "epoch": 0.565, + "grad_norm": 1.6788828372955322, + "grad_norm_var": 1.6680869393205444, + "learning_rate": 2e-05, + "loss": 0.4634, + "loss/crossentropy": 2.3073936700820923, + "loss/hidden": 0.189453125, + "loss/logits": 0.030876665376126766, + "loss/reg": 0.02430490031838417, + "step": 1130 + }, + { + "epoch": 0.5655, + "grad_norm": 1.89357328414917, + "grad_norm_var": 1.638002930492539, + "learning_rate": 2e-05, + "loss": 0.4736, + "loss/crossentropy": 2.3977789878845215, + "loss/hidden": 0.19287109375, + "loss/logits": 0.037660510279238224, + "loss/reg": 0.024302346631884575, + "step": 1131 + }, + { + "epoch": 0.566, + "grad_norm": 1.9893137216567993, + "grad_norm_var": 1.6232357375180981, + "learning_rate": 2e-05, + "loss": 0.5389, + "loss/crossentropy": 2.2793599367141724, + "loss/hidden": 0.2470703125, + "loss/logits": 0.04884297959506512, + "loss/reg": 0.02429981529712677, + "step": 1132 + }, + { + "epoch": 0.5665, + "grad_norm": 1.7860678434371948, + "grad_norm_var": 1.5826367428758024, + "learning_rate": 2e-05, + "loss": 0.4758, + "loss/crossentropy": 2.448815107345581, + "loss/hidden": 0.19921875, + "loss/logits": 0.033569784834980965, + "loss/reg": 0.024297522380948067, + "step": 1133 + }, + { + "epoch": 0.567, + "grad_norm": 1.0841586589813232, + "grad_norm_var": 1.615654748481629, + "learning_rate": 2e-05, + "loss": 0.4351, + "loss/crossentropy": 2.4790775775909424, + "loss/hidden": 0.1630859375, + "loss/logits": 0.02909463830292225, + "loss/reg": 0.024295024573802948, + "step": 1134 + }, + { + "epoch": 0.5675, + "grad_norm": 1.4538543224334717, + "grad_norm_var": 1.6405455106021212, + "learning_rate": 2e-05, + "loss": 0.4484, + "loss/crossentropy": 2.262555480003357, + "loss/hidden": 0.17529296875, + "loss/logits": 0.03016512282192707, + "loss/reg": 0.024292735382914543, + "step": 1135 + }, + { + "epoch": 0.568, + "grad_norm": 1.4780546426773071, + "grad_norm_var": 1.657933535725162, + "learning_rate": 2e-05, + "loss": 0.4848, + "loss/crossentropy": 2.3877243995666504, + "loss/hidden": 0.20361328125, + "loss/logits": 0.03827337175607681, + "loss/reg": 0.024290479719638824, + "step": 1136 + }, + { + "epoch": 0.5685, + "grad_norm": 1.9372481107711792, + "grad_norm_var": 1.6577546853387217, + "learning_rate": 2e-05, + "loss": 0.5327, + "loss/crossentropy": 2.5047531127929688, + "loss/hidden": 0.24462890625, + "loss/logits": 0.045230258256196976, + "loss/reg": 0.02428818680346012, + "step": 1137 + }, + { + "epoch": 0.569, + "grad_norm": 1.5927648544311523, + "grad_norm_var": 0.13445619453283364, + "learning_rate": 2e-05, + "loss": 0.4097, + "loss/crossentropy": 2.340881109237671, + "loss/hidden": 0.14013671875, + "loss/logits": 0.026726843789219856, + "loss/reg": 0.024285737425088882, + "step": 1138 + }, + { + "epoch": 0.5695, + "grad_norm": 1.7248023748397827, + "grad_norm_var": 0.12274966628292004, + "learning_rate": 2e-05, + "loss": 0.4644, + "loss/crossentropy": 2.322643995285034, + "loss/hidden": 0.19189453125, + "loss/logits": 0.029679549857974052, + "loss/reg": 0.024283410981297493, + "step": 1139 + }, + { + "epoch": 0.57, + "grad_norm": 1.2972499132156372, + "grad_norm_var": 0.07234907048121096, + "learning_rate": 2e-05, + "loss": 0.4139, + "loss/crossentropy": 2.3335630893707275, + "loss/hidden": 0.14794921875, + "loss/logits": 0.023186037316918373, + "loss/reg": 0.024281039834022522, + "step": 1140 + }, + { + "epoch": 0.5705, + "grad_norm": 1.733296513557434, + "grad_norm_var": 0.06830394569533192, + "learning_rate": 2e-05, + "loss": 0.4458, + "loss/crossentropy": 2.41671085357666, + "loss/hidden": 0.17236328125, + "loss/logits": 0.0306707676500082, + "loss/reg": 0.024278564378619194, + "step": 1141 + }, + { + "epoch": 0.571, + "grad_norm": 1.893109679222107, + "grad_norm_var": 0.06154171256225324, + "learning_rate": 2e-05, + "loss": 0.5721, + "loss/crossentropy": 2.1943784952163696, + "loss/hidden": 0.2763671875, + "loss/logits": 0.053009962663054466, + "loss/reg": 0.02427605725824833, + "step": 1142 + }, + { + "epoch": 0.5715, + "grad_norm": 1.6730619668960571, + "grad_norm_var": 0.05903454724422227, + "learning_rate": 2e-05, + "loss": 0.5033, + "loss/crossentropy": 2.3536800146102905, + "loss/hidden": 0.21337890625, + "loss/logits": 0.047144461423158646, + "loss/reg": 0.024273628368973732, + "step": 1143 + }, + { + "epoch": 0.572, + "grad_norm": 1.3953866958618164, + "grad_norm_var": 0.06238648029036706, + "learning_rate": 2e-05, + "loss": 0.4842, + "loss/crossentropy": 2.157352328300476, + "loss/hidden": 0.20556640625, + "loss/logits": 0.03593774512410164, + "loss/reg": 0.024271195754408836, + "step": 1144 + }, + { + "epoch": 0.5725, + "grad_norm": 2.192866325378418, + "grad_norm_var": 0.0809172906121536, + "learning_rate": 2e-05, + "loss": 0.531, + "loss/crossentropy": 2.570547342300415, + "loss/hidden": 0.2392578125, + "loss/logits": 0.0490174125880003, + "loss/reg": 0.024268826469779015, + "step": 1145 + }, + { + "epoch": 0.573, + "grad_norm": 1.5369664430618286, + "grad_norm_var": 0.08210695127014726, + "learning_rate": 2e-05, + "loss": 0.4454, + "loss/crossentropy": 2.435948967933655, + "loss/hidden": 0.17431640625, + "loss/logits": 0.028402727097272873, + "loss/reg": 0.024266386404633522, + "step": 1146 + }, + { + "epoch": 0.5735, + "grad_norm": 2.309359550476074, + "grad_norm_var": 0.10550807519647355, + "learning_rate": 2e-05, + "loss": 0.4775, + "loss/crossentropy": 2.4030030965805054, + "loss/hidden": 0.19775390625, + "loss/logits": 0.03707532212138176, + "loss/reg": 0.024263978004455566, + "step": 1147 + }, + { + "epoch": 0.574, + "grad_norm": 1.347505807876587, + "grad_norm_var": 0.10584021840658688, + "learning_rate": 2e-05, + "loss": 0.442, + "loss/crossentropy": 2.398911237716675, + "loss/hidden": 0.1650390625, + "loss/logits": 0.034335775300860405, + "loss/reg": 0.02426161989569664, + "step": 1148 + }, + { + "epoch": 0.5745, + "grad_norm": 1.4021785259246826, + "grad_norm_var": 0.10820061974491917, + "learning_rate": 2e-05, + "loss": 0.4522, + "loss/crossentropy": 2.36569881439209, + "loss/hidden": 0.17822265625, + "loss/logits": 0.031414832919836044, + "loss/reg": 0.024259256199002266, + "step": 1149 + }, + { + "epoch": 0.575, + "grad_norm": 1.4795446395874023, + "grad_norm_var": 0.08928821772785417, + "learning_rate": 2e-05, + "loss": 0.4868, + "loss/crossentropy": 2.4005424976348877, + "loss/hidden": 0.20458984375, + "loss/logits": 0.03963397815823555, + "loss/reg": 0.024256786331534386, + "step": 1150 + }, + { + "epoch": 0.5755, + "grad_norm": 1.2517884969711304, + "grad_norm_var": 0.09720427256013545, + "learning_rate": 2e-05, + "loss": 0.4264, + "loss/crossentropy": 2.4282373189926147, + "loss/hidden": 0.15771484375, + "loss/logits": 0.026178008876740932, + "loss/reg": 0.02425423264503479, + "step": 1151 + }, + { + "epoch": 0.576, + "grad_norm": 1.7204208374023438, + "grad_norm_var": 0.0956317930189319, + "learning_rate": 2e-05, + "loss": 0.546, + "loss/crossentropy": 2.2745801210403442, + "loss/hidden": 0.25244140625, + "loss/logits": 0.05107201635837555, + "loss/reg": 0.024251526221632957, + "step": 1152 + }, + { + "epoch": 0.5765, + "grad_norm": 1.5777848958969116, + "grad_norm_var": 0.09020256568864984, + "learning_rate": 2e-05, + "loss": 0.5091, + "loss/crossentropy": 2.4117172956466675, + "loss/hidden": 0.2265625, + "loss/logits": 0.040081385523080826, + "loss/reg": 0.02424911968410015, + "step": 1153 + }, + { + "epoch": 0.577, + "grad_norm": 1.783171534538269, + "grad_norm_var": 0.09144687374157971, + "learning_rate": 2e-05, + "loss": 0.4786, + "loss/crossentropy": 2.454118490219116, + "loss/hidden": 0.1962890625, + "loss/logits": 0.039798869751393795, + "loss/reg": 0.024246560409665108, + "step": 1154 + }, + { + "epoch": 0.5775, + "grad_norm": 2.023660659790039, + "grad_norm_var": 0.10021283785235559, + "learning_rate": 2e-05, + "loss": 0.4482, + "loss/crossentropy": 2.292509913444519, + "loss/hidden": 0.1748046875, + "loss/logits": 0.030983050353825092, + "loss/reg": 0.0242440365254879, + "step": 1155 + }, + { + "epoch": 0.578, + "grad_norm": 1.811361312866211, + "grad_norm_var": 0.09162067235455892, + "learning_rate": 2e-05, + "loss": 0.447, + "loss/crossentropy": 2.323517084121704, + "loss/hidden": 0.171875, + "loss/logits": 0.03270300664007664, + "loss/reg": 0.024241575971245766, + "step": 1156 + }, + { + "epoch": 0.5785, + "grad_norm": 1.7037845849990845, + "grad_norm_var": 0.09152723245676046, + "learning_rate": 2e-05, + "loss": 0.4631, + "loss/crossentropy": 2.222510814666748, + "loss/hidden": 0.19140625, + "loss/logits": 0.029271118342876434, + "loss/reg": 0.024239055812358856, + "step": 1157 + }, + { + "epoch": 0.579, + "grad_norm": 1.464499592781067, + "grad_norm_var": 0.09162285800122252, + "learning_rate": 2e-05, + "loss": 0.4244, + "loss/crossentropy": 2.477970004081726, + "loss/hidden": 0.158203125, + "loss/logits": 0.023815092630684376, + "loss/reg": 0.024236636236310005, + "step": 1158 + }, + { + "epoch": 0.5795, + "grad_norm": 1.6984034776687622, + "grad_norm_var": 0.09168319422315055, + "learning_rate": 2e-05, + "loss": 0.4446, + "loss/crossentropy": 2.3342912197113037, + "loss/hidden": 0.1796875, + "loss/logits": 0.022567021660506725, + "loss/reg": 0.024234119802713394, + "step": 1159 + }, + { + "epoch": 0.58, + "grad_norm": 2.0293402671813965, + "grad_norm_var": 0.09370210145535408, + "learning_rate": 2e-05, + "loss": 0.4915, + "loss/crossentropy": 2.2998613119125366, + "loss/hidden": 0.20556640625, + "loss/logits": 0.043666526675224304, + "loss/reg": 0.024231692776083946, + "step": 1160 + }, + { + "epoch": 0.5805, + "grad_norm": 1.280202865600586, + "grad_norm_var": 0.08679439278309259, + "learning_rate": 2e-05, + "loss": 0.4539, + "loss/crossentropy": 2.2879260778427124, + "loss/hidden": 0.173828125, + "loss/logits": 0.03776852414011955, + "loss/reg": 0.024229243397712708, + "step": 1161 + }, + { + "epoch": 0.581, + "grad_norm": 1.275728464126587, + "grad_norm_var": 0.09504035923803802, + "learning_rate": 2e-05, + "loss": 0.4329, + "loss/crossentropy": 2.229547381401062, + "loss/hidden": 0.162109375, + "loss/logits": 0.028550241142511368, + "loss/reg": 0.024226877838373184, + "step": 1162 + }, + { + "epoch": 0.5815, + "grad_norm": 2.9225845336914062, + "grad_norm_var": 0.17368750923172507, + "learning_rate": 2e-05, + "loss": 0.4976, + "loss/crossentropy": 2.2666972875595093, + "loss/hidden": 0.22021484375, + "loss/logits": 0.03513254597783089, + "loss/reg": 0.024224402382969856, + "step": 1163 + }, + { + "epoch": 0.582, + "grad_norm": 1.4278773069381714, + "grad_norm_var": 0.1706005194348809, + "learning_rate": 2e-05, + "loss": 0.427, + "loss/crossentropy": 2.4950149059295654, + "loss/hidden": 0.15576171875, + "loss/logits": 0.028986497782170773, + "loss/reg": 0.024221867322921753, + "step": 1164 + }, + { + "epoch": 0.5825, + "grad_norm": 2.058894395828247, + "grad_norm_var": 0.17338003347078695, + "learning_rate": 2e-05, + "loss": 0.463, + "loss/crossentropy": 2.4152116775512695, + "loss/hidden": 0.185546875, + "loss/logits": 0.035301932133734226, + "loss/reg": 0.02421954646706581, + "step": 1165 + }, + { + "epoch": 0.583, + "grad_norm": 1.8373039960861206, + "grad_norm_var": 0.1699421495295475, + "learning_rate": 2e-05, + "loss": 0.4683, + "loss/crossentropy": 2.154408037662506, + "loss/hidden": 0.1943359375, + "loss/logits": 0.03181672282516956, + "loss/reg": 0.024217093363404274, + "step": 1166 + }, + { + "epoch": 0.5835, + "grad_norm": 1.720379114151001, + "grad_norm_var": 0.15305819839326673, + "learning_rate": 2e-05, + "loss": 0.4963, + "loss/crossentropy": 2.205121636390686, + "loss/hidden": 0.2060546875, + "loss/logits": 0.0481159882619977, + "loss/reg": 0.024214772507548332, + "step": 1167 + }, + { + "epoch": 0.584, + "grad_norm": 1.5766927003860474, + "grad_norm_var": 0.155317874758835, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.2504332065582275, + "loss/hidden": 0.17626953125, + "loss/logits": 0.03528860583901405, + "loss/reg": 0.024212457239627838, + "step": 1168 + }, + { + "epoch": 0.5845, + "grad_norm": 1.7561485767364502, + "grad_norm_var": 0.15292574466172837, + "learning_rate": 2e-05, + "loss": 0.5043, + "loss/crossentropy": 2.2984803915023804, + "loss/hidden": 0.224609375, + "loss/logits": 0.037589056417346, + "loss/reg": 0.02421003021299839, + "step": 1169 + }, + { + "epoch": 0.585, + "grad_norm": 1.8103101253509521, + "grad_norm_var": 0.15300812172836042, + "learning_rate": 2e-05, + "loss": 0.4426, + "loss/crossentropy": 2.35384738445282, + "loss/hidden": 0.17236328125, + "loss/logits": 0.028152812272310257, + "loss/reg": 0.02420770935714245, + "step": 1170 + }, + { + "epoch": 0.5855, + "grad_norm": 1.8475137948989868, + "grad_norm_var": 0.1491030967858634, + "learning_rate": 2e-05, + "loss": 0.4867, + "loss/crossentropy": 2.5606281757354736, + "loss/hidden": 0.20654296875, + "loss/logits": 0.03815155662596226, + "loss/reg": 0.024205291643738747, + "step": 1171 + }, + { + "epoch": 0.586, + "grad_norm": 1.2912832498550415, + "grad_norm_var": 0.16271106748652428, + "learning_rate": 2e-05, + "loss": 0.4484, + "loss/crossentropy": 2.3711254596710205, + "loss/hidden": 0.1728515625, + "loss/logits": 0.0335617596283555, + "loss/reg": 0.02420296147465706, + "step": 1172 + }, + { + "epoch": 0.5865, + "grad_norm": 1.606691837310791, + "grad_norm_var": 0.16365658036543582, + "learning_rate": 2e-05, + "loss": 0.4691, + "loss/crossentropy": 2.2857288122177124, + "loss/hidden": 0.18408203125, + "loss/logits": 0.04296381585299969, + "loss/reg": 0.02420070767402649, + "step": 1173 + }, + { + "epoch": 0.587, + "grad_norm": 1.4397433996200562, + "grad_norm_var": 0.16455554628546304, + "learning_rate": 2e-05, + "loss": 0.4426, + "loss/crossentropy": 2.4021177291870117, + "loss/hidden": 0.16796875, + "loss/logits": 0.03261144831776619, + "loss/reg": 0.024198230355978012, + "step": 1174 + }, + { + "epoch": 0.5875, + "grad_norm": 1.5037596225738525, + "grad_norm_var": 0.1675797787548589, + "learning_rate": 2e-05, + "loss": 0.4517, + "loss/crossentropy": 2.4898879528045654, + "loss/hidden": 0.1767578125, + "loss/logits": 0.03298352472484112, + "loss/reg": 0.024195775389671326, + "step": 1175 + }, + { + "epoch": 0.588, + "grad_norm": 1.6262377500534058, + "grad_norm_var": 0.16065407055809539, + "learning_rate": 2e-05, + "loss": 0.4587, + "loss/crossentropy": 2.2547478675842285, + "loss/hidden": 0.18115234375, + "loss/logits": 0.03557092510163784, + "loss/reg": 0.02419334463775158, + "step": 1176 + }, + { + "epoch": 0.5885, + "grad_norm": 1.2082576751708984, + "grad_norm_var": 0.16487347300328276, + "learning_rate": 2e-05, + "loss": 0.4253, + "loss/crossentropy": 2.432216763496399, + "loss/hidden": 0.15380859375, + "loss/logits": 0.029609275981783867, + "loss/reg": 0.024190889671444893, + "step": 1177 + }, + { + "epoch": 0.589, + "grad_norm": 1.4975124597549438, + "grad_norm_var": 0.1559385884208209, + "learning_rate": 2e-05, + "loss": 0.5032, + "loss/crossentropy": 2.235932469367981, + "loss/hidden": 0.203125, + "loss/logits": 0.058187903836369514, + "loss/reg": 0.024188483133912086, + "step": 1178 + }, + { + "epoch": 0.5895, + "grad_norm": 1.8868989944458008, + "grad_norm_var": 0.05355658095740689, + "learning_rate": 2e-05, + "loss": 0.4478, + "loss/crossentropy": 2.1847586631774902, + "loss/hidden": 0.17626953125, + "loss/logits": 0.029623565264046192, + "loss/reg": 0.024185974150896072, + "step": 1179 + }, + { + "epoch": 0.59, + "grad_norm": 1.7675682306289673, + "grad_norm_var": 0.05156999438171397, + "learning_rate": 2e-05, + "loss": 0.5382, + "loss/crossentropy": 2.1349618434906006, + "loss/hidden": 0.2548828125, + "loss/logits": 0.041456746868789196, + "loss/reg": 0.024183399975299835, + "step": 1180 + }, + { + "epoch": 0.5905, + "grad_norm": 1.6287496089935303, + "grad_norm_var": 0.03980901680952513, + "learning_rate": 2e-05, + "loss": 0.4349, + "loss/crossentropy": 2.344777226448059, + "loss/hidden": 0.16357421875, + "loss/logits": 0.02948729507625103, + "loss/reg": 0.024180879816412926, + "step": 1181 + }, + { + "epoch": 0.591, + "grad_norm": 1.8324546813964844, + "grad_norm_var": 0.039673420153317276, + "learning_rate": 2e-05, + "loss": 0.4473, + "loss/crossentropy": 2.274307608604431, + "loss/hidden": 0.171875, + "loss/logits": 0.03365413844585419, + "loss/reg": 0.024178462103009224, + "step": 1182 + }, + { + "epoch": 0.5915, + "grad_norm": 1.8862324953079224, + "grad_norm_var": 0.04350154335947139, + "learning_rate": 2e-05, + "loss": 0.484, + "loss/crossentropy": 2.469294786453247, + "loss/hidden": 0.20361328125, + "loss/logits": 0.03859470225870609, + "loss/reg": 0.02417594939470291, + "step": 1183 + }, + { + "epoch": 0.592, + "grad_norm": 1.8991292715072632, + "grad_norm_var": 0.04747638633534521, + "learning_rate": 2e-05, + "loss": 0.4209, + "loss/crossentropy": 2.328023672103882, + "loss/hidden": 0.15185546875, + "loss/logits": 0.027280107140541077, + "loss/reg": 0.024173393845558167, + "step": 1184 + }, + { + "epoch": 0.5925, + "grad_norm": 2.4212067127227783, + "grad_norm_var": 0.08404251009467104, + "learning_rate": 2e-05, + "loss": 0.602, + "loss/crossentropy": 2.2543612718582153, + "loss/hidden": 0.314453125, + "loss/logits": 0.04579521995037794, + "loss/reg": 0.024170896038413048, + "step": 1185 + }, + { + "epoch": 0.593, + "grad_norm": 1.5243364572525024, + "grad_norm_var": 0.08483701742637818, + "learning_rate": 2e-05, + "loss": 0.4508, + "loss/crossentropy": 2.3000658750534058, + "loss/hidden": 0.17578125, + "loss/logits": 0.03333883360028267, + "loss/reg": 0.02416837401688099, + "step": 1186 + }, + { + "epoch": 0.5935, + "grad_norm": 1.291556477546692, + "grad_norm_var": 0.09168008942992945, + "learning_rate": 2e-05, + "loss": 0.4442, + "loss/crossentropy": 2.1959608793258667, + "loss/hidden": 0.1748046875, + "loss/logits": 0.027781125158071518, + "loss/reg": 0.024165915325284004, + "step": 1187 + }, + { + "epoch": 0.594, + "grad_norm": 2.0484044551849365, + "grad_norm_var": 0.09185247402589478, + "learning_rate": 2e-05, + "loss": 0.4147, + "loss/crossentropy": 2.5615549087524414, + "loss/hidden": 0.14990234375, + "loss/logits": 0.023197302594780922, + "loss/reg": 0.02416372112929821, + "step": 1188 + }, + { + "epoch": 0.5945, + "grad_norm": 1.416138768196106, + "grad_norm_var": 0.09628413301188529, + "learning_rate": 2e-05, + "loss": 0.4125, + "loss/crossentropy": 2.387251138687134, + "loss/hidden": 0.14697265625, + "loss/logits": 0.023900354281067848, + "loss/reg": 0.02416159212589264, + "step": 1189 + }, + { + "epoch": 0.595, + "grad_norm": 1.9145230054855347, + "grad_norm_var": 0.0951705943310803, + "learning_rate": 2e-05, + "loss": 0.4606, + "loss/crossentropy": 2.314830780029297, + "loss/hidden": 0.1875, + "loss/logits": 0.03147210646420717, + "loss/reg": 0.02415909618139267, + "step": 1190 + }, + { + "epoch": 0.5955, + "grad_norm": 2.2240161895751953, + "grad_norm_var": 0.10782977301944445, + "learning_rate": 2e-05, + "loss": 0.4566, + "loss/crossentropy": 2.359019637107849, + "loss/hidden": 0.18115234375, + "loss/logits": 0.033831628039479256, + "loss/reg": 0.024156760424375534, + "step": 1191 + }, + { + "epoch": 0.596, + "grad_norm": 1.3667939901351929, + "grad_norm_var": 0.11647627127392604, + "learning_rate": 2e-05, + "loss": 0.4461, + "loss/crossentropy": 2.371762752532959, + "loss/hidden": 0.17041015625, + "loss/logits": 0.03416162542998791, + "loss/reg": 0.024154268205165863, + "step": 1192 + }, + { + "epoch": 0.5965, + "grad_norm": 1.9000991582870483, + "grad_norm_var": 0.09749187838186989, + "learning_rate": 2e-05, + "loss": 0.4426, + "loss/crossentropy": 2.3624730110168457, + "loss/hidden": 0.171875, + "loss/logits": 0.029253195971250534, + "loss/reg": 0.024151787161827087, + "step": 1193 + }, + { + "epoch": 0.597, + "grad_norm": 2.4538071155548096, + "grad_norm_var": 0.11842507530329692, + "learning_rate": 2e-05, + "loss": 0.5535, + "loss/crossentropy": 2.297299027442932, + "loss/hidden": 0.2734375, + "loss/logits": 0.0385602843016386, + "loss/reg": 0.024149475619196892, + "step": 1194 + }, + { + "epoch": 0.5975, + "grad_norm": 1.380436658859253, + "grad_norm_var": 0.13138206675488526, + "learning_rate": 2e-05, + "loss": 0.4057, + "loss/crossentropy": 2.476130962371826, + "loss/hidden": 0.13916015625, + "loss/logits": 0.025057541206479073, + "loss/reg": 0.024147171527147293, + "step": 1195 + }, + { + "epoch": 0.598, + "grad_norm": 1.3839375972747803, + "grad_norm_var": 0.14273622703758423, + "learning_rate": 2e-05, + "loss": 0.4528, + "loss/crossentropy": 2.396567940711975, + "loss/hidden": 0.18212890625, + "loss/logits": 0.02926408126950264, + "loss/reg": 0.024144427850842476, + "step": 1196 + }, + { + "epoch": 0.5985, + "grad_norm": 1.40784752368927, + "grad_norm_var": 0.150409987258331, + "learning_rate": 2e-05, + "loss": 0.4807, + "loss/crossentropy": 2.4952961206436157, + "loss/hidden": 0.1962890625, + "loss/logits": 0.04295238200575113, + "loss/reg": 0.024141840636730194, + "step": 1197 + }, + { + "epoch": 0.599, + "grad_norm": 1.2504740953445435, + "grad_norm_var": 0.16688246301015586, + "learning_rate": 2e-05, + "loss": 0.4178, + "loss/crossentropy": 2.395404577255249, + "loss/hidden": 0.1484375, + "loss/logits": 0.027952153235673904, + "loss/reg": 0.02413935586810112, + "step": 1198 + }, + { + "epoch": 0.5995, + "grad_norm": 1.3816466331481934, + "grad_norm_var": 0.17265834810284506, + "learning_rate": 2e-05, + "loss": 0.4251, + "loss/crossentropy": 2.547404170036316, + "loss/hidden": 0.154296875, + "loss/logits": 0.029406324960291386, + "loss/reg": 0.02413680963218212, + "step": 1199 + }, + { + "epoch": 0.6, + "grad_norm": 1.5290807485580444, + "grad_norm_var": 0.1715902945906383, + "learning_rate": 2e-05, + "loss": 0.4251, + "loss/crossentropy": 2.3598448038101196, + "loss/hidden": 0.15283203125, + "loss/logits": 0.030883144587278366, + "loss/reg": 0.02413429692387581, + "step": 1200 + }, + { + "epoch": 0.6005, + "grad_norm": 1.1940410137176514, + "grad_norm_var": 0.1445797734401556, + "learning_rate": 2e-05, + "loss": 0.3971, + "loss/crossentropy": 2.485979676246643, + "loss/hidden": 0.13330078125, + "loss/logits": 0.022492852061986923, + "loss/reg": 0.024131763726472855, + "step": 1201 + }, + { + "epoch": 0.601, + "grad_norm": 1.6485071182250977, + "grad_norm_var": 0.14422125485489776, + "learning_rate": 2e-05, + "loss": 0.4846, + "loss/crossentropy": 2.373944044113159, + "loss/hidden": 0.20458984375, + "loss/logits": 0.038744281977415085, + "loss/reg": 0.024129167199134827, + "step": 1202 + }, + { + "epoch": 0.6015, + "grad_norm": 1.1376245021820068, + "grad_norm_var": 0.1522781785188527, + "learning_rate": 2e-05, + "loss": 0.4531, + "loss/crossentropy": 2.4254151582717896, + "loss/hidden": 0.17529296875, + "loss/logits": 0.03653997741639614, + "loss/reg": 0.024126648902893066, + "step": 1203 + }, + { + "epoch": 0.602, + "grad_norm": 1.3931175470352173, + "grad_norm_var": 0.14014204164493524, + "learning_rate": 2e-05, + "loss": 0.4614, + "loss/crossentropy": 2.3730632066726685, + "loss/hidden": 0.18017578125, + "loss/logits": 0.0399714931845665, + "loss/reg": 0.024123938754200935, + "step": 1204 + }, + { + "epoch": 0.6025, + "grad_norm": 1.182810664176941, + "grad_norm_var": 0.14806320441701076, + "learning_rate": 2e-05, + "loss": 0.4377, + "loss/crossentropy": 2.3796987533569336, + "loss/hidden": 0.1630859375, + "loss/logits": 0.033448660746216774, + "loss/reg": 0.02412118948996067, + "step": 1205 + }, + { + "epoch": 0.603, + "grad_norm": 2.454332113265991, + "grad_norm_var": 0.19274218238629567, + "learning_rate": 2e-05, + "loss": 0.5843, + "loss/crossentropy": 2.41066837310791, + "loss/hidden": 0.2548828125, + "loss/logits": 0.08823728933930397, + "loss/reg": 0.024118369445204735, + "step": 1206 + }, + { + "epoch": 0.6035, + "grad_norm": 2.028047561645508, + "grad_norm_var": 0.17832881774557302, + "learning_rate": 2e-05, + "loss": 0.4994, + "loss/crossentropy": 2.4406535625457764, + "loss/hidden": 0.21826171875, + "loss/logits": 0.039943594485521317, + "loss/reg": 0.024115748703479767, + "step": 1207 + }, + { + "epoch": 0.604, + "grad_norm": 1.4872275590896606, + "grad_norm_var": 0.17599978463784297, + "learning_rate": 2e-05, + "loss": 0.4413, + "loss/crossentropy": 2.5775226354599, + "loss/hidden": 0.16748046875, + "loss/logits": 0.03267715871334076, + "loss/reg": 0.02411310188472271, + "step": 1208 + }, + { + "epoch": 0.6045, + "grad_norm": 1.444392204284668, + "grad_norm_var": 0.16927527117337202, + "learning_rate": 2e-05, + "loss": 0.4386, + "loss/crossentropy": 2.42952036857605, + "loss/hidden": 0.169921875, + "loss/logits": 0.02754312101751566, + "loss/reg": 0.024110691621899605, + "step": 1209 + }, + { + "epoch": 0.605, + "grad_norm": 1.3377629518508911, + "grad_norm_var": 0.11223377067241286, + "learning_rate": 2e-05, + "loss": 0.4506, + "loss/crossentropy": 2.463944435119629, + "loss/hidden": 0.17822265625, + "loss/logits": 0.031324658542871475, + "loss/reg": 0.024108313024044037, + "step": 1210 + }, + { + "epoch": 0.6055, + "grad_norm": 1.5005191564559937, + "grad_norm_var": 0.11157964006104232, + "learning_rate": 2e-05, + "loss": 0.4304, + "loss/crossentropy": 2.4867637157440186, + "loss/hidden": 0.16015625, + "loss/logits": 0.02921352256089449, + "loss/reg": 0.024105625227093697, + "step": 1211 + }, + { + "epoch": 0.606, + "grad_norm": 1.792034387588501, + "grad_norm_var": 0.1164848223260717, + "learning_rate": 2e-05, + "loss": 0.5558, + "loss/crossentropy": 2.144772946834564, + "loss/hidden": 0.25341796875, + "loss/logits": 0.0613440815359354, + "loss/reg": 0.02410317398607731, + "step": 1212 + }, + { + "epoch": 0.6065, + "grad_norm": 1.3222495317459106, + "grad_norm_var": 0.11811538585086864, + "learning_rate": 2e-05, + "loss": 0.459, + "loss/crossentropy": 2.4880030155181885, + "loss/hidden": 0.17724609375, + "loss/logits": 0.040756989270448685, + "loss/reg": 0.02410070225596428, + "step": 1213 + }, + { + "epoch": 0.607, + "grad_norm": 1.3210080862045288, + "grad_norm_var": 0.11603035562697338, + "learning_rate": 2e-05, + "loss": 0.4262, + "loss/crossentropy": 2.2644035816192627, + "loss/hidden": 0.15673828125, + "loss/logits": 0.02847316488623619, + "loss/reg": 0.024098023772239685, + "step": 1214 + }, + { + "epoch": 0.6075, + "grad_norm": 1.55643892288208, + "grad_norm_var": 0.1149566743584008, + "learning_rate": 2e-05, + "loss": 0.472, + "loss/crossentropy": 2.338138461112976, + "loss/hidden": 0.19482421875, + "loss/logits": 0.03619702160358429, + "loss/reg": 0.024095552042126656, + "step": 1215 + }, + { + "epoch": 0.608, + "grad_norm": 1.0845917463302612, + "grad_norm_var": 0.12680071206606555, + "learning_rate": 2e-05, + "loss": 0.3949, + "loss/crossentropy": 2.3922587633132935, + "loss/hidden": 0.13037109375, + "loss/logits": 0.023628353141248226, + "loss/reg": 0.02409297414124012, + "step": 1216 + }, + { + "epoch": 0.6085, + "grad_norm": 1.7885349988937378, + "grad_norm_var": 0.12520873664582974, + "learning_rate": 2e-05, + "loss": 0.5574, + "loss/crossentropy": 2.473549246788025, + "loss/hidden": 0.271484375, + "loss/logits": 0.045023126527667046, + "loss/reg": 0.024090547114610672, + "step": 1217 + }, + { + "epoch": 0.609, + "grad_norm": 1.367996096611023, + "grad_norm_var": 0.12569242606033507, + "learning_rate": 2e-05, + "loss": 0.4443, + "loss/crossentropy": 2.5233819484710693, + "loss/hidden": 0.16015625, + "loss/logits": 0.043247487396001816, + "loss/reg": 0.024087954312562943, + "step": 1218 + }, + { + "epoch": 0.6095, + "grad_norm": 2.246495485305786, + "grad_norm_var": 0.14712908643756276, + "learning_rate": 2e-05, + "loss": 0.4786, + "loss/crossentropy": 2.3473750352859497, + "loss/hidden": 0.2021484375, + "loss/logits": 0.035566676408052444, + "loss/reg": 0.024085314944386482, + "step": 1219 + }, + { + "epoch": 0.61, + "grad_norm": 1.4608843326568604, + "grad_norm_var": 0.14571195454986466, + "learning_rate": 2e-05, + "loss": 0.5275, + "loss/crossentropy": 2.1911109685897827, + "loss/hidden": 0.2412109375, + "loss/logits": 0.04546273872256279, + "loss/reg": 0.02408267930150032, + "step": 1220 + }, + { + "epoch": 0.6105, + "grad_norm": 3.359498977661133, + "grad_norm_var": 0.3248317660863883, + "learning_rate": 2e-05, + "loss": 0.5278, + "loss/crossentropy": 2.7323907613754272, + "loss/hidden": 0.2412109375, + "loss/logits": 0.04578916169703007, + "loss/reg": 0.024080097675323486, + "step": 1221 + }, + { + "epoch": 0.611, + "grad_norm": 1.410009741783142, + "grad_norm_var": 0.29102285697800256, + "learning_rate": 2e-05, + "loss": 0.442, + "loss/crossentropy": 2.259430766105652, + "loss/hidden": 0.173828125, + "loss/logits": 0.027424287050962448, + "loss/reg": 0.02407745271921158, + "step": 1222 + }, + { + "epoch": 0.6115, + "grad_norm": 1.7386364936828613, + "grad_norm_var": 0.28192935324309565, + "learning_rate": 2e-05, + "loss": 0.4869, + "loss/crossentropy": 2.1112271547317505, + "loss/hidden": 0.20556640625, + "loss/logits": 0.04061359539628029, + "loss/reg": 0.024074768647551537, + "step": 1223 + }, + { + "epoch": 0.612, + "grad_norm": 1.7512989044189453, + "grad_norm_var": 0.28095646018948417, + "learning_rate": 2e-05, + "loss": 0.4391, + "loss/crossentropy": 2.2848275899887085, + "loss/hidden": 0.16796875, + "loss/logits": 0.030420562252402306, + "loss/reg": 0.024072324857115746, + "step": 1224 + }, + { + "epoch": 0.6125, + "grad_norm": 2.1722567081451416, + "grad_norm_var": 0.2936146731009558, + "learning_rate": 2e-05, + "loss": 0.5109, + "loss/crossentropy": 2.3575836420059204, + "loss/hidden": 0.22705078125, + "loss/logits": 0.043132973834872246, + "loss/reg": 0.024069787934422493, + "step": 1225 + }, + { + "epoch": 0.613, + "grad_norm": 1.7794545888900757, + "grad_norm_var": 0.28443734408106686, + "learning_rate": 2e-05, + "loss": 0.4502, + "loss/crossentropy": 2.1393051147460938, + "loss/hidden": 0.1845703125, + "loss/logits": 0.024946999736130238, + "loss/reg": 0.024067340418696404, + "step": 1226 + }, + { + "epoch": 0.6135, + "grad_norm": 1.1268008947372437, + "grad_norm_var": 0.3045137650879551, + "learning_rate": 2e-05, + "loss": 0.4021, + "loss/crossentropy": 2.568060874938965, + "loss/hidden": 0.1376953125, + "loss/logits": 0.02373245358467102, + "loss/reg": 0.02406480722129345, + "step": 1227 + }, + { + "epoch": 0.614, + "grad_norm": 1.7132948637008667, + "grad_norm_var": 0.303986332406373, + "learning_rate": 2e-05, + "loss": 0.4984, + "loss/crossentropy": 2.4376784563064575, + "loss/hidden": 0.2138671875, + "loss/logits": 0.04392072185873985, + "loss/reg": 0.024062197655439377, + "step": 1228 + }, + { + "epoch": 0.6145, + "grad_norm": 1.6476460695266724, + "grad_norm_var": 0.29421634520029155, + "learning_rate": 2e-05, + "loss": 0.4641, + "loss/crossentropy": 2.292167067527771, + "loss/hidden": 0.1865234375, + "loss/logits": 0.03701779432594776, + "loss/reg": 0.024059604853391647, + "step": 1229 + }, + { + "epoch": 0.615, + "grad_norm": 1.1690088510513306, + "grad_norm_var": 0.3037526654890541, + "learning_rate": 2e-05, + "loss": 0.4071, + "loss/crossentropy": 2.5657061338424683, + "loss/hidden": 0.13818359375, + "loss/logits": 0.028342257253825665, + "loss/reg": 0.024057114496827126, + "step": 1230 + }, + { + "epoch": 0.6155, + "grad_norm": 1.7636140584945679, + "grad_norm_var": 0.3021712089508715, + "learning_rate": 2e-05, + "loss": 0.484, + "loss/crossentropy": 2.354526996612549, + "loss/hidden": 0.19970703125, + "loss/logits": 0.04370002634823322, + "loss/reg": 0.02405458688735962, + "step": 1231 + }, + { + "epoch": 0.616, + "grad_norm": 1.546012282371521, + "grad_norm_var": 0.2761551623079915, + "learning_rate": 2e-05, + "loss": 0.4528, + "loss/crossentropy": 2.3835500478744507, + "loss/hidden": 0.17333984375, + "loss/logits": 0.03897825721651316, + "loss/reg": 0.02405191771686077, + "step": 1232 + }, + { + "epoch": 0.6165, + "grad_norm": 2.489821434020996, + "grad_norm_var": 0.3102538412663264, + "learning_rate": 2e-05, + "loss": 0.486, + "loss/crossentropy": 2.604992389678955, + "loss/hidden": 0.20751953125, + "loss/logits": 0.037946032360196114, + "loss/reg": 0.024049216881394386, + "step": 1233 + }, + { + "epoch": 0.617, + "grad_norm": 1.2768479585647583, + "grad_norm_var": 0.31597976978417625, + "learning_rate": 2e-05, + "loss": 0.4123, + "loss/crossentropy": 2.3154995441436768, + "loss/hidden": 0.14697265625, + "loss/logits": 0.02490917406976223, + "loss/reg": 0.02404674142599106, + "step": 1234 + }, + { + "epoch": 0.6175, + "grad_norm": 4.095790386199951, + "grad_norm_var": 0.6421038174808378, + "learning_rate": 2e-05, + "loss": 0.6366, + "loss/crossentropy": 2.2328860759735107, + "loss/hidden": 0.34619140625, + "loss/logits": 0.049947988241910934, + "loss/reg": 0.024044139310717583, + "step": 1235 + }, + { + "epoch": 0.618, + "grad_norm": 1.500967264175415, + "grad_norm_var": 0.6398237315745594, + "learning_rate": 2e-05, + "loss": 0.4614, + "loss/crossentropy": 2.3703516721725464, + "loss/hidden": 0.18408203125, + "loss/logits": 0.03693939931690693, + "loss/reg": 0.02404148131608963, + "step": 1236 + }, + { + "epoch": 0.6185, + "grad_norm": 2.2723183631896973, + "grad_norm_var": 0.5034082078181905, + "learning_rate": 2e-05, + "loss": 0.5442, + "loss/crossentropy": 2.221264958381653, + "loss/hidden": 0.26318359375, + "loss/logits": 0.04067422728985548, + "loss/reg": 0.02403891831636429, + "step": 1237 + }, + { + "epoch": 0.619, + "grad_norm": 2.6256821155548096, + "grad_norm_var": 0.5259378567608592, + "learning_rate": 2e-05, + "loss": 0.4112, + "loss/crossentropy": 2.5045779943466187, + "loss/hidden": 0.14794921875, + "loss/logits": 0.022931482642889023, + "loss/reg": 0.024036424234509468, + "step": 1238 + }, + { + "epoch": 0.6195, + "grad_norm": 1.0845694541931152, + "grad_norm_var": 0.5682165874069398, + "learning_rate": 2e-05, + "loss": 0.4171, + "loss/crossentropy": 2.3470133543014526, + "loss/hidden": 0.14794921875, + "loss/logits": 0.028764693066477776, + "loss/reg": 0.024033887311816216, + "step": 1239 + }, + { + "epoch": 0.62, + "grad_norm": 1.6361056566238403, + "grad_norm_var": 0.5709606356025133, + "learning_rate": 2e-05, + "loss": 0.5352, + "loss/crossentropy": 2.156785488128662, + "loss/hidden": 0.24951171875, + "loss/logits": 0.04537991248071194, + "loss/reg": 0.024031352251768112, + "step": 1240 + }, + { + "epoch": 0.6205, + "grad_norm": 1.6204231977462769, + "grad_norm_var": 0.5676626713635791, + "learning_rate": 2e-05, + "loss": 0.4345, + "loss/crossentropy": 2.4386643171310425, + "loss/hidden": 0.162109375, + "loss/logits": 0.032054854556918144, + "loss/reg": 0.02402876876294613, + "step": 1241 + }, + { + "epoch": 0.621, + "grad_norm": 1.1746337413787842, + "grad_norm_var": 0.5949463432824259, + "learning_rate": 2e-05, + "loss": 0.4244, + "loss/crossentropy": 2.3799376487731934, + "loss/hidden": 0.154296875, + "loss/logits": 0.029872726649045944, + "loss/reg": 0.0240262970328331, + "step": 1242 + }, + { + "epoch": 0.6215, + "grad_norm": 1.6356711387634277, + "grad_norm_var": 0.5656939566181675, + "learning_rate": 2e-05, + "loss": 0.4244, + "loss/crossentropy": 2.3845585584640503, + "loss/hidden": 0.15380859375, + "loss/logits": 0.030385269783437252, + "loss/reg": 0.02402365952730179, + "step": 1243 + }, + { + "epoch": 0.622, + "grad_norm": 2.1370480060577393, + "grad_norm_var": 0.5704204269581301, + "learning_rate": 2e-05, + "loss": 0.5108, + "loss/crossentropy": 2.528768539428711, + "loss/hidden": 0.22216796875, + "loss/logits": 0.048394979909062386, + "loss/reg": 0.024021117016673088, + "step": 1244 + }, + { + "epoch": 0.6225, + "grad_norm": 2.0357038974761963, + "grad_norm_var": 0.569115940961103, + "learning_rate": 2e-05, + "loss": 0.4838, + "loss/crossentropy": 2.294468402862549, + "loss/hidden": 0.20751953125, + "loss/logits": 0.03613162599503994, + "loss/reg": 0.024018656462430954, + "step": 1245 + }, + { + "epoch": 0.623, + "grad_norm": 1.5724185705184937, + "grad_norm_var": 0.5410974439999165, + "learning_rate": 2e-05, + "loss": 0.4462, + "loss/crossentropy": 2.4699219465255737, + "loss/hidden": 0.1728515625, + "loss/logits": 0.033166331239044666, + "loss/reg": 0.02401614561676979, + "step": 1246 + }, + { + "epoch": 0.6235, + "grad_norm": 1.199653148651123, + "grad_norm_var": 0.5715490275335109, + "learning_rate": 2e-05, + "loss": 0.4286, + "loss/crossentropy": 2.542737126350403, + "loss/hidden": 0.158203125, + "loss/logits": 0.03024892695248127, + "loss/reg": 0.024013692513108253, + "step": 1247 + }, + { + "epoch": 0.624, + "grad_norm": 1.2950655221939087, + "grad_norm_var": 0.5862912521386784, + "learning_rate": 2e-05, + "loss": 0.4532, + "loss/crossentropy": 2.0858335494995117, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03286417946219444, + "loss/reg": 0.024011155590415, + "step": 1248 + }, + { + "epoch": 0.6245, + "grad_norm": 1.3316272497177124, + "grad_norm_var": 0.5718334494050034, + "learning_rate": 2e-05, + "loss": 0.4137, + "loss/crossentropy": 2.394113779067993, + "loss/hidden": 0.146484375, + "loss/logits": 0.027119265869259834, + "loss/reg": 0.02400875836610794, + "step": 1249 + }, + { + "epoch": 0.625, + "grad_norm": 1.247865915298462, + "grad_norm_var": 0.5738337715934628, + "learning_rate": 2e-05, + "loss": 0.4395, + "loss/crossentropy": 2.2250888347625732, + "loss/hidden": 0.16845703125, + "loss/logits": 0.030949266627430916, + "loss/reg": 0.024006184190511703, + "step": 1250 + }, + { + "epoch": 0.6255, + "grad_norm": 1.4751828908920288, + "grad_norm_var": 0.1935716929082652, + "learning_rate": 2e-05, + "loss": 0.4572, + "loss/crossentropy": 2.395260810852051, + "loss/hidden": 0.18212890625, + "loss/logits": 0.03504170663654804, + "loss/reg": 0.024003824219107628, + "step": 1251 + }, + { + "epoch": 0.626, + "grad_norm": 1.179787516593933, + "grad_norm_var": 0.20491551538845407, + "learning_rate": 2e-05, + "loss": 0.3969, + "loss/crossentropy": 2.4494906663894653, + "loss/hidden": 0.134765625, + "loss/logits": 0.022070709615945816, + "loss/reg": 0.02400144934654236, + "step": 1252 + }, + { + "epoch": 0.6265, + "grad_norm": 1.586988925933838, + "grad_norm_var": 0.17240025072078843, + "learning_rate": 2e-05, + "loss": 0.4791, + "loss/crossentropy": 2.367736339569092, + "loss/hidden": 0.20263671875, + "loss/logits": 0.036459170281887054, + "loss/reg": 0.023999080061912537, + "step": 1253 + }, + { + "epoch": 0.627, + "grad_norm": 1.408430576324463, + "grad_norm_var": 0.0908129483057038, + "learning_rate": 2e-05, + "loss": 0.4469, + "loss/crossentropy": 2.5444475412368774, + "loss/hidden": 0.17578125, + "loss/logits": 0.03117147646844387, + "loss/reg": 0.023996589705348015, + "step": 1254 + }, + { + "epoch": 0.6275, + "grad_norm": 1.3537817001342773, + "grad_norm_var": 0.08128065351453409, + "learning_rate": 2e-05, + "loss": 0.4433, + "loss/crossentropy": 2.5234625339508057, + "loss/hidden": 0.1748046875, + "loss/logits": 0.028503548353910446, + "loss/reg": 0.023994173854589462, + "step": 1255 + }, + { + "epoch": 0.628, + "grad_norm": 1.5888077020645142, + "grad_norm_var": 0.08051893249327732, + "learning_rate": 2e-05, + "loss": 0.5085, + "loss/crossentropy": 2.2406824827194214, + "loss/hidden": 0.22509765625, + "loss/logits": 0.043473441153764725, + "loss/reg": 0.023991703987121582, + "step": 1256 + }, + { + "epoch": 0.6285, + "grad_norm": 1.1863782405853271, + "grad_norm_var": 0.08475685961338304, + "learning_rate": 2e-05, + "loss": 0.4252, + "loss/crossentropy": 2.3977235555648804, + "loss/hidden": 0.16015625, + "loss/logits": 0.025112398900091648, + "loss/reg": 0.023989345878362656, + "step": 1257 + }, + { + "epoch": 0.629, + "grad_norm": 2.7917957305908203, + "grad_norm_var": 0.18601559285113065, + "learning_rate": 2e-05, + "loss": 0.4964, + "loss/crossentropy": 2.582550048828125, + "loss/hidden": 0.20849609375, + "loss/logits": 0.0480042677372694, + "loss/reg": 0.023987185209989548, + "step": 1258 + }, + { + "epoch": 0.6295, + "grad_norm": 1.227421760559082, + "grad_norm_var": 0.1925385294557105, + "learning_rate": 2e-05, + "loss": 0.4472, + "loss/crossentropy": 2.3612314462661743, + "loss/hidden": 0.1728515625, + "loss/logits": 0.034451963379979134, + "loss/reg": 0.023985007777810097, + "step": 1259 + }, + { + "epoch": 0.63, + "grad_norm": 1.3271315097808838, + "grad_norm_var": 0.1689130153916018, + "learning_rate": 2e-05, + "loss": 0.4098, + "loss/crossentropy": 2.2717262506484985, + "loss/hidden": 0.146484375, + "loss/logits": 0.023538900539278984, + "loss/reg": 0.023982524871826172, + "step": 1260 + }, + { + "epoch": 0.6305, + "grad_norm": 1.5784635543823242, + "grad_norm_var": 0.1485889910484635, + "learning_rate": 2e-05, + "loss": 0.4908, + "loss/crossentropy": 2.5256478786468506, + "loss/hidden": 0.21044921875, + "loss/logits": 0.040558042004704475, + "loss/reg": 0.02397996559739113, + "step": 1261 + }, + { + "epoch": 0.631, + "grad_norm": 1.4419437646865845, + "grad_norm_var": 0.14768726273588845, + "learning_rate": 2e-05, + "loss": 0.453, + "loss/crossentropy": 2.2362372875213623, + "loss/hidden": 0.18310546875, + "loss/logits": 0.030156176537275314, + "loss/reg": 0.02397749572992325, + "step": 1262 + }, + { + "epoch": 0.6315, + "grad_norm": 1.3559249639511108, + "grad_norm_var": 0.14397081070207676, + "learning_rate": 2e-05, + "loss": 0.4499, + "loss/crossentropy": 2.6084084510803223, + "loss/hidden": 0.1767578125, + "loss/logits": 0.03335867449641228, + "loss/reg": 0.023975025862455368, + "step": 1263 + }, + { + "epoch": 0.632, + "grad_norm": 1.8681645393371582, + "grad_norm_var": 0.1518160274302981, + "learning_rate": 2e-05, + "loss": 0.4988, + "loss/crossentropy": 2.302277684211731, + "loss/hidden": 0.2177734375, + "loss/logits": 0.04134911857545376, + "loss/reg": 0.023972423747181892, + "step": 1264 + }, + { + "epoch": 0.6325, + "grad_norm": 1.2972159385681152, + "grad_norm_var": 0.15264813462290375, + "learning_rate": 2e-05, + "loss": 0.4542, + "loss/crossentropy": 2.126034438610077, + "loss/hidden": 0.1806640625, + "loss/logits": 0.03383249044418335, + "loss/reg": 0.02396974340081215, + "step": 1265 + }, + { + "epoch": 0.633, + "grad_norm": 1.1746324300765991, + "grad_norm_var": 0.1553935858025509, + "learning_rate": 2e-05, + "loss": 0.4072, + "loss/crossentropy": 2.2992480993270874, + "loss/hidden": 0.14208984375, + "loss/logits": 0.025420350953936577, + "loss/reg": 0.023967038840055466, + "step": 1266 + }, + { + "epoch": 0.6335, + "grad_norm": 1.0678731203079224, + "grad_norm_var": 0.16657406511629347, + "learning_rate": 2e-05, + "loss": 0.4074, + "loss/crossentropy": 2.2471213340759277, + "loss/hidden": 0.14208984375, + "loss/logits": 0.025705378502607346, + "loss/reg": 0.02396426908671856, + "step": 1267 + }, + { + "epoch": 0.634, + "grad_norm": 2.6190547943115234, + "grad_norm_var": 0.24137234026006044, + "learning_rate": 2e-05, + "loss": 0.5394, + "loss/crossentropy": 2.6888530254364014, + "loss/hidden": 0.24853515625, + "loss/logits": 0.05120135098695755, + "loss/reg": 0.023961780592799187, + "step": 1268 + }, + { + "epoch": 0.6345, + "grad_norm": 1.5800697803497314, + "grad_norm_var": 0.24134547552578448, + "learning_rate": 2e-05, + "loss": 0.4279, + "loss/crossentropy": 2.289852738380432, + "loss/hidden": 0.15966796875, + "loss/logits": 0.028674802742898464, + "loss/reg": 0.023959312587976456, + "step": 1269 + }, + { + "epoch": 0.635, + "grad_norm": 1.474374532699585, + "grad_norm_var": 0.240335642083797, + "learning_rate": 2e-05, + "loss": 0.4526, + "loss/crossentropy": 2.3954397439956665, + "loss/hidden": 0.1826171875, + "loss/logits": 0.030411606654524803, + "loss/reg": 0.023956701159477234, + "step": 1270 + }, + { + "epoch": 0.6355, + "grad_norm": 1.6741186380386353, + "grad_norm_var": 0.2380131997486006, + "learning_rate": 2e-05, + "loss": 0.5338, + "loss/crossentropy": 2.3165799379348755, + "loss/hidden": 0.2578125, + "loss/logits": 0.03643801715224981, + "loss/reg": 0.023954056203365326, + "step": 1271 + }, + { + "epoch": 0.636, + "grad_norm": 1.4911454916000366, + "grad_norm_var": 0.23847295627966883, + "learning_rate": 2e-05, + "loss": 0.4419, + "loss/crossentropy": 2.3574637174606323, + "loss/hidden": 0.17431640625, + "loss/logits": 0.028055937960743904, + "loss/reg": 0.023951426148414612, + "step": 1272 + }, + { + "epoch": 0.6365, + "grad_norm": 1.3399804830551147, + "grad_norm_var": 0.23204516308295423, + "learning_rate": 2e-05, + "loss": 0.4296, + "loss/crossentropy": 2.2917098999023438, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02510044164955616, + "loss/reg": 0.02394864708185196, + "step": 1273 + }, + { + "epoch": 0.637, + "grad_norm": 1.1677303314208984, + "grad_norm_var": 0.13488639573788433, + "learning_rate": 2e-05, + "loss": 0.4029, + "loss/crossentropy": 2.272668480873108, + "loss/hidden": 0.143798828125, + "loss/logits": 0.019617602229118347, + "loss/reg": 0.023945819586515427, + "step": 1274 + }, + { + "epoch": 0.6375, + "grad_norm": 1.7385436296463013, + "grad_norm_var": 0.13397879899451534, + "learning_rate": 2e-05, + "loss": 0.5288, + "loss/crossentropy": 2.3414204120635986, + "loss/hidden": 0.2470703125, + "loss/logits": 0.04233134910464287, + "loss/reg": 0.023943088948726654, + "step": 1275 + }, + { + "epoch": 0.638, + "grad_norm": 0.977463960647583, + "grad_norm_var": 0.15025223921002726, + "learning_rate": 2e-05, + "loss": 0.403, + "loss/crossentropy": 2.4587652683258057, + "loss/hidden": 0.13916015625, + "loss/logits": 0.02442883513867855, + "loss/reg": 0.023940427228808403, + "step": 1276 + }, + { + "epoch": 0.6385, + "grad_norm": 2.247265577316284, + "grad_norm_var": 0.18605952102636442, + "learning_rate": 2e-05, + "loss": 0.4763, + "loss/crossentropy": 2.393397808074951, + "loss/hidden": 0.20703125, + "loss/logits": 0.029887165874242783, + "loss/reg": 0.02393791824579239, + "step": 1277 + }, + { + "epoch": 0.639, + "grad_norm": 1.2421715259552002, + "grad_norm_var": 0.19095842498212073, + "learning_rate": 2e-05, + "loss": 0.4199, + "loss/crossentropy": 2.3398871421813965, + "loss/hidden": 0.1494140625, + "loss/logits": 0.031092578545212746, + "loss/reg": 0.023935388773679733, + "step": 1278 + }, + { + "epoch": 0.6395, + "grad_norm": 1.630374550819397, + "grad_norm_var": 0.1896718089495029, + "learning_rate": 2e-05, + "loss": 0.4178, + "loss/crossentropy": 2.3470832109451294, + "loss/hidden": 0.15234375, + "loss/logits": 0.026134072802960873, + "loss/reg": 0.023932764306664467, + "step": 1279 + }, + { + "epoch": 0.64, + "grad_norm": 1.426863670349121, + "grad_norm_var": 0.1823510070964786, + "learning_rate": 2e-05, + "loss": 0.4318, + "loss/crossentropy": 2.281801223754883, + "loss/hidden": 0.16259765625, + "loss/logits": 0.029915660619735718, + "loss/reg": 0.023930255323648453, + "step": 1280 + }, + { + "epoch": 0.6405, + "grad_norm": 1.5419303178787231, + "grad_norm_var": 0.17917366497501538, + "learning_rate": 2e-05, + "loss": 0.4594, + "loss/crossentropy": 2.4495433568954468, + "loss/hidden": 0.18359375, + "loss/logits": 0.036524929106235504, + "loss/reg": 0.023927820846438408, + "step": 1281 + }, + { + "epoch": 0.641, + "grad_norm": 2.1934878826141357, + "grad_norm_var": 0.19651069564279305, + "learning_rate": 2e-05, + "loss": 0.5702, + "loss/crossentropy": 2.097387194633484, + "loss/hidden": 0.2783203125, + "loss/logits": 0.052668359130620956, + "loss/reg": 0.02392534911632538, + "step": 1282 + }, + { + "epoch": 0.6415, + "grad_norm": 1.5347496271133423, + "grad_norm_var": 0.17773874075004978, + "learning_rate": 2e-05, + "loss": 0.4635, + "loss/crossentropy": 2.2910990715026855, + "loss/hidden": 0.19140625, + "loss/logits": 0.032878163270652294, + "loss/reg": 0.02392282895743847, + "step": 1283 + }, + { + "epoch": 0.642, + "grad_norm": 1.4668471813201904, + "grad_norm_var": 0.10683961538937149, + "learning_rate": 2e-05, + "loss": 0.5058, + "loss/crossentropy": 2.1385812759399414, + "loss/hidden": 0.220703125, + "loss/logits": 0.04593625292181969, + "loss/reg": 0.023920193314552307, + "step": 1284 + }, + { + "epoch": 0.6425, + "grad_norm": 2.6525700092315674, + "grad_norm_var": 0.1836820315419103, + "learning_rate": 2e-05, + "loss": 0.4432, + "loss/crossentropy": 2.503835439682007, + "loss/hidden": 0.173828125, + "loss/logits": 0.030207850970327854, + "loss/reg": 0.02391754277050495, + "step": 1285 + }, + { + "epoch": 0.643, + "grad_norm": 1.9956448078155518, + "grad_norm_var": 0.19106626883691427, + "learning_rate": 2e-05, + "loss": 0.4742, + "loss/crossentropy": 2.4542654752731323, + "loss/hidden": 0.1962890625, + "loss/logits": 0.038763463497161865, + "loss/reg": 0.023914897814393044, + "step": 1286 + }, + { + "epoch": 0.6435, + "grad_norm": 1.4315065145492554, + "grad_norm_var": 0.19380491573572362, + "learning_rate": 2e-05, + "loss": 0.4204, + "loss/crossentropy": 2.544227123260498, + "loss/hidden": 0.15234375, + "loss/logits": 0.02888611890375614, + "loss/reg": 0.023912400007247925, + "step": 1287 + }, + { + "epoch": 0.644, + "grad_norm": 1.5084730386734009, + "grad_norm_var": 0.19350312891037896, + "learning_rate": 2e-05, + "loss": 0.423, + "loss/crossentropy": 2.5178849697113037, + "loss/hidden": 0.15478515625, + "loss/logits": 0.029108996503055096, + "loss/reg": 0.023909782990813255, + "step": 1288 + }, + { + "epoch": 0.6445, + "grad_norm": 1.9239797592163086, + "grad_norm_var": 0.19216031597426284, + "learning_rate": 2e-05, + "loss": 0.4724, + "loss/crossentropy": 2.4222670793533325, + "loss/hidden": 0.2021484375, + "loss/logits": 0.031173129566013813, + "loss/reg": 0.023907171562314034, + "step": 1289 + }, + { + "epoch": 0.645, + "grad_norm": 1.1635066270828247, + "grad_norm_var": 0.19244286753470394, + "learning_rate": 2e-05, + "loss": 0.4123, + "loss/crossentropy": 2.2675434350967407, + "loss/hidden": 0.146484375, + "loss/logits": 0.026736157946288586, + "loss/reg": 0.023904629051685333, + "step": 1290 + }, + { + "epoch": 0.6455, + "grad_norm": 1.2279921770095825, + "grad_norm_var": 0.20387843935832747, + "learning_rate": 2e-05, + "loss": 0.4294, + "loss/crossentropy": 2.409374237060547, + "loss/hidden": 0.1591796875, + "loss/logits": 0.031166162341833115, + "loss/reg": 0.02390221692621708, + "step": 1291 + }, + { + "epoch": 0.646, + "grad_norm": 1.5634331703186035, + "grad_norm_var": 0.17394207919523214, + "learning_rate": 2e-05, + "loss": 0.4418, + "loss/crossentropy": 2.3470332622528076, + "loss/hidden": 0.16943359375, + "loss/logits": 0.033411881886422634, + "loss/reg": 0.023899724707007408, + "step": 1292 + }, + { + "epoch": 0.6465, + "grad_norm": 1.4021070003509521, + "grad_norm_var": 0.15375149805387778, + "learning_rate": 2e-05, + "loss": 0.4517, + "loss/crossentropy": 2.4360326528549194, + "loss/hidden": 0.1845703125, + "loss/logits": 0.028121494688093662, + "loss/reg": 0.023897258564829826, + "step": 1293 + }, + { + "epoch": 0.647, + "grad_norm": 1.5751771926879883, + "grad_norm_var": 0.14394628232820703, + "learning_rate": 2e-05, + "loss": 0.4148, + "loss/crossentropy": 2.363794207572937, + "loss/hidden": 0.15087890625, + "loss/logits": 0.02497075777500868, + "loss/reg": 0.023894891142845154, + "step": 1294 + }, + { + "epoch": 0.6475, + "grad_norm": 1.5669056177139282, + "grad_norm_var": 0.14427878956963974, + "learning_rate": 2e-05, + "loss": 0.4636, + "loss/crossentropy": 2.2907302379608154, + "loss/hidden": 0.1884765625, + "loss/logits": 0.03620941936969757, + "loss/reg": 0.023892676457762718, + "step": 1295 + }, + { + "epoch": 0.648, + "grad_norm": 2.1094207763671875, + "grad_norm_var": 0.154368248754838, + "learning_rate": 2e-05, + "loss": 0.4793, + "loss/crossentropy": 2.2729530334472656, + "loss/hidden": 0.20654296875, + "loss/logits": 0.033849818632006645, + "loss/reg": 0.023890400305390358, + "step": 1296 + }, + { + "epoch": 0.6485, + "grad_norm": 1.4486722946166992, + "grad_norm_var": 0.15661132320615986, + "learning_rate": 2e-05, + "loss": 0.4313, + "loss/crossentropy": 2.376818895339966, + "loss/hidden": 0.16796875, + "loss/logits": 0.024429542012512684, + "loss/reg": 0.02388790063560009, + "step": 1297 + }, + { + "epoch": 0.649, + "grad_norm": 1.458891749382019, + "grad_norm_var": 0.13933691898384565, + "learning_rate": 2e-05, + "loss": 0.4831, + "loss/crossentropy": 2.1321340203285217, + "loss/hidden": 0.2080078125, + "loss/logits": 0.03623790666460991, + "loss/reg": 0.023885508999228477, + "step": 1298 + }, + { + "epoch": 0.6495, + "grad_norm": 1.397750735282898, + "grad_norm_var": 0.1421926325690795, + "learning_rate": 2e-05, + "loss": 0.4592, + "loss/crossentropy": 2.245489716529846, + "loss/hidden": 0.181640625, + "loss/logits": 0.03869971726089716, + "loss/reg": 0.02388302981853485, + "step": 1299 + }, + { + "epoch": 0.65, + "grad_norm": 2.3553082942962646, + "grad_norm_var": 0.1735859217612727, + "learning_rate": 2e-05, + "loss": 0.5102, + "loss/crossentropy": 2.422638416290283, + "loss/hidden": 0.22021484375, + "loss/logits": 0.051152704283595085, + "loss/reg": 0.02388053759932518, + "step": 1300 + }, + { + "epoch": 0.6505, + "grad_norm": 1.5121291875839233, + "grad_norm_var": 0.10604831093349745, + "learning_rate": 2e-05, + "loss": 0.476, + "loss/crossentropy": 2.5624797344207764, + "loss/hidden": 0.2021484375, + "loss/logits": 0.03510456532239914, + "loss/reg": 0.023878419771790504, + "step": 1301 + }, + { + "epoch": 0.651, + "grad_norm": 1.5881332159042358, + "grad_norm_var": 0.09506899424586326, + "learning_rate": 2e-05, + "loss": 0.4744, + "loss/crossentropy": 2.3333781957626343, + "loss/hidden": 0.1962890625, + "loss/logits": 0.03930900990962982, + "loss/reg": 0.023876061663031578, + "step": 1302 + }, + { + "epoch": 0.6515, + "grad_norm": 1.3511114120483398, + "grad_norm_var": 0.0970334796528732, + "learning_rate": 2e-05, + "loss": 0.4549, + "loss/crossentropy": 2.1691489219665527, + "loss/hidden": 0.185546875, + "loss/logits": 0.030610281974077225, + "loss/reg": 0.02387375757098198, + "step": 1303 + }, + { + "epoch": 0.652, + "grad_norm": 2.0355160236358643, + "grad_norm_var": 0.10992582401275346, + "learning_rate": 2e-05, + "loss": 0.5417, + "loss/crossentropy": 2.3282041549682617, + "loss/hidden": 0.27099609375, + "loss/logits": 0.03201697859913111, + "loss/reg": 0.02387123927474022, + "step": 1304 + }, + { + "epoch": 0.6525, + "grad_norm": 1.1647405624389648, + "grad_norm_var": 0.11366288198163511, + "learning_rate": 2e-05, + "loss": 0.3925, + "loss/crossentropy": 2.465924859046936, + "loss/hidden": 0.13232421875, + "loss/logits": 0.021514427848160267, + "loss/reg": 0.02386898547410965, + "step": 1305 + }, + { + "epoch": 0.653, + "grad_norm": 1.921985387802124, + "grad_norm_var": 0.10976873004807407, + "learning_rate": 2e-05, + "loss": 0.5147, + "loss/crossentropy": 2.435685157775879, + "loss/hidden": 0.23681640625, + "loss/logits": 0.03921514190733433, + "loss/reg": 0.02386675402522087, + "step": 1306 + }, + { + "epoch": 0.6535, + "grad_norm": 1.6400461196899414, + "grad_norm_var": 0.09966999048148933, + "learning_rate": 2e-05, + "loss": 0.4302, + "loss/crossentropy": 2.16571307182312, + "loss/hidden": 0.16748046875, + "loss/logits": 0.024049567990005016, + "loss/reg": 0.023864606395363808, + "step": 1307 + }, + { + "epoch": 0.654, + "grad_norm": 1.3085359334945679, + "grad_norm_var": 0.10601720206320617, + "learning_rate": 2e-05, + "loss": 0.4506, + "loss/crossentropy": 2.4350894689559937, + "loss/hidden": 0.1787109375, + "loss/logits": 0.03322593308985233, + "loss/reg": 0.023862628266215324, + "step": 1308 + }, + { + "epoch": 0.6545, + "grad_norm": 1.7091984748840332, + "grad_norm_var": 0.10320339085501071, + "learning_rate": 2e-05, + "loss": 0.4515, + "loss/crossentropy": 2.418308198451996, + "loss/hidden": 0.18359375, + "loss/logits": 0.02934916317462921, + "loss/reg": 0.023860609158873558, + "step": 1309 + }, + { + "epoch": 0.655, + "grad_norm": 1.2501323223114014, + "grad_norm_var": 0.11235482446353538, + "learning_rate": 2e-05, + "loss": 0.417, + "loss/crossentropy": 2.599029541015625, + "loss/hidden": 0.1494140625, + "loss/logits": 0.0289985379204154, + "loss/reg": 0.023858599364757538, + "step": 1310 + }, + { + "epoch": 0.6555, + "grad_norm": 1.5810778141021729, + "grad_norm_var": 0.11227903902704757, + "learning_rate": 2e-05, + "loss": 0.435, + "loss/crossentropy": 2.4957447052001953, + "loss/hidden": 0.16748046875, + "loss/logits": 0.028951111249625683, + "loss/reg": 0.02385612390935421, + "step": 1311 + }, + { + "epoch": 0.656, + "grad_norm": 1.6085758209228516, + "grad_norm_var": 0.09490913098407905, + "learning_rate": 2e-05, + "loss": 0.4276, + "loss/crossentropy": 2.525179862976074, + "loss/hidden": 0.1630859375, + "loss/logits": 0.02599877305328846, + "loss/reg": 0.023853624239563942, + "step": 1312 + }, + { + "epoch": 0.6565, + "grad_norm": 1.3025941848754883, + "grad_norm_var": 0.09886375082411777, + "learning_rate": 2e-05, + "loss": 0.4164, + "loss/crossentropy": 2.2159218788146973, + "loss/hidden": 0.15380859375, + "loss/logits": 0.02411420363932848, + "loss/reg": 0.023851484060287476, + "step": 1313 + }, + { + "epoch": 0.657, + "grad_norm": 1.7414854764938354, + "grad_norm_var": 0.09951370157159824, + "learning_rate": 2e-05, + "loss": 0.4552, + "loss/crossentropy": 2.6034278869628906, + "loss/hidden": 0.18359375, + "loss/logits": 0.033163596875965595, + "loss/reg": 0.023848969489336014, + "step": 1314 + }, + { + "epoch": 0.6575, + "grad_norm": 1.6796448230743408, + "grad_norm_var": 0.0971878321131148, + "learning_rate": 2e-05, + "loss": 0.5352, + "loss/crossentropy": 2.3006917238235474, + "loss/hidden": 0.2548828125, + "loss/logits": 0.04180280677974224, + "loss/reg": 0.02384648472070694, + "step": 1315 + }, + { + "epoch": 0.658, + "grad_norm": 1.5615240335464478, + "grad_norm_var": 0.057622080975028626, + "learning_rate": 2e-05, + "loss": 0.4302, + "loss/crossentropy": 2.188043475151062, + "loss/hidden": 0.16455078125, + "loss/logits": 0.02720883209258318, + "loss/reg": 0.023843981325626373, + "step": 1316 + }, + { + "epoch": 0.6585, + "grad_norm": 1.1154263019561768, + "grad_norm_var": 0.06997817065805308, + "learning_rate": 2e-05, + "loss": 0.4081, + "loss/crossentropy": 2.592913031578064, + "loss/hidden": 0.14453125, + "loss/logits": 0.025116360746324062, + "loss/reg": 0.023841451853513718, + "step": 1317 + }, + { + "epoch": 0.659, + "grad_norm": 1.5203436613082886, + "grad_norm_var": 0.06978498065926123, + "learning_rate": 2e-05, + "loss": 0.5075, + "loss/crossentropy": 2.2861050367355347, + "loss/hidden": 0.22412109375, + "loss/logits": 0.04499981366097927, + "loss/reg": 0.023838885128498077, + "step": 1318 + }, + { + "epoch": 0.6595, + "grad_norm": 1.2238833904266357, + "grad_norm_var": 0.07384394251173394, + "learning_rate": 2e-05, + "loss": 0.4196, + "loss/crossentropy": 2.538287878036499, + "loss/hidden": 0.15283203125, + "loss/logits": 0.02840554341673851, + "loss/reg": 0.023836364969611168, + "step": 1319 + }, + { + "epoch": 0.66, + "grad_norm": 1.191012978553772, + "grad_norm_var": 0.06068536610595358, + "learning_rate": 2e-05, + "loss": 0.4069, + "loss/crossentropy": 2.4569714069366455, + "loss/hidden": 0.1455078125, + "loss/logits": 0.02309222426265478, + "loss/reg": 0.023833919316530228, + "step": 1320 + }, + { + "epoch": 0.6605, + "grad_norm": 1.649925708770752, + "grad_norm_var": 0.0556496711602169, + "learning_rate": 2e-05, + "loss": 0.4694, + "loss/crossentropy": 2.3524088859558105, + "loss/hidden": 0.19189453125, + "loss/logits": 0.0391565915197134, + "loss/reg": 0.023831605911254883, + "step": 1321 + }, + { + "epoch": 0.661, + "grad_norm": 1.4207836389541626, + "grad_norm_var": 0.043172417948901656, + "learning_rate": 2e-05, + "loss": 0.4519, + "loss/crossentropy": 2.5140554904937744, + "loss/hidden": 0.1806640625, + "loss/logits": 0.03295655734837055, + "loss/reg": 0.023829326033592224, + "step": 1322 + }, + { + "epoch": 0.6615, + "grad_norm": 1.3647384643554688, + "grad_norm_var": 0.04163129199955975, + "learning_rate": 2e-05, + "loss": 0.4128, + "loss/crossentropy": 2.530665874481201, + "loss/hidden": 0.14990234375, + "loss/logits": 0.02467129472643137, + "loss/reg": 0.023827021941542625, + "step": 1323 + }, + { + "epoch": 0.662, + "grad_norm": 1.5273334980010986, + "grad_norm_var": 0.0404437201587351, + "learning_rate": 2e-05, + "loss": 0.4635, + "loss/crossentropy": 2.2921979427337646, + "loss/hidden": 0.189453125, + "loss/logits": 0.03580437693744898, + "loss/reg": 0.023824498057365417, + "step": 1324 + }, + { + "epoch": 0.6625, + "grad_norm": 3.4632437229156494, + "grad_norm_var": 0.28973497995353714, + "learning_rate": 2e-05, + "loss": 0.4845, + "loss/crossentropy": 2.3375691175460815, + "loss/hidden": 0.2119140625, + "loss/logits": 0.03432004339993, + "loss/reg": 0.02382197044789791, + "step": 1325 + }, + { + "epoch": 0.663, + "grad_norm": 1.2499359846115112, + "grad_norm_var": 0.28974348968956176, + "learning_rate": 2e-05, + "loss": 0.4141, + "loss/crossentropy": 2.437517523765564, + "loss/hidden": 0.15185546875, + "loss/logits": 0.024037906900048256, + "loss/reg": 0.023819534108042717, + "step": 1326 + }, + { + "epoch": 0.6635, + "grad_norm": 2.2203030586242676, + "grad_norm_var": 0.31579141158708024, + "learning_rate": 2e-05, + "loss": 0.4227, + "loss/crossentropy": 2.547585964202881, + "loss/hidden": 0.1591796875, + "loss/logits": 0.025345077738165855, + "loss/reg": 0.02381698414683342, + "step": 1327 + }, + { + "epoch": 0.664, + "grad_norm": 1.4417054653167725, + "grad_norm_var": 0.317675752358493, + "learning_rate": 2e-05, + "loss": 0.4383, + "loss/crossentropy": 2.5056179761886597, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03173685912042856, + "loss/reg": 0.023814348503947258, + "step": 1328 + }, + { + "epoch": 0.6645, + "grad_norm": 1.6603224277496338, + "grad_norm_var": 0.3112681967737764, + "learning_rate": 2e-05, + "loss": 0.4843, + "loss/crossentropy": 2.1647554636001587, + "loss/hidden": 0.2021484375, + "loss/logits": 0.043993281200528145, + "loss/reg": 0.023811759427189827, + "step": 1329 + }, + { + "epoch": 0.665, + "grad_norm": 1.6948206424713135, + "grad_norm_var": 0.31069182045736876, + "learning_rate": 2e-05, + "loss": 0.4089, + "loss/crossentropy": 2.37821888923645, + "loss/hidden": 0.150390625, + "loss/logits": 0.020369217731058598, + "loss/reg": 0.023809220641851425, + "step": 1330 + }, + { + "epoch": 0.6655, + "grad_norm": 1.418535828590393, + "grad_norm_var": 0.3130177534653304, + "learning_rate": 2e-05, + "loss": 0.4358, + "loss/crossentropy": 2.3562744855880737, + "loss/hidden": 0.16552734375, + "loss/logits": 0.03218572027981281, + "loss/reg": 0.023806730285286903, + "step": 1331 + }, + { + "epoch": 0.666, + "grad_norm": 2.405161142349243, + "grad_norm_var": 0.35230188449184957, + "learning_rate": 2e-05, + "loss": 0.4954, + "loss/crossentropy": 2.5449503660202026, + "loss/hidden": 0.212158203125, + "loss/logits": 0.04522665124386549, + "loss/reg": 0.02380412258207798, + "step": 1332 + }, + { + "epoch": 0.6665, + "grad_norm": 2.3597934246063232, + "grad_norm_var": 0.3586491765370226, + "learning_rate": 2e-05, + "loss": 0.4877, + "loss/crossentropy": 2.3041821718215942, + "loss/hidden": 0.208984375, + "loss/logits": 0.040694585070014, + "loss/reg": 0.02380160056054592, + "step": 1333 + }, + { + "epoch": 0.667, + "grad_norm": 1.4537216424942017, + "grad_norm_var": 0.36086214325715854, + "learning_rate": 2e-05, + "loss": 0.4747, + "loss/crossentropy": 2.504698157310486, + "loss/hidden": 0.19921875, + "loss/logits": 0.03751707915216684, + "loss/reg": 0.023799141868948936, + "step": 1334 + }, + { + "epoch": 0.6675, + "grad_norm": 1.2887414693832397, + "grad_norm_var": 0.3567130361876489, + "learning_rate": 2e-05, + "loss": 0.4786, + "loss/crossentropy": 2.0967178344726562, + "loss/hidden": 0.20263671875, + "loss/logits": 0.03796843905001879, + "loss/reg": 0.023796530440449715, + "step": 1335 + }, + { + "epoch": 0.668, + "grad_norm": 1.360039234161377, + "grad_norm_var": 0.3461683691160814, + "learning_rate": 2e-05, + "loss": 0.4422, + "loss/crossentropy": 2.2293606996536255, + "loss/hidden": 0.173828125, + "loss/logits": 0.030479850247502327, + "loss/reg": 0.02379394881427288, + "step": 1336 + }, + { + "epoch": 0.6685, + "grad_norm": 1.5283000469207764, + "grad_norm_var": 0.34869462176112187, + "learning_rate": 2e-05, + "loss": 0.4369, + "loss/crossentropy": 2.55380380153656, + "loss/hidden": 0.16650390625, + "loss/logits": 0.032473089173436165, + "loss/reg": 0.023791363462805748, + "step": 1337 + }, + { + "epoch": 0.669, + "grad_norm": 1.3858225345611572, + "grad_norm_var": 0.3502641276347217, + "learning_rate": 2e-05, + "loss": 0.4403, + "loss/crossentropy": 2.364560842514038, + "loss/hidden": 0.17041015625, + "loss/logits": 0.03198127821087837, + "loss/reg": 0.02378905564546585, + "step": 1338 + }, + { + "epoch": 0.6695, + "grad_norm": 1.4333000183105469, + "grad_norm_var": 0.3471374399560941, + "learning_rate": 2e-05, + "loss": 0.4355, + "loss/crossentropy": 2.514798641204834, + "loss/hidden": 0.1611328125, + "loss/logits": 0.03649984207004309, + "loss/reg": 0.023786714300513268, + "step": 1339 + }, + { + "epoch": 0.67, + "grad_norm": 1.49425208568573, + "grad_norm_var": 0.34815796148798006, + "learning_rate": 2e-05, + "loss": 0.4127, + "loss/crossentropy": 2.451537013053894, + "loss/hidden": 0.14892578125, + "loss/logits": 0.025915359146893024, + "loss/reg": 0.02378448285162449, + "step": 1340 + }, + { + "epoch": 0.6705, + "grad_norm": 1.364202618598938, + "grad_norm_var": 0.14155822181354907, + "learning_rate": 2e-05, + "loss": 0.4458, + "loss/crossentropy": 2.2742252349853516, + "loss/hidden": 0.17822265625, + "loss/logits": 0.02972761169075966, + "loss/reg": 0.023781999945640564, + "step": 1341 + }, + { + "epoch": 0.671, + "grad_norm": 1.3675141334533691, + "grad_norm_var": 0.1367785272504178, + "learning_rate": 2e-05, + "loss": 0.4329, + "loss/crossentropy": 2.4399064779281616, + "loss/hidden": 0.1650390625, + "loss/logits": 0.03008042648434639, + "loss/reg": 0.02377980761229992, + "step": 1342 + }, + { + "epoch": 0.6715, + "grad_norm": 1.739666223526001, + "grad_norm_var": 0.11257230684105075, + "learning_rate": 2e-05, + "loss": 0.4296, + "loss/crossentropy": 2.3073580265045166, + "loss/hidden": 0.1533203125, + "loss/logits": 0.0385186281055212, + "loss/reg": 0.02377736195921898, + "step": 1343 + }, + { + "epoch": 0.672, + "grad_norm": 1.364190936088562, + "grad_norm_var": 0.11445201509484164, + "learning_rate": 2e-05, + "loss": 0.4681, + "loss/crossentropy": 2.3126569986343384, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03849446773529053, + "loss/reg": 0.02377496473491192, + "step": 1344 + }, + { + "epoch": 0.6725, + "grad_norm": 1.7589856386184692, + "grad_norm_var": 0.11608550666011386, + "learning_rate": 2e-05, + "loss": 0.4438, + "loss/crossentropy": 2.2252254486083984, + "loss/hidden": 0.17578125, + "loss/logits": 0.030335014685988426, + "loss/reg": 0.02377244643867016, + "step": 1345 + }, + { + "epoch": 0.673, + "grad_norm": 1.5148929357528687, + "grad_norm_var": 0.1155597806029616, + "learning_rate": 2e-05, + "loss": 0.4564, + "loss/crossentropy": 2.173453211784363, + "loss/hidden": 0.18896484375, + "loss/logits": 0.02973311860114336, + "loss/reg": 0.023769889026880264, + "step": 1346 + }, + { + "epoch": 0.6735, + "grad_norm": 1.3687435388565063, + "grad_norm_var": 0.11676889873661077, + "learning_rate": 2e-05, + "loss": 0.4483, + "loss/crossentropy": 2.3290340900421143, + "loss/hidden": 0.171875, + "loss/logits": 0.03875895403325558, + "loss/reg": 0.023767419159412384, + "step": 1347 + }, + { + "epoch": 0.674, + "grad_norm": 3.605093479156494, + "grad_norm_var": 0.3397037594268179, + "learning_rate": 2e-05, + "loss": 0.4573, + "loss/crossentropy": 2.528464674949646, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03462876006960869, + "loss/reg": 0.02376495860517025, + "step": 1348 + }, + { + "epoch": 0.6745, + "grad_norm": 1.2763408422470093, + "grad_norm_var": 0.31041857364604836, + "learning_rate": 2e-05, + "loss": 0.4117, + "loss/crossentropy": 2.3279651403427124, + "loss/hidden": 0.14892578125, + "loss/logits": 0.025109270587563515, + "loss/reg": 0.023762483149766922, + "step": 1349 + }, + { + "epoch": 0.675, + "grad_norm": 1.2447208166122437, + "grad_norm_var": 0.3167090932037666, + "learning_rate": 2e-05, + "loss": 0.4339, + "loss/crossentropy": 2.317818284034729, + "loss/hidden": 0.16357421875, + "loss/logits": 0.03267843183130026, + "loss/reg": 0.023760035634040833, + "step": 1350 + }, + { + "epoch": 0.6755, + "grad_norm": 1.469759225845337, + "grad_norm_var": 0.3120066895490725, + "learning_rate": 2e-05, + "loss": 0.4657, + "loss/crossentropy": 2.6747782230377197, + "loss/hidden": 0.189453125, + "loss/logits": 0.03863661177456379, + "loss/reg": 0.023757578805088997, + "step": 1351 + }, + { + "epoch": 0.676, + "grad_norm": 1.188594937324524, + "grad_norm_var": 0.3188659312546353, + "learning_rate": 2e-05, + "loss": 0.4495, + "loss/crossentropy": 2.6325184106826782, + "loss/hidden": 0.17578125, + "loss/logits": 0.03611903823912144, + "loss/reg": 0.0237550251185894, + "step": 1352 + }, + { + "epoch": 0.6765, + "grad_norm": 1.4543743133544922, + "grad_norm_var": 0.3196088985917853, + "learning_rate": 2e-05, + "loss": 0.4425, + "loss/crossentropy": 2.461496353149414, + "loss/hidden": 0.17236328125, + "loss/logits": 0.032596323639154434, + "loss/reg": 0.02375258132815361, + "step": 1353 + }, + { + "epoch": 0.677, + "grad_norm": 1.183428406715393, + "grad_norm_var": 0.32698827229071603, + "learning_rate": 2e-05, + "loss": 0.4135, + "loss/crossentropy": 2.411842107772827, + "loss/hidden": 0.14892578125, + "loss/logits": 0.02709823753684759, + "loss/reg": 0.023749923333525658, + "step": 1354 + }, + { + "epoch": 0.6775, + "grad_norm": 1.2251843214035034, + "grad_norm_var": 0.3329822256302141, + "learning_rate": 2e-05, + "loss": 0.4623, + "loss/crossentropy": 2.385651111602783, + "loss/hidden": 0.18603515625, + "loss/logits": 0.03881765343248844, + "loss/reg": 0.023747442290186882, + "step": 1355 + }, + { + "epoch": 0.678, + "grad_norm": 1.8396114110946655, + "grad_norm_var": 0.3383879160154535, + "learning_rate": 2e-05, + "loss": 0.5556, + "loss/crossentropy": 2.159119963645935, + "loss/hidden": 0.27587890625, + "loss/logits": 0.042316026985645294, + "loss/reg": 0.023744840174913406, + "step": 1356 + }, + { + "epoch": 0.6785, + "grad_norm": 1.6769040822982788, + "grad_norm_var": 0.33632199932469997, + "learning_rate": 2e-05, + "loss": 0.4362, + "loss/crossentropy": 2.420010805130005, + "loss/hidden": 0.17041015625, + "loss/logits": 0.028409303165972233, + "loss/reg": 0.023742124438285828, + "step": 1357 + }, + { + "epoch": 0.679, + "grad_norm": 1.4979515075683594, + "grad_norm_var": 0.3336920570721417, + "learning_rate": 2e-05, + "loss": 0.4697, + "loss/crossentropy": 2.002126097679138, + "loss/hidden": 0.197265625, + "loss/logits": 0.03505042381584644, + "loss/reg": 0.02373962290585041, + "step": 1358 + }, + { + "epoch": 0.6795, + "grad_norm": 1.339608907699585, + "grad_norm_var": 0.3356063743636861, + "learning_rate": 2e-05, + "loss": 0.4591, + "loss/crossentropy": 2.4204870462417603, + "loss/hidden": 0.18408203125, + "loss/logits": 0.03763877786695957, + "loss/reg": 0.02373688668012619, + "step": 1359 + }, + { + "epoch": 0.68, + "grad_norm": 2.6153082847595215, + "grad_norm_var": 0.4002688084625047, + "learning_rate": 2e-05, + "loss": 0.5644, + "loss/crossentropy": 1.8696808218955994, + "loss/hidden": 0.27783203125, + "loss/logits": 0.049255505204200745, + "loss/reg": 0.023734180256724358, + "step": 1360 + }, + { + "epoch": 0.6805, + "grad_norm": 1.7975633144378662, + "grad_norm_var": 0.4009675788079647, + "learning_rate": 2e-05, + "loss": 0.4502, + "loss/crossentropy": 2.283734917640686, + "loss/hidden": 0.1806640625, + "loss/logits": 0.03226998262107372, + "loss/reg": 0.02373143844306469, + "step": 1361 + }, + { + "epoch": 0.681, + "grad_norm": 1.3946635723114014, + "grad_norm_var": 0.40393475291140984, + "learning_rate": 2e-05, + "loss": 0.4437, + "loss/crossentropy": 2.3782224655151367, + "loss/hidden": 0.1728515625, + "loss/logits": 0.03351980075240135, + "loss/reg": 0.02372862957417965, + "step": 1362 + }, + { + "epoch": 0.6815, + "grad_norm": 1.5255178213119507, + "grad_norm_var": 0.39988194537197613, + "learning_rate": 2e-05, + "loss": 0.4427, + "loss/crossentropy": 2.39896559715271, + "loss/hidden": 0.171875, + "loss/logits": 0.03352793958038092, + "loss/reg": 0.023726122453808784, + "step": 1363 + }, + { + "epoch": 0.682, + "grad_norm": 1.2733867168426514, + "grad_norm_var": 0.13058789078406388, + "learning_rate": 2e-05, + "loss": 0.4295, + "loss/crossentropy": 2.261076331138611, + "loss/hidden": 0.162109375, + "loss/logits": 0.03012457862496376, + "loss/reg": 0.02372356690466404, + "step": 1364 + }, + { + "epoch": 0.6825, + "grad_norm": 1.837705135345459, + "grad_norm_var": 0.1335292862046036, + "learning_rate": 2e-05, + "loss": 0.4557, + "loss/crossentropy": 2.247922897338867, + "loss/hidden": 0.1865234375, + "loss/logits": 0.03198765777051449, + "loss/reg": 0.02372095361351967, + "step": 1365 + }, + { + "epoch": 0.683, + "grad_norm": 1.5753334760665894, + "grad_norm_var": 0.1275530359959636, + "learning_rate": 2e-05, + "loss": 0.4808, + "loss/crossentropy": 2.2536725997924805, + "loss/hidden": 0.2060546875, + "loss/logits": 0.03755863197147846, + "loss/reg": 0.02371850237250328, + "step": 1366 + }, + { + "epoch": 0.6835, + "grad_norm": 1.447576642036438, + "grad_norm_var": 0.12783865842738631, + "learning_rate": 2e-05, + "loss": 0.4403, + "loss/crossentropy": 2.3656728267669678, + "loss/hidden": 0.16357421875, + "loss/logits": 0.039546214044094086, + "loss/reg": 0.023716144263744354, + "step": 1367 + }, + { + "epoch": 0.684, + "grad_norm": 1.3603750467300415, + "grad_norm_var": 0.12130121846223171, + "learning_rate": 2e-05, + "loss": 0.4643, + "loss/crossentropy": 2.415152430534363, + "loss/hidden": 0.1865234375, + "loss/logits": 0.04059493914246559, + "loss/reg": 0.02371359057724476, + "step": 1368 + }, + { + "epoch": 0.6845, + "grad_norm": 1.0393022298812866, + "grad_norm_var": 0.13820691270152227, + "learning_rate": 2e-05, + "loss": 0.3977, + "loss/crossentropy": 2.226056694984436, + "loss/hidden": 0.1357421875, + "loss/logits": 0.024835828691720963, + "loss/reg": 0.02371094562113285, + "step": 1369 + }, + { + "epoch": 0.685, + "grad_norm": 1.7829720973968506, + "grad_norm_var": 0.1322215247018124, + "learning_rate": 2e-05, + "loss": 0.5041, + "loss/crossentropy": 2.510174036026001, + "loss/hidden": 0.21533203125, + "loss/logits": 0.0516891460865736, + "loss/reg": 0.02370813861489296, + "step": 1370 + }, + { + "epoch": 0.6855, + "grad_norm": 1.4974333047866821, + "grad_norm_var": 0.12409001917904922, + "learning_rate": 2e-05, + "loss": 0.4361, + "loss/crossentropy": 2.45763623714447, + "loss/hidden": 0.16943359375, + "loss/logits": 0.029570632614195347, + "loss/reg": 0.02370576746761799, + "step": 1371 + }, + { + "epoch": 0.686, + "grad_norm": 2.463162660598755, + "grad_norm_var": 0.16882568198071363, + "learning_rate": 2e-05, + "loss": 0.5276, + "loss/crossentropy": 2.1458094120025635, + "loss/hidden": 0.2373046875, + "loss/logits": 0.05326741002500057, + "loss/reg": 0.02370315231382847, + "step": 1372 + }, + { + "epoch": 0.6865, + "grad_norm": 2.5386321544647217, + "grad_norm_var": 0.2203043192597228, + "learning_rate": 2e-05, + "loss": 0.5752, + "loss/crossentropy": 2.3038665056228638, + "loss/hidden": 0.2705078125, + "loss/logits": 0.06766052544116974, + "loss/reg": 0.023700760677456856, + "step": 1373 + }, + { + "epoch": 0.687, + "grad_norm": 1.279981255531311, + "grad_norm_var": 0.2287580151051623, + "learning_rate": 2e-05, + "loss": 0.4214, + "loss/crossentropy": 2.436690330505371, + "loss/hidden": 0.14892578125, + "loss/logits": 0.035530680790543556, + "loss/reg": 0.02369816228747368, + "step": 1374 + }, + { + "epoch": 0.6875, + "grad_norm": 1.230238676071167, + "grad_norm_var": 0.2343678483688691, + "learning_rate": 2e-05, + "loss": 0.4142, + "loss/crossentropy": 2.427309036254883, + "loss/hidden": 0.14892578125, + "loss/logits": 0.028325392864644527, + "loss/reg": 0.02369537763297558, + "step": 1375 + }, + { + "epoch": 0.688, + "grad_norm": 2.1449315547943115, + "grad_norm_var": 0.18867092664802806, + "learning_rate": 2e-05, + "loss": 0.4724, + "loss/crossentropy": 2.393447160720825, + "loss/hidden": 0.20361328125, + "loss/logits": 0.0318912947550416, + "loss/reg": 0.02369256503880024, + "step": 1376 + }, + { + "epoch": 0.6885, + "grad_norm": 1.614142894744873, + "grad_norm_var": 0.18684194347558922, + "learning_rate": 2e-05, + "loss": 0.5406, + "loss/crossentropy": 2.102261245250702, + "loss/hidden": 0.24853515625, + "loss/logits": 0.05518599599599838, + "loss/reg": 0.02369013801217079, + "step": 1377 + }, + { + "epoch": 0.689, + "grad_norm": 1.2378525733947754, + "grad_norm_var": 0.1932017017733111, + "learning_rate": 2e-05, + "loss": 0.4366, + "loss/crossentropy": 2.2186710834503174, + "loss/hidden": 0.16943359375, + "loss/logits": 0.0302474033087492, + "loss/reg": 0.02368772216141224, + "step": 1378 + }, + { + "epoch": 0.6895, + "grad_norm": 1.3566957712173462, + "grad_norm_var": 0.1970092361753761, + "learning_rate": 2e-05, + "loss": 0.3982, + "loss/crossentropy": 2.546470046043396, + "loss/hidden": 0.13525390625, + "loss/logits": 0.026141813024878502, + "loss/reg": 0.02368505485355854, + "step": 1379 + }, + { + "epoch": 0.69, + "grad_norm": 1.2005629539489746, + "grad_norm_var": 0.2005604341405349, + "learning_rate": 2e-05, + "loss": 0.4418, + "loss/crossentropy": 2.2552963495254517, + "loss/hidden": 0.17333984375, + "loss/logits": 0.03166076820343733, + "loss/reg": 0.023682620376348495, + "step": 1380 + }, + { + "epoch": 0.6905, + "grad_norm": 1.9562398195266724, + "grad_norm_var": 0.20518861482912196, + "learning_rate": 2e-05, + "loss": 0.5098, + "loss/crossentropy": 2.2495819330215454, + "loss/hidden": 0.23095703125, + "loss/logits": 0.0420466773211956, + "loss/reg": 0.023680146783590317, + "step": 1381 + }, + { + "epoch": 0.691, + "grad_norm": 1.4204621315002441, + "grad_norm_var": 0.20735892064978187, + "learning_rate": 2e-05, + "loss": 0.4234, + "loss/crossentropy": 2.263777256011963, + "loss/hidden": 0.15966796875, + "loss/logits": 0.026978014037013054, + "loss/reg": 0.02367776446044445, + "step": 1382 + }, + { + "epoch": 0.6915, + "grad_norm": 2.1433403491973877, + "grad_norm_var": 0.22364496503635584, + "learning_rate": 2e-05, + "loss": 0.443, + "loss/crossentropy": 2.3404159545898438, + "loss/hidden": 0.183349609375, + "loss/logits": 0.0229120384901762, + "loss/reg": 0.023675233125686646, + "step": 1383 + }, + { + "epoch": 0.692, + "grad_norm": 1.4612318277359009, + "grad_norm_var": 0.22049831846723483, + "learning_rate": 2e-05, + "loss": 0.435, + "loss/crossentropy": 2.3787938356399536, + "loss/hidden": 0.16943359375, + "loss/logits": 0.028827445581555367, + "loss/reg": 0.023672768846154213, + "step": 1384 + }, + { + "epoch": 0.6925, + "grad_norm": 1.356377124786377, + "grad_norm_var": 0.20105030555048078, + "learning_rate": 2e-05, + "loss": 0.4449, + "loss/crossentropy": 2.473434090614319, + "loss/hidden": 0.1767578125, + "loss/logits": 0.0314208772033453, + "loss/reg": 0.02367040514945984, + "step": 1385 + }, + { + "epoch": 0.693, + "grad_norm": 1.1685643196105957, + "grad_norm_var": 0.21520606580289575, + "learning_rate": 2e-05, + "loss": 0.4541, + "loss/crossentropy": 2.138159155845642, + "loss/hidden": 0.185546875, + "loss/logits": 0.03185593895614147, + "loss/reg": 0.023668091744184494, + "step": 1386 + }, + { + "epoch": 0.6935, + "grad_norm": 1.520918846130371, + "grad_norm_var": 0.21482740549679208, + "learning_rate": 2e-05, + "loss": 0.3933, + "loss/crossentropy": 2.549217104911804, + "loss/hidden": 0.13720703125, + "loss/logits": 0.0194573812186718, + "loss/reg": 0.02366561070084572, + "step": 1387 + }, + { + "epoch": 0.694, + "grad_norm": 1.3178868293762207, + "grad_norm_var": 0.16970641122309568, + "learning_rate": 2e-05, + "loss": 0.4825, + "loss/crossentropy": 2.2156635522842407, + "loss/hidden": 0.20849609375, + "loss/logits": 0.03734987787902355, + "loss/reg": 0.02366327866911888, + "step": 1388 + }, + { + "epoch": 0.6945, + "grad_norm": 1.2805134057998657, + "grad_norm_var": 0.10434541468175282, + "learning_rate": 2e-05, + "loss": 0.4346, + "loss/crossentropy": 2.1902356147766113, + "loss/hidden": 0.169921875, + "loss/logits": 0.028107551857829094, + "loss/reg": 0.023660695180296898, + "step": 1389 + }, + { + "epoch": 0.695, + "grad_norm": 1.4917412996292114, + "grad_norm_var": 0.1014830543172114, + "learning_rate": 2e-05, + "loss": 0.3961, + "loss/crossentropy": 2.46234929561615, + "loss/hidden": 0.1357421875, + "loss/logits": 0.023821561597287655, + "loss/reg": 0.023658404126763344, + "step": 1390 + }, + { + "epoch": 0.6955, + "grad_norm": 1.2825431823730469, + "grad_norm_var": 0.09981558763132382, + "learning_rate": 2e-05, + "loss": 0.4386, + "loss/crossentropy": 2.4950649738311768, + "loss/hidden": 0.16748046875, + "loss/logits": 0.034576233476400375, + "loss/reg": 0.02365582063794136, + "step": 1391 + }, + { + "epoch": 0.696, + "grad_norm": 1.0627645254135132, + "grad_norm_var": 0.07953715480548619, + "learning_rate": 2e-05, + "loss": 0.385, + "loss/crossentropy": 2.324121117591858, + "loss/hidden": 0.12939453125, + "loss/logits": 0.019065213855355978, + "loss/reg": 0.0236531812697649, + "step": 1392 + }, + { + "epoch": 0.6965, + "grad_norm": 1.2363553047180176, + "grad_norm_var": 0.079156088219622, + "learning_rate": 2e-05, + "loss": 0.4086, + "loss/crossentropy": 2.692628264427185, + "loss/hidden": 0.146240234375, + "loss/logits": 0.02587859146296978, + "loss/reg": 0.023650668561458588, + "step": 1393 + }, + { + "epoch": 0.697, + "grad_norm": 1.3195236921310425, + "grad_norm_var": 0.07774326246347835, + "learning_rate": 2e-05, + "loss": 0.4268, + "loss/crossentropy": 2.2705594301223755, + "loss/hidden": 0.158203125, + "loss/logits": 0.0320826917886734, + "loss/reg": 0.02364785596728325, + "step": 1394 + }, + { + "epoch": 0.6975, + "grad_norm": 1.3812922239303589, + "grad_norm_var": 0.07760303897853754, + "learning_rate": 2e-05, + "loss": 0.4493, + "loss/crossentropy": 2.4334908723831177, + "loss/hidden": 0.17138671875, + "loss/logits": 0.04147607646882534, + "loss/reg": 0.023645086213946342, + "step": 1395 + }, + { + "epoch": 0.698, + "grad_norm": 1.3648511171340942, + "grad_norm_var": 0.07464701664065293, + "learning_rate": 2e-05, + "loss": 0.4789, + "loss/crossentropy": 2.5334564447402954, + "loss/hidden": 0.201171875, + "loss/logits": 0.04130409471690655, + "loss/reg": 0.02364257536828518, + "step": 1396 + }, + { + "epoch": 0.6985, + "grad_norm": 1.8778526782989502, + "grad_norm_var": 0.0694556142458523, + "learning_rate": 2e-05, + "loss": 0.4686, + "loss/crossentropy": 2.253718376159668, + "loss/hidden": 0.19873046875, + "loss/logits": 0.033460862934589386, + "loss/reg": 0.023639997467398643, + "step": 1397 + }, + { + "epoch": 0.699, + "grad_norm": 1.0649257898330688, + "grad_norm_var": 0.07723400074944091, + "learning_rate": 2e-05, + "loss": 0.3847, + "loss/crossentropy": 2.403126835823059, + "loss/hidden": 0.128662109375, + "loss/logits": 0.019640752114355564, + "loss/reg": 0.0236373171210289, + "step": 1398 + }, + { + "epoch": 0.6995, + "grad_norm": 0.9858599901199341, + "grad_norm_var": 0.04558018881049334, + "learning_rate": 2e-05, + "loss": 0.3982, + "loss/crossentropy": 2.2231950759887695, + "loss/hidden": 0.138671875, + "loss/logits": 0.02315397746860981, + "loss/reg": 0.02363484352827072, + "step": 1399 + }, + { + "epoch": 0.7, + "grad_norm": 1.3760892152786255, + "grad_norm_var": 0.04446770302423217, + "learning_rate": 2e-05, + "loss": 0.4886, + "loss/crossentropy": 2.3370308876037598, + "loss/hidden": 0.20654296875, + "loss/logits": 0.045770518481731415, + "loss/reg": 0.023632274940609932, + "step": 1400 + }, + { + "epoch": 0.7005, + "grad_norm": 1.633719563484192, + "grad_norm_var": 0.050694139558335634, + "learning_rate": 2e-05, + "loss": 0.4001, + "loss/crossentropy": 2.4175291061401367, + "loss/hidden": 0.14208984375, + "loss/logits": 0.021711762994527817, + "loss/reg": 0.023629970848560333, + "step": 1401 + }, + { + "epoch": 0.701, + "grad_norm": 1.5971498489379883, + "grad_norm_var": 0.052644270149189036, + "learning_rate": 2e-05, + "loss": 0.4603, + "loss/crossentropy": 2.2261587381362915, + "loss/hidden": 0.19140625, + "loss/logits": 0.032638235948979855, + "loss/reg": 0.023627305403351784, + "step": 1402 + }, + { + "epoch": 0.7015, + "grad_norm": 1.2570019960403442, + "grad_norm_var": 0.05140971627937218, + "learning_rate": 2e-05, + "loss": 0.3887, + "loss/crossentropy": 2.4443479776382446, + "loss/hidden": 0.12939453125, + "loss/logits": 0.0230065593495965, + "loss/reg": 0.023624898865818977, + "step": 1403 + }, + { + "epoch": 0.702, + "grad_norm": 1.5167655944824219, + "grad_norm_var": 0.05314610912009237, + "learning_rate": 2e-05, + "loss": 0.4748, + "loss/crossentropy": 2.462609887123108, + "loss/hidden": 0.20068359375, + "loss/logits": 0.037887776270508766, + "loss/reg": 0.02362249046564102, + "step": 1404 + }, + { + "epoch": 0.7025, + "grad_norm": 1.3424351215362549, + "grad_norm_var": 0.052745515833931715, + "learning_rate": 2e-05, + "loss": 0.4595, + "loss/crossentropy": 2.2617905139923096, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03824649378657341, + "loss/reg": 0.02361990138888359, + "step": 1405 + }, + { + "epoch": 0.703, + "grad_norm": 1.2809338569641113, + "grad_norm_var": 0.05187429464569006, + "learning_rate": 2e-05, + "loss": 0.4088, + "loss/crossentropy": 2.3717641830444336, + "loss/hidden": 0.14306640625, + "loss/logits": 0.02956732176244259, + "loss/reg": 0.02361760474741459, + "step": 1406 + }, + { + "epoch": 0.7035, + "grad_norm": 1.7771258354187012, + "grad_norm_var": 0.06279631634374751, + "learning_rate": 2e-05, + "loss": 0.4556, + "loss/crossentropy": 2.469625473022461, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03924528695642948, + "loss/reg": 0.02361505851149559, + "step": 1407 + }, + { + "epoch": 0.704, + "grad_norm": 1.414624571800232, + "grad_norm_var": 0.05566685888704838, + "learning_rate": 2e-05, + "loss": 0.4175, + "loss/crossentropy": 2.7455949783325195, + "loss/hidden": 0.1552734375, + "loss/logits": 0.026056132279336452, + "loss/reg": 0.023612603545188904, + "step": 1408 + }, + { + "epoch": 0.7045, + "grad_norm": 1.3036192655563354, + "grad_norm_var": 0.054467126651640524, + "learning_rate": 2e-05, + "loss": 0.4093, + "loss/crossentropy": 2.5232421159744263, + "loss/hidden": 0.14501953125, + "loss/logits": 0.028133532963693142, + "loss/reg": 0.023610040545463562, + "step": 1409 + }, + { + "epoch": 0.705, + "grad_norm": 1.2797057628631592, + "grad_norm_var": 0.05502458620776493, + "learning_rate": 2e-05, + "loss": 0.4161, + "loss/crossentropy": 2.2470325231552124, + "loss/hidden": 0.15283203125, + "loss/logits": 0.027143074199557304, + "loss/reg": 0.023607581853866577, + "step": 1410 + }, + { + "epoch": 0.7055, + "grad_norm": 1.2794984579086304, + "grad_norm_var": 0.05597188755687809, + "learning_rate": 2e-05, + "loss": 0.4053, + "loss/crossentropy": 2.288419008255005, + "loss/hidden": 0.14794921875, + "loss/logits": 0.021314891055226326, + "loss/reg": 0.0236049797385931, + "step": 1411 + }, + { + "epoch": 0.706, + "grad_norm": 2.200571060180664, + "grad_norm_var": 0.0960401931657619, + "learning_rate": 2e-05, + "loss": 0.6026, + "loss/crossentropy": 2.12148916721344, + "loss/hidden": 0.30029296875, + "loss/logits": 0.06627136748284101, + "loss/reg": 0.02360256016254425, + "step": 1412 + }, + { + "epoch": 0.7065, + "grad_norm": 2.5475215911865234, + "grad_norm_var": 0.16233898418936382, + "learning_rate": 2e-05, + "loss": 0.4245, + "loss/crossentropy": 2.7688039541244507, + "loss/hidden": 0.15966796875, + "loss/logits": 0.02882098313421011, + "loss/reg": 0.02360014244914055, + "step": 1413 + }, + { + "epoch": 0.707, + "grad_norm": 1.3649111986160278, + "grad_norm_var": 0.15091742893504806, + "learning_rate": 2e-05, + "loss": 0.3992, + "loss/crossentropy": 2.421576499938965, + "loss/hidden": 0.13916015625, + "loss/logits": 0.024032247252762318, + "loss/reg": 0.023597724735736847, + "step": 1414 + }, + { + "epoch": 0.7075, + "grad_norm": 1.353563904762268, + "grad_norm_var": 0.13367826295360388, + "learning_rate": 2e-05, + "loss": 0.4107, + "loss/crossentropy": 2.5319186449050903, + "loss/hidden": 0.14892578125, + "loss/logits": 0.02581237070262432, + "loss/reg": 0.023595217615365982, + "step": 1415 + }, + { + "epoch": 0.708, + "grad_norm": 1.1511586904525757, + "grad_norm_var": 0.1415410624712725, + "learning_rate": 2e-05, + "loss": 0.3948, + "loss/crossentropy": 2.396964430809021, + "loss/hidden": 0.13525390625, + "loss/logits": 0.02357430011034012, + "loss/reg": 0.023592684417963028, + "step": 1416 + }, + { + "epoch": 0.7085, + "grad_norm": 1.4777796268463135, + "grad_norm_var": 0.1406708433314444, + "learning_rate": 2e-05, + "loss": 0.4242, + "loss/crossentropy": 2.25082266330719, + "loss/hidden": 0.15185546875, + "loss/logits": 0.03641578182578087, + "loss/reg": 0.02359013259410858, + "step": 1417 + }, + { + "epoch": 0.709, + "grad_norm": 1.4813765287399292, + "grad_norm_var": 0.14014819307293463, + "learning_rate": 2e-05, + "loss": 0.4335, + "loss/crossentropy": 2.433822274208069, + "loss/hidden": 0.169921875, + "loss/logits": 0.027691357769072056, + "loss/reg": 0.02358764037489891, + "step": 1418 + }, + { + "epoch": 0.7095, + "grad_norm": 3.4135758876800537, + "grad_norm_var": 0.3604375985303822, + "learning_rate": 2e-05, + "loss": 0.5497, + "loss/crossentropy": 2.3843711614608765, + "loss/hidden": 0.2734375, + "loss/logits": 0.0403892807662487, + "loss/reg": 0.02358505129814148, + "step": 1419 + }, + { + "epoch": 0.71, + "grad_norm": 1.213165521621704, + "grad_norm_var": 0.37104821359083356, + "learning_rate": 2e-05, + "loss": 0.3982, + "loss/crossentropy": 2.5343793630599976, + "loss/hidden": 0.13623046875, + "loss/logits": 0.02614509966224432, + "loss/reg": 0.023582441732287407, + "step": 1420 + }, + { + "epoch": 0.7105, + "grad_norm": 1.5525851249694824, + "grad_norm_var": 0.3660983405644202, + "learning_rate": 2e-05, + "loss": 0.4927, + "loss/crossentropy": 2.1852606534957886, + "loss/hidden": 0.21630859375, + "loss/logits": 0.04058670625090599, + "loss/reg": 0.023579921573400497, + "step": 1421 + }, + { + "epoch": 0.711, + "grad_norm": 1.3050885200500488, + "grad_norm_var": 0.36500823755956063, + "learning_rate": 2e-05, + "loss": 0.4211, + "loss/crossentropy": 2.417192816734314, + "loss/hidden": 0.162109375, + "loss/logits": 0.02324726153165102, + "loss/reg": 0.023577282205224037, + "step": 1422 + }, + { + "epoch": 0.7115, + "grad_norm": 1.6903064250946045, + "grad_norm_var": 0.36380217397103765, + "learning_rate": 2e-05, + "loss": 0.4785, + "loss/crossentropy": 2.4316320419311523, + "loss/hidden": 0.208984375, + "loss/logits": 0.0337921567261219, + "loss/reg": 0.023574667051434517, + "step": 1423 + }, + { + "epoch": 0.712, + "grad_norm": 1.1675231456756592, + "grad_norm_var": 0.3746094012963262, + "learning_rate": 2e-05, + "loss": 0.4142, + "loss/crossentropy": 2.2177504301071167, + "loss/hidden": 0.1494140625, + "loss/logits": 0.02909334283322096, + "loss/reg": 0.023572128266096115, + "step": 1424 + }, + { + "epoch": 0.7125, + "grad_norm": 1.718462586402893, + "grad_norm_var": 0.3683427865372977, + "learning_rate": 2e-05, + "loss": 0.5107, + "loss/crossentropy": 2.356824278831482, + "loss/hidden": 0.2265625, + "loss/logits": 0.048446234315633774, + "loss/reg": 0.023569492623209953, + "step": 1425 + }, + { + "epoch": 0.713, + "grad_norm": 2.538555145263672, + "grad_norm_var": 0.4073657383302283, + "learning_rate": 2e-05, + "loss": 0.4659, + "loss/crossentropy": 2.4271206855773926, + "loss/hidden": 0.197265625, + "loss/logits": 0.03294919244945049, + "loss/reg": 0.023566963151097298, + "step": 1426 + }, + { + "epoch": 0.7135, + "grad_norm": 1.6605249643325806, + "grad_norm_var": 0.3942648744597231, + "learning_rate": 2e-05, + "loss": 0.4182, + "loss/crossentropy": 2.5462480783462524, + "loss/hidden": 0.15283203125, + "loss/logits": 0.029766596853733063, + "loss/reg": 0.023564644157886505, + "step": 1427 + }, + { + "epoch": 0.714, + "grad_norm": 1.6154025793075562, + "grad_norm_var": 0.3797151310753638, + "learning_rate": 2e-05, + "loss": 0.474, + "loss/crossentropy": 2.253910183906555, + "loss/hidden": 0.2021484375, + "loss/logits": 0.03623027540743351, + "loss/reg": 0.023562012240290642, + "step": 1428 + }, + { + "epoch": 0.7145, + "grad_norm": 2.050323963165283, + "grad_norm_var": 0.3391940969651538, + "learning_rate": 2e-05, + "loss": 0.4883, + "loss/crossentropy": 2.319291830062866, + "loss/hidden": 0.203125, + "loss/logits": 0.049589984118938446, + "loss/reg": 0.023559633642435074, + "step": 1429 + }, + { + "epoch": 0.715, + "grad_norm": 1.2968723773956299, + "grad_norm_var": 0.3422705946198695, + "learning_rate": 2e-05, + "loss": 0.4308, + "loss/crossentropy": 2.3565025329589844, + "loss/hidden": 0.166015625, + "loss/logits": 0.029243918135762215, + "loss/reg": 0.023557225242257118, + "step": 1430 + }, + { + "epoch": 0.7155, + "grad_norm": 1.465996265411377, + "grad_norm_var": 0.3383485792832592, + "learning_rate": 2e-05, + "loss": 0.4574, + "loss/crossentropy": 2.4091076850891113, + "loss/hidden": 0.18603515625, + "loss/logits": 0.03585449419915676, + "loss/reg": 0.023554889485239983, + "step": 1431 + }, + { + "epoch": 0.716, + "grad_norm": 1.6185139417648315, + "grad_norm_var": 0.31936229587676096, + "learning_rate": 2e-05, + "loss": 0.4477, + "loss/crossentropy": 1.988387107849121, + "loss/hidden": 0.1845703125, + "loss/logits": 0.027648674324154854, + "loss/reg": 0.02355222962796688, + "step": 1432 + }, + { + "epoch": 0.7165, + "grad_norm": 1.5127618312835693, + "grad_norm_var": 0.3183830238570701, + "learning_rate": 2e-05, + "loss": 0.4341, + "loss/crossentropy": 2.4324188232421875, + "loss/hidden": 0.16650390625, + "loss/logits": 0.03210682421922684, + "loss/reg": 0.02354956604540348, + "step": 1433 + }, + { + "epoch": 0.717, + "grad_norm": 1.5678179264068604, + "grad_norm_var": 0.3162575020195957, + "learning_rate": 2e-05, + "loss": 0.4676, + "loss/crossentropy": 2.4026317596435547, + "loss/hidden": 0.19921875, + "loss/logits": 0.03290037252008915, + "loss/reg": 0.023546863347291946, + "step": 1434 + }, + { + "epoch": 0.7175, + "grad_norm": 1.6551094055175781, + "grad_norm_var": 0.11049876185942271, + "learning_rate": 2e-05, + "loss": 0.4312, + "loss/crossentropy": 2.351606845855713, + "loss/hidden": 0.1650390625, + "loss/logits": 0.030725182965397835, + "loss/reg": 0.02354429103434086, + "step": 1435 + }, + { + "epoch": 0.718, + "grad_norm": 1.3460294008255005, + "grad_norm_var": 0.10471709905145345, + "learning_rate": 2e-05, + "loss": 0.4204, + "loss/crossentropy": 2.403334140777588, + "loss/hidden": 0.15771484375, + "loss/logits": 0.02723412588238716, + "loss/reg": 0.023541752249002457, + "step": 1436 + }, + { + "epoch": 0.7185, + "grad_norm": 1.1729974746704102, + "grad_norm_var": 0.1166343133725992, + "learning_rate": 2e-05, + "loss": 0.4015, + "loss/crossentropy": 2.3932619094848633, + "loss/hidden": 0.14111328125, + "loss/logits": 0.02496551349759102, + "loss/reg": 0.02353922463953495, + "step": 1437 + }, + { + "epoch": 0.719, + "grad_norm": 1.7645087242126465, + "grad_norm_var": 0.11259440907935142, + "learning_rate": 2e-05, + "loss": 0.5111, + "loss/crossentropy": 2.400877833366394, + "loss/hidden": 0.23486328125, + "loss/logits": 0.0408332534134388, + "loss/reg": 0.02353672869503498, + "step": 1438 + }, + { + "epoch": 0.7195, + "grad_norm": 1.3634532690048218, + "grad_norm_var": 0.11599423217601744, + "learning_rate": 2e-05, + "loss": 0.4493, + "loss/crossentropy": 2.424346089363098, + "loss/hidden": 0.1796875, + "loss/logits": 0.03430754691362381, + "loss/reg": 0.023534253239631653, + "step": 1439 + }, + { + "epoch": 0.72, + "grad_norm": 1.3123286962509155, + "grad_norm_var": 0.10905751409417323, + "learning_rate": 2e-05, + "loss": 0.4399, + "loss/crossentropy": 2.2828234434127808, + "loss/hidden": 0.169921875, + "loss/logits": 0.03468863479793072, + "loss/reg": 0.023531882092356682, + "step": 1440 + }, + { + "epoch": 0.7205, + "grad_norm": 3.1446728706359863, + "grad_norm_var": 0.2580052108983352, + "learning_rate": 2e-05, + "loss": 0.6787, + "loss/crossentropy": 1.8992632031440735, + "loss/hidden": 0.33447265625, + "loss/logits": 0.10889805294573307, + "loss/reg": 0.023529645055532455, + "step": 1441 + }, + { + "epoch": 0.721, + "grad_norm": 1.5229130983352661, + "grad_norm_var": 0.20795354022681156, + "learning_rate": 2e-05, + "loss": 0.4591, + "loss/crossentropy": 2.284720540046692, + "loss/hidden": 0.1875, + "loss/logits": 0.036282142624258995, + "loss/reg": 0.02352738194167614, + "step": 1442 + }, + { + "epoch": 0.7215, + "grad_norm": 1.6657564640045166, + "grad_norm_var": 0.20797696901367027, + "learning_rate": 2e-05, + "loss": 0.4496, + "loss/crossentropy": 2.7669016122817993, + "loss/hidden": 0.18310546875, + "loss/logits": 0.031232742592692375, + "loss/reg": 0.023524843156337738, + "step": 1443 + }, + { + "epoch": 0.722, + "grad_norm": 2.9521846771240234, + "grad_norm_var": 0.3171124021500166, + "learning_rate": 2e-05, + "loss": 0.5604, + "loss/crossentropy": 2.3520604372024536, + "loss/hidden": 0.2763671875, + "loss/logits": 0.04880333133041859, + "loss/reg": 0.023522403091192245, + "step": 1444 + }, + { + "epoch": 0.7225, + "grad_norm": 1.5790318250656128, + "grad_norm_var": 0.3098142392085926, + "learning_rate": 2e-05, + "loss": 0.4899, + "loss/crossentropy": 2.0901917219161987, + "loss/hidden": 0.21533203125, + "loss/logits": 0.039392558857798576, + "loss/reg": 0.023519445210695267, + "step": 1445 + }, + { + "epoch": 0.723, + "grad_norm": 1.3354227542877197, + "grad_norm_var": 0.3079182473817125, + "learning_rate": 2e-05, + "loss": 0.4233, + "loss/crossentropy": 2.3203498125076294, + "loss/hidden": 0.16064453125, + "loss/logits": 0.02744780946522951, + "loss/reg": 0.02351679466664791, + "step": 1446 + }, + { + "epoch": 0.7235, + "grad_norm": 1.3747113943099976, + "grad_norm_var": 0.3111194517989119, + "learning_rate": 2e-05, + "loss": 0.4114, + "loss/crossentropy": 2.5399714708328247, + "loss/hidden": 0.14990234375, + "loss/logits": 0.026368978433310986, + "loss/reg": 0.02351376973092556, + "step": 1447 + }, + { + "epoch": 0.724, + "grad_norm": 1.1484705209732056, + "grad_norm_var": 0.3288139086814922, + "learning_rate": 2e-05, + "loss": 0.4122, + "loss/crossentropy": 2.4500149488449097, + "loss/hidden": 0.15087890625, + "loss/logits": 0.026164425536990166, + "loss/reg": 0.023511258885264397, + "step": 1448 + }, + { + "epoch": 0.7245, + "grad_norm": 1.3708717823028564, + "grad_norm_var": 0.3326900567825229, + "learning_rate": 2e-05, + "loss": 0.3965, + "loss/crossentropy": 2.305969476699829, + "loss/hidden": 0.138671875, + "loss/logits": 0.022724819369614124, + "loss/reg": 0.02350870706140995, + "step": 1449 + }, + { + "epoch": 0.725, + "grad_norm": 2.349400520324707, + "grad_norm_var": 0.3631110489319557, + "learning_rate": 2e-05, + "loss": 0.443, + "loss/crossentropy": 2.395747423171997, + "loss/hidden": 0.1796875, + "loss/logits": 0.028218965977430344, + "loss/reg": 0.023506123572587967, + "step": 1450 + }, + { + "epoch": 0.7255, + "grad_norm": 1.7106391191482544, + "grad_norm_var": 0.3630371761170198, + "learning_rate": 2e-05, + "loss": 0.4614, + "loss/crossentropy": 2.6804983615875244, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03596752695739269, + "loss/reg": 0.02350357361137867, + "step": 1451 + }, + { + "epoch": 0.726, + "grad_norm": 2.972860813140869, + "grad_norm_var": 0.4528425190093097, + "learning_rate": 2e-05, + "loss": 0.4555, + "loss/crossentropy": 2.1960572004318237, + "loss/hidden": 0.181396484375, + "loss/logits": 0.0390651635825634, + "loss/reg": 0.023501023650169373, + "step": 1452 + }, + { + "epoch": 0.7265, + "grad_norm": 1.1931060552597046, + "grad_norm_var": 0.45119672384324666, + "learning_rate": 2e-05, + "loss": 0.409, + "loss/crossentropy": 2.4346343278884888, + "loss/hidden": 0.14501953125, + "loss/logits": 0.02896373998373747, + "loss/reg": 0.023498453199863434, + "step": 1453 + }, + { + "epoch": 0.727, + "grad_norm": 1.793229579925537, + "grad_norm_var": 0.45112186135817844, + "learning_rate": 2e-05, + "loss": 0.4728, + "loss/crossentropy": 2.494977831840515, + "loss/hidden": 0.20068359375, + "loss/logits": 0.037164075300097466, + "loss/reg": 0.023496052250266075, + "step": 1454 + }, + { + "epoch": 0.7275, + "grad_norm": 1.9371393918991089, + "grad_norm_var": 0.4383518223702936, + "learning_rate": 2e-05, + "loss": 0.5224, + "loss/crossentropy": 2.0521376729011536, + "loss/hidden": 0.2529296875, + "loss/logits": 0.034543922170996666, + "loss/reg": 0.02349347248673439, + "step": 1455 + }, + { + "epoch": 0.728, + "grad_norm": 1.477908968925476, + "grad_norm_var": 0.4285223862932327, + "learning_rate": 2e-05, + "loss": 0.4217, + "loss/crossentropy": 2.2566416263580322, + "loss/hidden": 0.1572265625, + "loss/logits": 0.029540160670876503, + "loss/reg": 0.02349095791578293, + "step": 1456 + }, + { + "epoch": 0.7285, + "grad_norm": 1.43665611743927, + "grad_norm_var": 0.3149916450445355, + "learning_rate": 2e-05, + "loss": 0.435, + "loss/crossentropy": 2.3300145864486694, + "loss/hidden": 0.16650390625, + "loss/logits": 0.03365709260106087, + "loss/reg": 0.023488519713282585, + "step": 1457 + }, + { + "epoch": 0.729, + "grad_norm": 2.223034381866455, + "grad_norm_var": 0.32547722216857267, + "learning_rate": 2e-05, + "loss": 0.5049, + "loss/crossentropy": 2.456981062889099, + "loss/hidden": 0.22509765625, + "loss/logits": 0.04493347555398941, + "loss/reg": 0.023486167192459106, + "step": 1458 + }, + { + "epoch": 0.7295, + "grad_norm": 1.679583191871643, + "grad_norm_var": 0.3252738977751884, + "learning_rate": 2e-05, + "loss": 0.4379, + "loss/crossentropy": 2.432957887649536, + "loss/hidden": 0.16943359375, + "loss/logits": 0.03366055339574814, + "loss/reg": 0.023483600467443466, + "step": 1459 + }, + { + "epoch": 0.73, + "grad_norm": 1.673349380493164, + "grad_norm_var": 0.22819496323031288, + "learning_rate": 2e-05, + "loss": 0.492, + "loss/crossentropy": 2.410443902015686, + "loss/hidden": 0.2109375, + "loss/logits": 0.046233994886279106, + "loss/reg": 0.023481376469135284, + "step": 1460 + }, + { + "epoch": 0.7305, + "grad_norm": 1.5115046501159668, + "grad_norm_var": 0.22960029400701293, + "learning_rate": 2e-05, + "loss": 0.4361, + "loss/crossentropy": 2.6036850214004517, + "loss/hidden": 0.17041015625, + "loss/logits": 0.030860383063554764, + "loss/reg": 0.023478906601667404, + "step": 1461 + }, + { + "epoch": 0.731, + "grad_norm": 1.3442504405975342, + "grad_norm_var": 0.22917693899710986, + "learning_rate": 2e-05, + "loss": 0.4744, + "loss/crossentropy": 2.410821318626404, + "loss/hidden": 0.19384765625, + "loss/logits": 0.04574625752866268, + "loss/reg": 0.02347634732723236, + "step": 1462 + }, + { + "epoch": 0.7315, + "grad_norm": 1.2325595617294312, + "grad_norm_var": 0.23660137846549864, + "learning_rate": 2e-05, + "loss": 0.4353, + "loss/crossentropy": 2.428195834159851, + "loss/hidden": 0.1669921875, + "loss/logits": 0.03353757597506046, + "loss/reg": 0.023473726585507393, + "step": 1463 + }, + { + "epoch": 0.732, + "grad_norm": 1.4386786222457886, + "grad_norm_var": 0.22087578651664874, + "learning_rate": 2e-05, + "loss": 0.4601, + "loss/crossentropy": 2.128316283226013, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03346416354179382, + "loss/reg": 0.023471109569072723, + "step": 1464 + }, + { + "epoch": 0.7325, + "grad_norm": 1.5255026817321777, + "grad_norm_var": 0.2153978679484633, + "learning_rate": 2e-05, + "loss": 0.426, + "loss/crossentropy": 2.4801331758499146, + "loss/hidden": 0.162109375, + "loss/logits": 0.029225386679172516, + "loss/reg": 0.023468641564249992, + "step": 1465 + }, + { + "epoch": 0.733, + "grad_norm": 1.558826208114624, + "grad_norm_var": 0.18798010841390062, + "learning_rate": 2e-05, + "loss": 0.4271, + "loss/crossentropy": 2.345631241798401, + "loss/hidden": 0.1591796875, + "loss/logits": 0.03320986311882734, + "loss/reg": 0.02346622571349144, + "step": 1466 + }, + { + "epoch": 0.7335, + "grad_norm": 1.4616813659667969, + "grad_norm_var": 0.1904816907030834, + "learning_rate": 2e-05, + "loss": 0.4534, + "loss/crossentropy": 2.260239005088806, + "loss/hidden": 0.1826171875, + "loss/logits": 0.03611057437956333, + "loss/reg": 0.023463619872927666, + "step": 1467 + }, + { + "epoch": 0.734, + "grad_norm": 1.2021178007125854, + "grad_norm_var": 0.07500963522951735, + "learning_rate": 2e-05, + "loss": 0.441, + "loss/crossentropy": 2.4786767959594727, + "loss/hidden": 0.17041015625, + "loss/logits": 0.03600460663437843, + "loss/reg": 0.023461153730750084, + "step": 1468 + }, + { + "epoch": 0.7345, + "grad_norm": 1.321462869644165, + "grad_norm_var": 0.07004997562031713, + "learning_rate": 2e-05, + "loss": 0.4248, + "loss/crossentropy": 2.4473639726638794, + "loss/hidden": 0.16064453125, + "loss/logits": 0.029547326266765594, + "loss/reg": 0.023458639159798622, + "step": 1469 + }, + { + "epoch": 0.735, + "grad_norm": 1.302802324295044, + "grad_norm_var": 0.06924901126378126, + "learning_rate": 2e-05, + "loss": 0.4392, + "loss/crossentropy": 2.320843458175659, + "loss/hidden": 0.1708984375, + "loss/logits": 0.03376789018511772, + "loss/reg": 0.02345600537955761, + "step": 1470 + }, + { + "epoch": 0.7355, + "grad_norm": 1.744510293006897, + "grad_norm_var": 0.06086570608285336, + "learning_rate": 2e-05, + "loss": 0.4287, + "loss/crossentropy": 2.4008067846298218, + "loss/hidden": 0.16259765625, + "loss/logits": 0.03158361464738846, + "loss/reg": 0.02345338650047779, + "step": 1471 + }, + { + "epoch": 0.736, + "grad_norm": 1.1985650062561035, + "grad_norm_var": 0.06687850358083645, + "learning_rate": 2e-05, + "loss": 0.4133, + "loss/crossentropy": 2.5214314460754395, + "loss/hidden": 0.14794921875, + "loss/logits": 0.03085363283753395, + "loss/reg": 0.023450734093785286, + "step": 1472 + }, + { + "epoch": 0.7365, + "grad_norm": 2.1167149543762207, + "grad_norm_var": 0.090861085965173, + "learning_rate": 2e-05, + "loss": 0.4895, + "loss/crossentropy": 1.9878064393997192, + "loss/hidden": 0.22265625, + "loss/logits": 0.03239255491644144, + "loss/reg": 0.023448146879673004, + "step": 1473 + }, + { + "epoch": 0.737, + "grad_norm": 1.5386013984680176, + "grad_norm_var": 0.057208890733344654, + "learning_rate": 2e-05, + "loss": 0.4493, + "loss/crossentropy": 2.1566672325134277, + "loss/hidden": 0.1826171875, + "loss/logits": 0.03220840450376272, + "loss/reg": 0.0234454907476902, + "step": 1474 + }, + { + "epoch": 0.7375, + "grad_norm": 1.3006364107131958, + "grad_norm_var": 0.05663883015899618, + "learning_rate": 2e-05, + "loss": 0.4451, + "loss/crossentropy": 2.2811367511749268, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03587420843541622, + "loss/reg": 0.02344280481338501, + "step": 1475 + }, + { + "epoch": 0.738, + "grad_norm": 2.268388271331787, + "grad_norm_var": 0.09514090985834489, + "learning_rate": 2e-05, + "loss": 0.4557, + "loss/crossentropy": 2.2090927362442017, + "loss/hidden": 0.18994140625, + "loss/logits": 0.03136050421744585, + "loss/reg": 0.02344009466469288, + "step": 1476 + }, + { + "epoch": 0.7385, + "grad_norm": 1.514344334602356, + "grad_norm_var": 0.09514418896451105, + "learning_rate": 2e-05, + "loss": 0.4319, + "loss/crossentropy": 2.2778546810150146, + "loss/hidden": 0.16943359375, + "loss/logits": 0.028109371662139893, + "loss/reg": 0.023437298834323883, + "step": 1477 + }, + { + "epoch": 0.739, + "grad_norm": 1.1976886987686157, + "grad_norm_var": 0.09961535847471854, + "learning_rate": 2e-05, + "loss": 0.4028, + "loss/crossentropy": 2.5273643732070923, + "loss/hidden": 0.14794921875, + "loss/logits": 0.02050770726054907, + "loss/reg": 0.02343466505408287, + "step": 1478 + }, + { + "epoch": 0.7395, + "grad_norm": 1.8187288045883179, + "grad_norm_var": 0.10056368997682572, + "learning_rate": 2e-05, + "loss": 0.4755, + "loss/crossentropy": 2.3583481311798096, + "loss/hidden": 0.21044921875, + "loss/logits": 0.030701249837875366, + "loss/reg": 0.023432079702615738, + "step": 1479 + }, + { + "epoch": 0.74, + "grad_norm": 2.092109441757202, + "grad_norm_var": 0.11913386201947915, + "learning_rate": 2e-05, + "loss": 0.57, + "loss/crossentropy": 2.2834445238113403, + "loss/hidden": 0.27978515625, + "loss/logits": 0.05595431476831436, + "loss/reg": 0.023429367691278458, + "step": 1480 + }, + { + "epoch": 0.7405, + "grad_norm": 1.6731066703796387, + "grad_norm_var": 0.11956731584117103, + "learning_rate": 2e-05, + "loss": 0.47, + "loss/crossentropy": 2.29840624332428, + "loss/hidden": 0.19775390625, + "loss/logits": 0.03799319267272949, + "loss/reg": 0.023426661267876625, + "step": 1481 + }, + { + "epoch": 0.741, + "grad_norm": 1.7056869268417358, + "grad_norm_var": 0.12046364336034585, + "learning_rate": 2e-05, + "loss": 0.4905, + "loss/crossentropy": 2.46663236618042, + "loss/hidden": 0.2119140625, + "loss/logits": 0.04434940032660961, + "loss/reg": 0.02342418022453785, + "step": 1482 + }, + { + "epoch": 0.7415, + "grad_norm": 1.5112969875335693, + "grad_norm_var": 0.11976152998951234, + "learning_rate": 2e-05, + "loss": 0.4492, + "loss/crossentropy": 2.5104438066482544, + "loss/hidden": 0.181640625, + "loss/logits": 0.03338887542486191, + "loss/reg": 0.023421762511134148, + "step": 1483 + }, + { + "epoch": 0.742, + "grad_norm": 2.16302227973938, + "grad_norm_var": 0.1272398268385037, + "learning_rate": 2e-05, + "loss": 0.4431, + "loss/crossentropy": 2.4524015188217163, + "loss/hidden": 0.1708984375, + "loss/logits": 0.03798994794487953, + "loss/reg": 0.023419423028826714, + "step": 1484 + }, + { + "epoch": 0.7425, + "grad_norm": 2.6009202003479004, + "grad_norm_var": 0.1727849916739044, + "learning_rate": 2e-05, + "loss": 0.4471, + "loss/crossentropy": 2.459054470062256, + "loss/hidden": 0.18017578125, + "loss/logits": 0.032748810946941376, + "loss/reg": 0.023416871204972267, + "step": 1485 + }, + { + "epoch": 0.743, + "grad_norm": 1.2313926219940186, + "grad_norm_var": 0.177211118899447, + "learning_rate": 2e-05, + "loss": 0.4193, + "loss/crossentropy": 2.4845768213272095, + "loss/hidden": 0.15576171875, + "loss/logits": 0.02941302303224802, + "loss/reg": 0.023414650931954384, + "step": 1486 + }, + { + "epoch": 0.7435, + "grad_norm": 1.3175305128097534, + "grad_norm_var": 0.18776426918117484, + "learning_rate": 2e-05, + "loss": 0.4491, + "loss/crossentropy": 2.5155017375946045, + "loss/hidden": 0.18310546875, + "loss/logits": 0.03185183368623257, + "loss/reg": 0.023412445560097694, + "step": 1487 + }, + { + "epoch": 0.744, + "grad_norm": 1.3121087551116943, + "grad_norm_var": 0.18093261119129972, + "learning_rate": 2e-05, + "loss": 0.4527, + "loss/crossentropy": 2.266068696975708, + "loss/hidden": 0.18359375, + "loss/logits": 0.03500186279416084, + "loss/reg": 0.02340994030237198, + "step": 1488 + }, + { + "epoch": 0.7445, + "grad_norm": 2.539462089538574, + "grad_norm_var": 0.2150192957888348, + "learning_rate": 2e-05, + "loss": 0.5812, + "loss/crossentropy": 2.3054516315460205, + "loss/hidden": 0.29296875, + "loss/logits": 0.05416359752416611, + "loss/reg": 0.023407652974128723, + "step": 1489 + }, + { + "epoch": 0.745, + "grad_norm": 1.8638911247253418, + "grad_norm_var": 0.21304660583959933, + "learning_rate": 2e-05, + "loss": 0.4883, + "loss/crossentropy": 2.217733383178711, + "loss/hidden": 0.2177734375, + "loss/logits": 0.03648427501320839, + "loss/reg": 0.023405244573950768, + "step": 1490 + }, + { + "epoch": 0.7455, + "grad_norm": 1.4341673851013184, + "grad_norm_var": 0.2060377327406276, + "learning_rate": 2e-05, + "loss": 0.4652, + "loss/crossentropy": 2.259947180747986, + "loss/hidden": 0.1962890625, + "loss/logits": 0.03487166576087475, + "loss/reg": 0.02340288832783699, + "step": 1491 + }, + { + "epoch": 0.746, + "grad_norm": 1.286834478378296, + "grad_norm_var": 0.20040431914197107, + "learning_rate": 2e-05, + "loss": 0.4281, + "loss/crossentropy": 2.373807907104492, + "loss/hidden": 0.16259765625, + "loss/logits": 0.03151876013725996, + "loss/reg": 0.02340046875178814, + "step": 1492 + }, + { + "epoch": 0.7465, + "grad_norm": 2.74072527885437, + "grad_norm_var": 0.263410407901218, + "learning_rate": 2e-05, + "loss": 0.5302, + "loss/crossentropy": 2.263616144657135, + "loss/hidden": 0.2568359375, + "loss/logits": 0.039356544613838196, + "loss/reg": 0.02339823544025421, + "step": 1493 + }, + { + "epoch": 0.747, + "grad_norm": 1.9128124713897705, + "grad_norm_var": 0.23979806512014498, + "learning_rate": 2e-05, + "loss": 0.5741, + "loss/crossentropy": 2.0934388637542725, + "loss/hidden": 0.2841796875, + "loss/logits": 0.055920008569955826, + "loss/reg": 0.023396024480462074, + "step": 1494 + }, + { + "epoch": 0.7475, + "grad_norm": 1.9756958484649658, + "grad_norm_var": 0.24120176602785268, + "learning_rate": 2e-05, + "loss": 0.4228, + "loss/crossentropy": 2.3994463682174683, + "loss/hidden": 0.16064453125, + "loss/logits": 0.028181973844766617, + "loss/reg": 0.02339351177215576, + "step": 1495 + }, + { + "epoch": 0.748, + "grad_norm": 1.7724146842956543, + "grad_norm_var": 0.23663205632003587, + "learning_rate": 2e-05, + "loss": 0.461, + "loss/crossentropy": 2.1939653158187866, + "loss/hidden": 0.18994140625, + "loss/logits": 0.03715855535119772, + "loss/reg": 0.023391004651784897, + "step": 1496 + }, + { + "epoch": 0.7485, + "grad_norm": 1.5231564044952393, + "grad_norm_var": 0.24087563457875186, + "learning_rate": 2e-05, + "loss": 0.4467, + "loss/crossentropy": 2.418076753616333, + "loss/hidden": 0.17919921875, + "loss/logits": 0.03366350382566452, + "loss/reg": 0.02338848076760769, + "step": 1497 + }, + { + "epoch": 0.749, + "grad_norm": 1.3560765981674194, + "grad_norm_var": 0.2531766876431429, + "learning_rate": 2e-05, + "loss": 0.4376, + "loss/crossentropy": 2.092953681945801, + "loss/hidden": 0.16796875, + "loss/logits": 0.035764566622674465, + "loss/reg": 0.023386115208268166, + "step": 1498 + }, + { + "epoch": 0.7495, + "grad_norm": 2.6173150539398193, + "grad_norm_var": 0.28943914508452623, + "learning_rate": 2e-05, + "loss": 0.5938, + "loss/crossentropy": 2.3732458353042603, + "loss/hidden": 0.291015625, + "loss/logits": 0.06896837241947651, + "loss/reg": 0.023383593186736107, + "step": 1499 + }, + { + "epoch": 0.75, + "grad_norm": 2.2990541458129883, + "grad_norm_var": 0.2962192790031487, + "learning_rate": 2e-05, + "loss": 0.5086, + "loss/crossentropy": 2.3469722270965576, + "loss/hidden": 0.24072265625, + "loss/logits": 0.03404225967824459, + "loss/reg": 0.02338109351694584, + "step": 1500 + }, + { + "epoch": 0.7505, + "grad_norm": 6.053563594818115, + "grad_norm_var": 1.3816725595266346, + "learning_rate": 2e-05, + "loss": 0.8284, + "loss/crossentropy": 2.2309868335723877, + "loss/hidden": 0.4892578125, + "loss/logits": 0.1053722184151411, + "loss/reg": 0.023378517478704453, + "step": 1501 + }, + { + "epoch": 0.751, + "grad_norm": 1.4381011724472046, + "grad_norm_var": 1.361029946092843, + "learning_rate": 2e-05, + "loss": 0.4233, + "loss/crossentropy": 2.3353075981140137, + "loss/hidden": 0.15869140625, + "loss/logits": 0.030879972502589226, + "loss/reg": 0.023375999182462692, + "step": 1502 + }, + { + "epoch": 0.7515, + "grad_norm": 1.7041223049163818, + "grad_norm_var": 1.330544016606859, + "learning_rate": 2e-05, + "loss": 0.4519, + "loss/crossentropy": 2.3510212898254395, + "loss/hidden": 0.18310546875, + "loss/logits": 0.035089364275336266, + "loss/reg": 0.023373527452349663, + "step": 1503 + }, + { + "epoch": 0.752, + "grad_norm": 1.176741600036621, + "grad_norm_var": 1.346168787370407, + "learning_rate": 2e-05, + "loss": 0.4271, + "loss/crossentropy": 2.301971435546875, + "loss/hidden": 0.162109375, + "loss/logits": 0.03127031493932009, + "loss/reg": 0.023370975628495216, + "step": 1504 + }, + { + "epoch": 0.7525, + "grad_norm": 1.3868812322616577, + "grad_norm_var": 1.362565183966303, + "learning_rate": 2e-05, + "loss": 0.4299, + "loss/crossentropy": 2.300473690032959, + "loss/hidden": 0.1650390625, + "loss/logits": 0.031166426837444305, + "loss/reg": 0.023368434980511665, + "step": 1505 + }, + { + "epoch": 0.753, + "grad_norm": 1.5717381238937378, + "grad_norm_var": 1.3745201891776084, + "learning_rate": 2e-05, + "loss": 0.434, + "loss/crossentropy": 2.2145345211029053, + "loss/hidden": 0.1650390625, + "loss/logits": 0.03533552121371031, + "loss/reg": 0.02336590550839901, + "step": 1506 + }, + { + "epoch": 0.7535, + "grad_norm": 1.5625638961791992, + "grad_norm_var": 1.3655969008810318, + "learning_rate": 2e-05, + "loss": 0.4732, + "loss/crossentropy": 2.4806735515594482, + "loss/hidden": 0.19384765625, + "loss/logits": 0.045676751993596554, + "loss/reg": 0.02336341328918934, + "step": 1507 + }, + { + "epoch": 0.754, + "grad_norm": 1.5724704265594482, + "grad_norm_var": 1.3426361132111952, + "learning_rate": 2e-05, + "loss": 0.458, + "loss/crossentropy": 2.5147154331207275, + "loss/hidden": 0.189453125, + "loss/logits": 0.03498086519539356, + "loss/reg": 0.02336088940501213, + "step": 1508 + }, + { + "epoch": 0.7545, + "grad_norm": 1.2432793378829956, + "grad_norm_var": 1.3431686166198147, + "learning_rate": 2e-05, + "loss": 0.3892, + "loss/crossentropy": 2.441771388053894, + "loss/hidden": 0.13134765625, + "loss/logits": 0.024279465898871422, + "loss/reg": 0.023358337581157684, + "step": 1509 + }, + { + "epoch": 0.755, + "grad_norm": 1.1907211542129517, + "grad_norm_var": 1.3791328093235484, + "learning_rate": 2e-05, + "loss": 0.4286, + "loss/crossentropy": 2.4157201051712036, + "loss/hidden": 0.1650390625, + "loss/logits": 0.029976122081279755, + "loss/reg": 0.023355863988399506, + "step": 1510 + }, + { + "epoch": 0.7555, + "grad_norm": 1.7555755376815796, + "grad_norm_var": 1.3800200121858432, + "learning_rate": 2e-05, + "loss": 0.4285, + "loss/crossentropy": 2.5669732093811035, + "loss/hidden": 0.16259765625, + "loss/logits": 0.032341357320547104, + "loss/reg": 0.02335333824157715, + "step": 1511 + }, + { + "epoch": 0.756, + "grad_norm": 1.4463238716125488, + "grad_norm_var": 1.3917343393376849, + "learning_rate": 2e-05, + "loss": 0.4156, + "loss/crossentropy": 2.3183934688568115, + "loss/hidden": 0.1494140625, + "loss/logits": 0.032720635645091534, + "loss/reg": 0.02335066720843315, + "step": 1512 + }, + { + "epoch": 0.7565, + "grad_norm": 1.8817647695541382, + "grad_norm_var": 1.3832543893532858, + "learning_rate": 2e-05, + "loss": 0.4843, + "loss/crossentropy": 2.045651853084564, + "loss/hidden": 0.21044921875, + "loss/logits": 0.040373530238866806, + "loss/reg": 0.023348016664385796, + "step": 1513 + }, + { + "epoch": 0.757, + "grad_norm": 2.0054848194122314, + "grad_norm_var": 1.3632931739013794, + "learning_rate": 2e-05, + "loss": 0.6077, + "loss/crossentropy": 2.3150511980056763, + "loss/hidden": 0.30810546875, + "loss/logits": 0.066120695322752, + "loss/reg": 0.02334539033472538, + "step": 1514 + }, + { + "epoch": 0.7575, + "grad_norm": 1.4316192865371704, + "grad_norm_var": 1.3427547339581412, + "learning_rate": 2e-05, + "loss": 0.4183, + "loss/crossentropy": 2.3082213401794434, + "loss/hidden": 0.1552734375, + "loss/logits": 0.02960424032062292, + "loss/reg": 0.023342687636613846, + "step": 1515 + }, + { + "epoch": 0.758, + "grad_norm": 1.4792301654815674, + "grad_norm_var": 1.3364955062616057, + "learning_rate": 2e-05, + "loss": 0.4542, + "loss/crossentropy": 2.229594111442566, + "loss/hidden": 0.18701171875, + "loss/logits": 0.03382623475044966, + "loss/reg": 0.023340150713920593, + "step": 1516 + }, + { + "epoch": 0.7585, + "grad_norm": 1.378159523010254, + "grad_norm_var": 0.05499430187053349, + "learning_rate": 2e-05, + "loss": 0.4331, + "loss/crossentropy": 2.3700714111328125, + "loss/hidden": 0.16455078125, + "loss/logits": 0.035162342712283134, + "loss/reg": 0.02333764359354973, + "step": 1517 + }, + { + "epoch": 0.759, + "grad_norm": 1.4192622900009155, + "grad_norm_var": 0.05520725190068181, + "learning_rate": 2e-05, + "loss": 0.4744, + "loss/crossentropy": 2.1727020740509033, + "loss/hidden": 0.19970703125, + "loss/logits": 0.04138432815670967, + "loss/reg": 0.023334944620728493, + "step": 1518 + }, + { + "epoch": 0.7595, + "grad_norm": 1.5111662149429321, + "grad_norm_var": 0.052613845086676686, + "learning_rate": 2e-05, + "loss": 0.4544, + "loss/crossentropy": 2.2911019325256348, + "loss/hidden": 0.18505859375, + "loss/logits": 0.035999225452542305, + "loss/reg": 0.02333231456577778, + "step": 1519 + }, + { + "epoch": 0.76, + "grad_norm": 1.5904415845870972, + "grad_norm_var": 0.04543488593400274, + "learning_rate": 2e-05, + "loss": 0.411, + "loss/crossentropy": 2.3194239139556885, + "loss/hidden": 0.15380859375, + "loss/logits": 0.023918326012790203, + "loss/reg": 0.023329300805926323, + "step": 1520 + }, + { + "epoch": 0.7605, + "grad_norm": 1.1604093313217163, + "grad_norm_var": 0.0528615068401732, + "learning_rate": 2e-05, + "loss": 0.4225, + "loss/crossentropy": 2.206283152103424, + "loss/hidden": 0.1591796875, + "loss/logits": 0.03008684329688549, + "loss/reg": 0.023326555266976357, + "step": 1521 + }, + { + "epoch": 0.761, + "grad_norm": 1.8183667659759521, + "grad_norm_var": 0.05861065574009997, + "learning_rate": 2e-05, + "loss": 0.4382, + "loss/crossentropy": 2.4522966146469116, + "loss/hidden": 0.17578125, + "loss/logits": 0.02915840595960617, + "loss/reg": 0.023324020206928253, + "step": 1522 + }, + { + "epoch": 0.7615, + "grad_norm": 1.9318912029266357, + "grad_norm_var": 0.06884144736975527, + "learning_rate": 2e-05, + "loss": 0.4305, + "loss/crossentropy": 2.4032152891159058, + "loss/hidden": 0.16259765625, + "loss/logits": 0.03465164825320244, + "loss/reg": 0.023321056738495827, + "step": 1523 + }, + { + "epoch": 0.762, + "grad_norm": 1.3856819868087769, + "grad_norm_var": 0.07048760261173755, + "learning_rate": 2e-05, + "loss": 0.4574, + "loss/crossentropy": 2.4888235330581665, + "loss/hidden": 0.1904296875, + "loss/logits": 0.03376224543899298, + "loss/reg": 0.023318205028772354, + "step": 1524 + }, + { + "epoch": 0.7625, + "grad_norm": 2.0698816776275635, + "grad_norm_var": 0.08056257023113833, + "learning_rate": 2e-05, + "loss": 0.4702, + "loss/crossentropy": 2.4298810958862305, + "loss/hidden": 0.19677734375, + "loss/logits": 0.04024036321789026, + "loss/reg": 0.02331569977104664, + "step": 1525 + }, + { + "epoch": 0.763, + "grad_norm": 1.7094788551330566, + "grad_norm_var": 0.06969563841946425, + "learning_rate": 2e-05, + "loss": 0.4486, + "loss/crossentropy": 2.4382940530776978, + "loss/hidden": 0.1845703125, + "loss/logits": 0.03088864777237177, + "loss/reg": 0.023313157260417938, + "step": 1526 + }, + { + "epoch": 0.7635, + "grad_norm": 1.7802170515060425, + "grad_norm_var": 0.07016778667789912, + "learning_rate": 2e-05, + "loss": 0.4732, + "loss/crossentropy": 2.4365394115448, + "loss/hidden": 0.2001953125, + "loss/logits": 0.039911434054374695, + "loss/reg": 0.023310648277401924, + "step": 1527 + }, + { + "epoch": 0.764, + "grad_norm": 2.6927785873413086, + "grad_norm_var": 0.13758242415195784, + "learning_rate": 2e-05, + "loss": 0.645, + "loss/crossentropy": 2.0314077138900757, + "loss/hidden": 0.35546875, + "loss/logits": 0.056443119421601295, + "loss/reg": 0.02330797351896763, + "step": 1528 + }, + { + "epoch": 0.7645, + "grad_norm": 2.23351788520813, + "grad_norm_var": 0.15370605581981486, + "learning_rate": 2e-05, + "loss": 0.4534, + "loss/crossentropy": 2.5266857147216797, + "loss/hidden": 0.18896484375, + "loss/logits": 0.03136393055319786, + "loss/reg": 0.023305490612983704, + "step": 1529 + }, + { + "epoch": 0.765, + "grad_norm": 1.495396375656128, + "grad_norm_var": 0.1508814132006193, + "learning_rate": 2e-05, + "loss": 0.4171, + "loss/crossentropy": 2.4519113302230835, + "loss/hidden": 0.15283203125, + "loss/logits": 0.031247646547853947, + "loss/reg": 0.02330303005874157, + "step": 1530 + }, + { + "epoch": 0.7655, + "grad_norm": 2.117763042449951, + "grad_norm_var": 0.15639622485214394, + "learning_rate": 2e-05, + "loss": 0.4553, + "loss/crossentropy": 2.728012442588806, + "loss/hidden": 0.18408203125, + "loss/logits": 0.03820735961198807, + "loss/reg": 0.023300379514694214, + "step": 1531 + }, + { + "epoch": 0.766, + "grad_norm": 1.2518669366836548, + "grad_norm_var": 0.16740663803541475, + "learning_rate": 2e-05, + "loss": 0.4509, + "loss/crossentropy": 2.3151432275772095, + "loss/hidden": 0.18408203125, + "loss/logits": 0.033854938112199306, + "loss/reg": 0.023297840729355812, + "step": 1532 + }, + { + "epoch": 0.7665, + "grad_norm": 1.6661626100540161, + "grad_norm_var": 0.15940086312676746, + "learning_rate": 2e-05, + "loss": 0.4237, + "loss/crossentropy": 2.624950885772705, + "loss/hidden": 0.15966796875, + "loss/logits": 0.03104830253869295, + "loss/reg": 0.023295121267437935, + "step": 1533 + }, + { + "epoch": 0.767, + "grad_norm": 1.2690476179122925, + "grad_norm_var": 0.1672279185359154, + "learning_rate": 2e-05, + "loss": 0.4322, + "loss/crossentropy": 2.2488889694213867, + "loss/hidden": 0.17041015625, + "loss/logits": 0.02886138390749693, + "loss/reg": 0.02329253777861595, + "step": 1534 + }, + { + "epoch": 0.7675, + "grad_norm": 1.4908874034881592, + "grad_norm_var": 0.16784599970408365, + "learning_rate": 2e-05, + "loss": 0.4199, + "loss/crossentropy": 2.32344913482666, + "loss/hidden": 0.1572265625, + "loss/logits": 0.029767291620373726, + "loss/reg": 0.023290077224373817, + "step": 1535 + }, + { + "epoch": 0.768, + "grad_norm": 1.5248539447784424, + "grad_norm_var": 0.1693264389141717, + "learning_rate": 2e-05, + "loss": 0.4806, + "loss/crossentropy": 2.282503128051758, + "loss/hidden": 0.20556640625, + "loss/logits": 0.042184172198176384, + "loss/reg": 0.02328774333000183, + "step": 1536 + }, + { + "epoch": 0.7685, + "grad_norm": 1.3552334308624268, + "grad_norm_var": 0.1570355202480712, + "learning_rate": 2e-05, + "loss": 0.4453, + "loss/crossentropy": 2.2128005027770996, + "loss/hidden": 0.1787109375, + "loss/logits": 0.03375644236803055, + "loss/reg": 0.02328518033027649, + "step": 1537 + }, + { + "epoch": 0.769, + "grad_norm": 1.7702122926712036, + "grad_norm_var": 0.15665843688097023, + "learning_rate": 2e-05, + "loss": 0.4218, + "loss/crossentropy": 2.2681163549423218, + "loss/hidden": 0.16259765625, + "loss/logits": 0.026351372711360455, + "loss/reg": 0.02328294701874256, + "step": 1538 + }, + { + "epoch": 0.7695, + "grad_norm": 2.065890073776245, + "grad_norm_var": 0.161315321835504, + "learning_rate": 2e-05, + "loss": 0.5305, + "loss/crossentropy": 2.223512649536133, + "loss/hidden": 0.25048828125, + "loss/logits": 0.047157226130366325, + "loss/reg": 0.023280519992113113, + "step": 1539 + }, + { + "epoch": 0.77, + "grad_norm": 1.4778271913528442, + "grad_norm_var": 0.15746298503991624, + "learning_rate": 2e-05, + "loss": 0.4345, + "loss/crossentropy": 2.3824329376220703, + "loss/hidden": 0.1669921875, + "loss/logits": 0.03470621630549431, + "loss/reg": 0.023278141394257545, + "step": 1540 + }, + { + "epoch": 0.7705, + "grad_norm": 1.643192172050476, + "grad_norm_var": 0.15054023023162647, + "learning_rate": 2e-05, + "loss": 0.4223, + "loss/crossentropy": 2.65035343170166, + "loss/hidden": 0.15673828125, + "loss/logits": 0.032792385667562485, + "loss/reg": 0.023275921121239662, + "step": 1541 + }, + { + "epoch": 0.771, + "grad_norm": 1.5637279748916626, + "grad_norm_var": 0.15210194531488597, + "learning_rate": 2e-05, + "loss": 0.4743, + "loss/crossentropy": 2.5058376789093018, + "loss/hidden": 0.2060546875, + "loss/logits": 0.03547433018684387, + "loss/reg": 0.023273425176739693, + "step": 1542 + }, + { + "epoch": 0.7715, + "grad_norm": 5.894736289978027, + "grad_norm_var": 1.2473798526593487, + "learning_rate": 2e-05, + "loss": 0.6759, + "loss/crossentropy": 2.7393654584884644, + "loss/hidden": 0.3525390625, + "loss/logits": 0.09067841898649931, + "loss/reg": 0.023270903155207634, + "step": 1543 + }, + { + "epoch": 0.772, + "grad_norm": 1.4357421398162842, + "grad_norm_var": 1.2249250941187129, + "learning_rate": 2e-05, + "loss": 0.4196, + "loss/crossentropy": 2.1938605308532715, + "loss/hidden": 0.158203125, + "loss/logits": 0.028688468039035797, + "loss/reg": 0.023268546909093857, + "step": 1544 + }, + { + "epoch": 0.7725, + "grad_norm": 1.4271492958068848, + "grad_norm_var": 1.2287387850562255, + "learning_rate": 2e-05, + "loss": 0.4829, + "loss/crossentropy": 2.1993138790130615, + "loss/hidden": 0.21240234375, + "loss/logits": 0.03782237879931927, + "loss/reg": 0.023266203701496124, + "step": 1545 + }, + { + "epoch": 0.773, + "grad_norm": 1.6962809562683105, + "grad_norm_var": 1.2220146551281785, + "learning_rate": 2e-05, + "loss": 0.4143, + "loss/crossentropy": 2.3468743562698364, + "loss/hidden": 0.15380859375, + "loss/logits": 0.027830702252686024, + "loss/reg": 0.023263977840542793, + "step": 1546 + }, + { + "epoch": 0.7735, + "grad_norm": 1.708454966545105, + "grad_norm_var": 1.2180449645963336, + "learning_rate": 2e-05, + "loss": 0.437, + "loss/crossentropy": 2.366321086883545, + "loss/hidden": 0.17626953125, + "loss/logits": 0.028090238571166992, + "loss/reg": 0.023261502385139465, + "step": 1547 + }, + { + "epoch": 0.774, + "grad_norm": 2.553924083709717, + "grad_norm_var": 1.2240565005172073, + "learning_rate": 2e-05, + "loss": 0.6236, + "loss/crossentropy": 2.093143939971924, + "loss/hidden": 0.32373046875, + "loss/logits": 0.06730393506586552, + "loss/reg": 0.023259302601218224, + "step": 1548 + }, + { + "epoch": 0.7745, + "grad_norm": 1.2026230096817017, + "grad_norm_var": 1.2524918261951337, + "learning_rate": 2e-05, + "loss": 0.4183, + "loss/crossentropy": 2.4064027070999146, + "loss/hidden": 0.1572265625, + "loss/logits": 0.028545232489705086, + "loss/reg": 0.02325684390962124, + "step": 1549 + }, + { + "epoch": 0.775, + "grad_norm": 1.5950380563735962, + "grad_norm_var": 1.2325789918369907, + "learning_rate": 2e-05, + "loss": 0.3969, + "loss/crossentropy": 2.371984839439392, + "loss/hidden": 0.140625, + "loss/logits": 0.02372877486050129, + "loss/reg": 0.02325470745563507, + "step": 1550 + }, + { + "epoch": 0.7755, + "grad_norm": 1.067559003829956, + "grad_norm_var": 1.2668916559295922, + "learning_rate": 2e-05, + "loss": 0.4115, + "loss/crossentropy": 2.2676392793655396, + "loss/hidden": 0.15478515625, + "loss/logits": 0.0241701677441597, + "loss/reg": 0.023252317681908607, + "step": 1551 + }, + { + "epoch": 0.776, + "grad_norm": 2.6447858810424805, + "grad_norm_var": 1.2931606651566094, + "learning_rate": 2e-05, + "loss": 0.4562, + "loss/crossentropy": 2.527026653289795, + "loss/hidden": 0.19482421875, + "loss/logits": 0.028835158795118332, + "loss/reg": 0.0232497937977314, + "step": 1552 + }, + { + "epoch": 0.7765, + "grad_norm": 1.3525029420852661, + "grad_norm_var": 1.2933754435969356, + "learning_rate": 2e-05, + "loss": 0.4655, + "loss/crossentropy": 2.107416331768036, + "loss/hidden": 0.1982421875, + "loss/logits": 0.0347739988937974, + "loss/reg": 0.023247426375746727, + "step": 1553 + }, + { + "epoch": 0.777, + "grad_norm": 1.3164350986480713, + "grad_norm_var": 1.3167433755836309, + "learning_rate": 2e-05, + "loss": 0.4165, + "loss/crossentropy": 2.509757399559021, + "loss/hidden": 0.15673828125, + "loss/logits": 0.027274997904896736, + "loss/reg": 0.023244967684149742, + "step": 1554 + }, + { + "epoch": 0.7775, + "grad_norm": 1.9293817281723022, + "grad_norm_var": 1.3151683429148335, + "learning_rate": 2e-05, + "loss": 0.4151, + "loss/crossentropy": 2.647552251815796, + "loss/hidden": 0.158203125, + "loss/logits": 0.024518443271517754, + "loss/reg": 0.023242756724357605, + "step": 1555 + }, + { + "epoch": 0.778, + "grad_norm": 1.7657341957092285, + "grad_norm_var": 1.3038804133117678, + "learning_rate": 2e-05, + "loss": 0.4489, + "loss/crossentropy": 2.2430570125579834, + "loss/hidden": 0.17822265625, + "loss/logits": 0.038234325125813484, + "loss/reg": 0.02324022725224495, + "step": 1556 + }, + { + "epoch": 0.7785, + "grad_norm": 2.497610330581665, + "grad_norm_var": 1.3174225363238234, + "learning_rate": 2e-05, + "loss": 0.4612, + "loss/crossentropy": 2.505717158317566, + "loss/hidden": 0.19775390625, + "loss/logits": 0.03111663181334734, + "loss/reg": 0.023237932473421097, + "step": 1557 + }, + { + "epoch": 0.779, + "grad_norm": 1.704455852508545, + "grad_norm_var": 1.3108827016120235, + "learning_rate": 2e-05, + "loss": 0.4357, + "loss/crossentropy": 2.389075994491577, + "loss/hidden": 0.1767578125, + "loss/logits": 0.02655597310513258, + "loss/reg": 0.02323562279343605, + "step": 1558 + }, + { + "epoch": 0.7795, + "grad_norm": 1.900311827659607, + "grad_norm_var": 0.22688966028548616, + "learning_rate": 2e-05, + "loss": 0.5081, + "loss/crossentropy": 2.4398266077041626, + "loss/hidden": 0.23583984375, + "loss/logits": 0.03995893709361553, + "loss/reg": 0.023233113810420036, + "step": 1559 + }, + { + "epoch": 0.78, + "grad_norm": 1.3378366231918335, + "grad_norm_var": 0.2314262808823021, + "learning_rate": 2e-05, + "loss": 0.4106, + "loss/crossentropy": 2.1936975717544556, + "loss/hidden": 0.15576171875, + "loss/logits": 0.022547971457242966, + "loss/reg": 0.023230722174048424, + "step": 1560 + }, + { + "epoch": 0.7805, + "grad_norm": 1.7965880632400513, + "grad_norm_var": 0.22497679016718142, + "learning_rate": 2e-05, + "loss": 0.446, + "loss/crossentropy": 2.281595468521118, + "loss/hidden": 0.18359375, + "loss/logits": 0.030129313468933105, + "loss/reg": 0.02322840318083763, + "step": 1561 + }, + { + "epoch": 0.781, + "grad_norm": 1.6652514934539795, + "grad_norm_var": 0.22527719371189883, + "learning_rate": 2e-05, + "loss": 0.4288, + "loss/crossentropy": 2.3167933225631714, + "loss/hidden": 0.17041015625, + "loss/logits": 0.02617516089230776, + "loss/reg": 0.02322593703866005, + "step": 1562 + }, + { + "epoch": 0.7815, + "grad_norm": 1.6363804340362549, + "grad_norm_var": 0.2260242298357046, + "learning_rate": 2e-05, + "loss": 0.4647, + "loss/crossentropy": 1.9782673716545105, + "loss/hidden": 0.20556640625, + "loss/logits": 0.026893844828009605, + "loss/reg": 0.023223651573061943, + "step": 1563 + }, + { + "epoch": 0.782, + "grad_norm": 1.7427809238433838, + "grad_norm_var": 0.1799729760451602, + "learning_rate": 2e-05, + "loss": 0.4594, + "loss/crossentropy": 2.475069522857666, + "loss/hidden": 0.1875, + "loss/logits": 0.03972475230693817, + "loss/reg": 0.023221155628561974, + "step": 1564 + }, + { + "epoch": 0.7825, + "grad_norm": 1.4905965328216553, + "grad_norm_var": 0.16616583137613528, + "learning_rate": 2e-05, + "loss": 0.4287, + "loss/crossentropy": 2.290923833847046, + "loss/hidden": 0.16943359375, + "loss/logits": 0.027077090926468372, + "loss/reg": 0.023218607529997826, + "step": 1565 + }, + { + "epoch": 0.783, + "grad_norm": 1.8255786895751953, + "grad_norm_var": 0.16579392065956847, + "learning_rate": 2e-05, + "loss": 0.4183, + "loss/crossentropy": 2.6674692630767822, + "loss/hidden": 0.1591796875, + "loss/logits": 0.026930052787065506, + "loss/reg": 0.023216072469949722, + "step": 1566 + }, + { + "epoch": 0.7835, + "grad_norm": 2.251720428466797, + "grad_norm_var": 0.14890348739905898, + "learning_rate": 2e-05, + "loss": 0.4939, + "loss/crossentropy": 2.6777660846710205, + "loss/hidden": 0.22119140625, + "loss/logits": 0.040533529594540596, + "loss/reg": 0.023213520646095276, + "step": 1567 + }, + { + "epoch": 0.784, + "grad_norm": 1.9184706211090088, + "grad_norm_var": 0.10041432594898173, + "learning_rate": 2e-05, + "loss": 0.4455, + "loss/crossentropy": 2.48405659198761, + "loss/hidden": 0.18017578125, + "loss/logits": 0.03325136937201023, + "loss/reg": 0.0232110396027565, + "step": 1568 + }, + { + "epoch": 0.7845, + "grad_norm": 2.3974111080169678, + "grad_norm_var": 0.11212794269452289, + "learning_rate": 2e-05, + "loss": 0.5316, + "loss/crossentropy": 2.4565058946609497, + "loss/hidden": 0.24951171875, + "loss/logits": 0.04996338486671448, + "loss/reg": 0.023208467289805412, + "step": 1569 + }, + { + "epoch": 0.785, + "grad_norm": 1.3549920320510864, + "grad_norm_var": 0.10961390038742369, + "learning_rate": 2e-05, + "loss": 0.4089, + "loss/crossentropy": 2.359605550765991, + "loss/hidden": 0.1484375, + "loss/logits": 0.028434154577553272, + "loss/reg": 0.023205863311886787, + "step": 1570 + }, + { + "epoch": 0.7855, + "grad_norm": 1.2709044218063354, + "grad_norm_var": 0.12763188642899853, + "learning_rate": 2e-05, + "loss": 0.45, + "loss/crossentropy": 2.117920219898224, + "loss/hidden": 0.1826171875, + "loss/logits": 0.035326533019542694, + "loss/reg": 0.023203279823064804, + "step": 1571 + }, + { + "epoch": 0.786, + "grad_norm": 1.4363126754760742, + "grad_norm_var": 0.13525123557490268, + "learning_rate": 2e-05, + "loss": 0.4487, + "loss/crossentropy": 2.070175528526306, + "loss/hidden": 0.1875, + "loss/logits": 0.029196069575846195, + "loss/reg": 0.023200761526823044, + "step": 1572 + }, + { + "epoch": 0.7865, + "grad_norm": 1.4775161743164062, + "grad_norm_var": 0.10053524622992847, + "learning_rate": 2e-05, + "loss": 0.446, + "loss/crossentropy": 2.640804171562195, + "loss/hidden": 0.17919921875, + "loss/logits": 0.034833875484764576, + "loss/reg": 0.02319827489554882, + "step": 1573 + }, + { + "epoch": 0.787, + "grad_norm": 1.5340831279754639, + "grad_norm_var": 0.10225829614935306, + "learning_rate": 2e-05, + "loss": 0.4402, + "loss/crossentropy": 2.298627734184265, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03342457953840494, + "loss/reg": 0.02319585159420967, + "step": 1574 + }, + { + "epoch": 0.7875, + "grad_norm": 1.4753564596176147, + "grad_norm_var": 0.10161700731717402, + "learning_rate": 2e-05, + "loss": 0.4379, + "loss/crossentropy": 2.3629835844039917, + "loss/hidden": 0.17626953125, + "loss/logits": 0.029691355302929878, + "loss/reg": 0.02319331094622612, + "step": 1575 + }, + { + "epoch": 0.788, + "grad_norm": 2.231339454650879, + "grad_norm_var": 0.11274765054892152, + "learning_rate": 2e-05, + "loss": 0.5219, + "loss/crossentropy": 2.3111387491226196, + "loss/hidden": 0.25048828125, + "loss/logits": 0.03952763415873051, + "loss/reg": 0.023190749809145927, + "step": 1576 + }, + { + "epoch": 0.7885, + "grad_norm": 1.3988337516784668, + "grad_norm_var": 0.11852513456262694, + "learning_rate": 2e-05, + "loss": 0.4242, + "loss/crossentropy": 2.541161060333252, + "loss/hidden": 0.16259765625, + "loss/logits": 0.029688138514757156, + "loss/reg": 0.023188097402453423, + "step": 1577 + }, + { + "epoch": 0.789, + "grad_norm": 1.5263261795043945, + "grad_norm_var": 0.12026800389912082, + "learning_rate": 2e-05, + "loss": 0.4356, + "loss/crossentropy": 2.4031816720962524, + "loss/hidden": 0.17431640625, + "loss/logits": 0.02942817658185959, + "loss/reg": 0.023185575380921364, + "step": 1578 + }, + { + "epoch": 0.7895, + "grad_norm": 1.917905330657959, + "grad_norm_var": 0.12337632181772822, + "learning_rate": 2e-05, + "loss": 0.4671, + "loss/crossentropy": 2.400526762008667, + "loss/hidden": 0.2021484375, + "loss/logits": 0.033078462816774845, + "loss/reg": 0.023183133453130722, + "step": 1579 + }, + { + "epoch": 0.79, + "grad_norm": 2.0703365802764893, + "grad_norm_var": 0.13181370320904567, + "learning_rate": 2e-05, + "loss": 0.4093, + "loss/crossentropy": 1.912480115890503, + "loss/hidden": 0.15869140625, + "loss/logits": 0.018787679262459278, + "loss/reg": 0.023180615156888962, + "step": 1580 + }, + { + "epoch": 0.7905, + "grad_norm": 2.055941343307495, + "grad_norm_var": 0.1342255915417723, + "learning_rate": 2e-05, + "loss": 0.526, + "loss/crossentropy": 2.1755658388137817, + "loss/hidden": 0.25927734375, + "loss/logits": 0.03490355238318443, + "loss/reg": 0.02317827008664608, + "step": 1581 + }, + { + "epoch": 0.791, + "grad_norm": 1.1831824779510498, + "grad_norm_var": 0.15430979289185978, + "learning_rate": 2e-05, + "loss": 0.4029, + "loss/crossentropy": 2.3617440462112427, + "loss/hidden": 0.14697265625, + "loss/logits": 0.024197732098400593, + "loss/reg": 0.02317577414214611, + "step": 1582 + }, + { + "epoch": 0.7915, + "grad_norm": 1.4519609212875366, + "grad_norm_var": 0.13745687144184232, + "learning_rate": 2e-05, + "loss": 0.4694, + "loss/crossentropy": 2.295761823654175, + "loss/hidden": 0.19921875, + "loss/logits": 0.03844046592712402, + "loss/reg": 0.02317335642874241, + "step": 1583 + }, + { + "epoch": 0.792, + "grad_norm": 1.534197449684143, + "grad_norm_var": 0.13389399149251766, + "learning_rate": 2e-05, + "loss": 0.4531, + "loss/crossentropy": 2.2924489974975586, + "loss/hidden": 0.1767578125, + "loss/logits": 0.04463693127036095, + "loss/reg": 0.023170989006757736, + "step": 1584 + }, + { + "epoch": 0.7925, + "grad_norm": 1.8300005197525024, + "grad_norm_var": 0.09707661533023403, + "learning_rate": 2e-05, + "loss": 0.5176, + "loss/crossentropy": 2.3310474157333374, + "loss/hidden": 0.23486328125, + "loss/logits": 0.05109906196594238, + "loss/reg": 0.023168709129095078, + "step": 1585 + }, + { + "epoch": 0.793, + "grad_norm": 1.2439618110656738, + "grad_norm_var": 0.10161223968455312, + "learning_rate": 2e-05, + "loss": 0.4037, + "loss/crossentropy": 2.2324079275131226, + "loss/hidden": 0.14453125, + "loss/logits": 0.027506624348461628, + "loss/reg": 0.02316616289317608, + "step": 1586 + }, + { + "epoch": 0.7935, + "grad_norm": 1.6163288354873657, + "grad_norm_var": 0.0938027555024466, + "learning_rate": 2e-05, + "loss": 0.4392, + "loss/crossentropy": 2.536410927772522, + "loss/hidden": 0.169921875, + "loss/logits": 0.037604911252856255, + "loss/reg": 0.023163635283708572, + "step": 1587 + }, + { + "epoch": 0.794, + "grad_norm": 1.1579729318618774, + "grad_norm_var": 0.10560929736321494, + "learning_rate": 2e-05, + "loss": 0.4123, + "loss/crossentropy": 2.4252489805221558, + "loss/hidden": 0.150390625, + "loss/logits": 0.03028416447341442, + "loss/reg": 0.023161334916949272, + "step": 1588 + }, + { + "epoch": 0.7945, + "grad_norm": 2.056169271469116, + "grad_norm_var": 0.11657917936412522, + "learning_rate": 2e-05, + "loss": 0.4863, + "loss/crossentropy": 2.3423261642456055, + "loss/hidden": 0.20361328125, + "loss/logits": 0.05107624363154173, + "loss/reg": 0.023159068077802658, + "step": 1589 + }, + { + "epoch": 0.795, + "grad_norm": 1.2322633266448975, + "grad_norm_var": 0.12664541026909054, + "learning_rate": 2e-05, + "loss": 0.4201, + "loss/crossentropy": 2.258147120475769, + "loss/hidden": 0.15966796875, + "loss/logits": 0.028841855004429817, + "loss/reg": 0.023156482726335526, + "step": 1590 + }, + { + "epoch": 0.7955, + "grad_norm": 1.4345581531524658, + "grad_norm_var": 0.12755737501392914, + "learning_rate": 2e-05, + "loss": 0.4291, + "loss/crossentropy": 2.4102907180786133, + "loss/hidden": 0.169921875, + "loss/logits": 0.027651555836200714, + "loss/reg": 0.023154061287641525, + "step": 1591 + }, + { + "epoch": 0.796, + "grad_norm": 2.623196601867676, + "grad_norm_var": 0.16902592388542997, + "learning_rate": 2e-05, + "loss": 0.5085, + "loss/crossentropy": 2.4452123641967773, + "loss/hidden": 0.23828125, + "loss/logits": 0.03867449425160885, + "loss/reg": 0.023151807487010956, + "step": 1592 + }, + { + "epoch": 0.7965, + "grad_norm": 1.1513710021972656, + "grad_norm_var": 0.18100263857273305, + "learning_rate": 2e-05, + "loss": 0.439, + "loss/crossentropy": 2.2152241468429565, + "loss/hidden": 0.1767578125, + "loss/logits": 0.030737859196960926, + "loss/reg": 0.023149540647864342, + "step": 1593 + }, + { + "epoch": 0.797, + "grad_norm": 1.4377129077911377, + "grad_norm_var": 0.18272251392224484, + "learning_rate": 2e-05, + "loss": 0.4097, + "loss/crossentropy": 2.3043720722198486, + "loss/hidden": 0.15283203125, + "loss/logits": 0.025365683250129223, + "loss/reg": 0.023146886378526688, + "step": 1594 + }, + { + "epoch": 0.7975, + "grad_norm": 1.639028549194336, + "grad_norm_var": 0.1766851802806513, + "learning_rate": 2e-05, + "loss": 0.4636, + "loss/crossentropy": 2.422786235809326, + "loss/hidden": 0.177734375, + "loss/logits": 0.0544711509719491, + "loss/reg": 0.023144405335187912, + "step": 1595 + }, + { + "epoch": 0.798, + "grad_norm": 1.2421246767044067, + "grad_norm_var": 0.1684333370511676, + "learning_rate": 2e-05, + "loss": 0.3977, + "loss/crossentropy": 2.6057989597320557, + "loss/hidden": 0.14111328125, + "loss/logits": 0.025157983414828777, + "loss/reg": 0.023141996935009956, + "step": 1596 + }, + { + "epoch": 0.7985, + "grad_norm": 1.2711045742034912, + "grad_norm_var": 0.1545756380850302, + "learning_rate": 2e-05, + "loss": 0.4284, + "loss/crossentropy": 2.264176368713379, + "loss/hidden": 0.16845703125, + "loss/logits": 0.028568227775394917, + "loss/reg": 0.02313930355012417, + "step": 1597 + }, + { + "epoch": 0.799, + "grad_norm": 1.5185736417770386, + "grad_norm_var": 0.14714454199060936, + "learning_rate": 2e-05, + "loss": 0.4114, + "loss/crossentropy": 2.471498489379883, + "loss/hidden": 0.14990234375, + "loss/logits": 0.030109106563031673, + "loss/reg": 0.02313670702278614, + "step": 1598 + }, + { + "epoch": 0.7995, + "grad_norm": 1.553176760673523, + "grad_norm_var": 0.14676495590723318, + "learning_rate": 2e-05, + "loss": 0.3957, + "loss/crossentropy": 2.330732226371765, + "loss/hidden": 0.13916015625, + "loss/logits": 0.025215147994458675, + "loss/reg": 0.023134108632802963, + "step": 1599 + }, + { + "epoch": 0.8, + "grad_norm": 1.5054106712341309, + "grad_norm_var": 0.14681544855401113, + "learning_rate": 2e-05, + "loss": 0.4242, + "loss/crossentropy": 2.3858243227005005, + "loss/hidden": 0.162109375, + "loss/logits": 0.030760521069169044, + "loss/reg": 0.023131774738430977, + "step": 1600 + }, + { + "epoch": 0.8005, + "grad_norm": 1.565080165863037, + "grad_norm_var": 0.1406777927219105, + "learning_rate": 2e-05, + "loss": 0.4291, + "loss/crossentropy": 2.4610700607299805, + "loss/hidden": 0.1630859375, + "loss/logits": 0.03468041494488716, + "loss/reg": 0.023129595443606377, + "step": 1601 + }, + { + "epoch": 0.801, + "grad_norm": 1.212703824043274, + "grad_norm_var": 0.1418705661983741, + "learning_rate": 2e-05, + "loss": 0.4174, + "loss/crossentropy": 2.3021336793899536, + "loss/hidden": 0.15966796875, + "loss/logits": 0.026437725871801376, + "loss/reg": 0.023127034306526184, + "step": 1602 + }, + { + "epoch": 0.8015, + "grad_norm": 1.2017550468444824, + "grad_norm_var": 0.1469311922279634, + "learning_rate": 2e-05, + "loss": 0.3971, + "loss/crossentropy": 2.5906589031219482, + "loss/hidden": 0.14208984375, + "loss/logits": 0.023727728985249996, + "loss/reg": 0.023124776780605316, + "step": 1603 + }, + { + "epoch": 0.802, + "grad_norm": 3.218196392059326, + "grad_norm_var": 0.3216560098977896, + "learning_rate": 2e-05, + "loss": 0.5194, + "loss/crossentropy": 2.1525968313217163, + "loss/hidden": 0.24365234375, + "loss/logits": 0.044501783326268196, + "loss/reg": 0.023122500628232956, + "step": 1604 + }, + { + "epoch": 0.8025, + "grad_norm": 1.5749452114105225, + "grad_norm_var": 0.3079126424294389, + "learning_rate": 2e-05, + "loss": 0.4253, + "loss/crossentropy": 2.2141406536102295, + "loss/hidden": 0.16552734375, + "loss/logits": 0.02860566135495901, + "loss/reg": 0.0231203343719244, + "step": 1605 + }, + { + "epoch": 0.803, + "grad_norm": 1.7416962385177612, + "grad_norm_var": 0.3000833317033832, + "learning_rate": 2e-05, + "loss": 0.4899, + "loss/crossentropy": 2.4038604497909546, + "loss/hidden": 0.22021484375, + "loss/logits": 0.038527075201272964, + "loss/reg": 0.023117849603295326, + "step": 1606 + }, + { + "epoch": 0.8035, + "grad_norm": 12.239812850952148, + "grad_norm_var": 7.332656902880251, + "learning_rate": 2e-05, + "loss": 0.5506, + "loss/crossentropy": 2.144862651824951, + "loss/hidden": 0.2763671875, + "loss/logits": 0.04309249948710203, + "loss/reg": 0.023115267977118492, + "step": 1607 + }, + { + "epoch": 0.804, + "grad_norm": 1.863564133644104, + "grad_norm_var": 7.335328194748469, + "learning_rate": 2e-05, + "loss": 0.4323, + "loss/crossentropy": 2.537988543510437, + "loss/hidden": 0.16845703125, + "loss/logits": 0.032738376408815384, + "loss/reg": 0.023112677037715912, + "step": 1608 + }, + { + "epoch": 0.8045, + "grad_norm": 1.3629024028778076, + "grad_norm_var": 7.307251217498798, + "learning_rate": 2e-05, + "loss": 0.4164, + "loss/crossentropy": 2.1855788230895996, + "loss/hidden": 0.1572265625, + "loss/logits": 0.028044618666172028, + "loss/reg": 0.023110322654247284, + "step": 1609 + }, + { + "epoch": 0.805, + "grad_norm": 1.3974899053573608, + "grad_norm_var": 7.311758223035725, + "learning_rate": 2e-05, + "loss": 0.4062, + "loss/crossentropy": 2.217948317527771, + "loss/hidden": 0.1484375, + "loss/logits": 0.026731343939900398, + "loss/reg": 0.02310797944664955, + "step": 1610 + }, + { + "epoch": 0.8055, + "grad_norm": 2.0517637729644775, + "grad_norm_var": 7.28841256335691, + "learning_rate": 2e-05, + "loss": 0.4398, + "loss/crossentropy": 2.581295609474182, + "loss/hidden": 0.17431640625, + "loss/logits": 0.03437975142151117, + "loss/reg": 0.023105405271053314, + "step": 1611 + }, + { + "epoch": 0.806, + "grad_norm": 2.579063892364502, + "grad_norm_var": 7.2146663129960755, + "learning_rate": 2e-05, + "loss": 0.4879, + "loss/crossentropy": 2.704404354095459, + "loss/hidden": 0.2119140625, + "loss/logits": 0.04493995010852814, + "loss/reg": 0.023102805018424988, + "step": 1612 + }, + { + "epoch": 0.8065, + "grad_norm": 1.4066587686538696, + "grad_norm_var": 7.196024324251629, + "learning_rate": 2e-05, + "loss": 0.4595, + "loss/crossentropy": 2.1706148386001587, + "loss/hidden": 0.19287109375, + "loss/logits": 0.03565484471619129, + "loss/reg": 0.023100463673472404, + "step": 1613 + }, + { + "epoch": 0.807, + "grad_norm": 1.691936731338501, + "grad_norm_var": 7.178116795127499, + "learning_rate": 2e-05, + "loss": 0.4514, + "loss/crossentropy": 2.369131565093994, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03535500913858414, + "loss/reg": 0.02309785783290863, + "step": 1614 + }, + { + "epoch": 0.8075, + "grad_norm": 1.2398101091384888, + "grad_norm_var": 7.21902571074465, + "learning_rate": 2e-05, + "loss": 0.3903, + "loss/crossentropy": 2.3177562952041626, + "loss/hidden": 0.13525390625, + "loss/logits": 0.0241070706397295, + "loss/reg": 0.02309543453156948, + "step": 1615 + }, + { + "epoch": 0.808, + "grad_norm": 2.775026559829712, + "grad_norm_var": 7.17412256855064, + "learning_rate": 2e-05, + "loss": 0.4131, + "loss/crossentropy": 2.624569892883301, + "loss/hidden": 0.15576171875, + "loss/logits": 0.02637580782175064, + "loss/reg": 0.02309308759868145, + "step": 1616 + }, + { + "epoch": 0.8085, + "grad_norm": 1.959854006767273, + "grad_norm_var": 7.137539141392571, + "learning_rate": 2e-05, + "loss": 0.4264, + "loss/crossentropy": 2.393427848815918, + "loss/hidden": 0.16796875, + "loss/logits": 0.0275327330455184, + "loss/reg": 0.023090790957212448, + "step": 1617 + }, + { + "epoch": 0.809, + "grad_norm": 1.2265393733978271, + "grad_norm_var": 7.135232046007838, + "learning_rate": 2e-05, + "loss": 0.4337, + "loss/crossentropy": 2.4681339263916016, + "loss/hidden": 0.17138671875, + "loss/logits": 0.03143185377120972, + "loss/reg": 0.023088427260518074, + "step": 1618 + }, + { + "epoch": 0.8095, + "grad_norm": 1.782827377319336, + "grad_norm_var": 7.05802258224737, + "learning_rate": 2e-05, + "loss": 0.414, + "loss/crossentropy": 2.497174382209778, + "loss/hidden": 0.154296875, + "loss/logits": 0.02879659365862608, + "loss/reg": 0.023086171597242355, + "step": 1619 + }, + { + "epoch": 0.81, + "grad_norm": 1.3240845203399658, + "grad_norm_var": 7.1026412994491706, + "learning_rate": 2e-05, + "loss": 0.4085, + "loss/crossentropy": 2.3640178442001343, + "loss/hidden": 0.1533203125, + "loss/logits": 0.024376518093049526, + "loss/reg": 0.023083915933966637, + "step": 1620 + }, + { + "epoch": 0.8105, + "grad_norm": 1.827821969985962, + "grad_norm_var": 7.079203255275327, + "learning_rate": 2e-05, + "loss": 0.4672, + "loss/crossentropy": 2.2232565879821777, + "loss/hidden": 0.19677734375, + "loss/logits": 0.039636192843317986, + "loss/reg": 0.02308170683681965, + "step": 1621 + }, + { + "epoch": 0.811, + "grad_norm": 1.488737940788269, + "grad_norm_var": 7.105554975206242, + "learning_rate": 2e-05, + "loss": 0.4777, + "loss/crossentropy": 2.3754860162734985, + "loss/hidden": 0.201171875, + "loss/logits": 0.04573565348982811, + "loss/reg": 0.023079518228769302, + "step": 1622 + }, + { + "epoch": 0.8115, + "grad_norm": 1.6029181480407715, + "grad_norm_var": 0.20554311559602045, + "learning_rate": 2e-05, + "loss": 0.5146, + "loss/crossentropy": 2.1231746673583984, + "loss/hidden": 0.2275390625, + "loss/logits": 0.056324394419789314, + "loss/reg": 0.023077305406332016, + "step": 1623 + }, + { + "epoch": 0.812, + "grad_norm": 1.273179054260254, + "grad_norm_var": 0.21632680198712767, + "learning_rate": 2e-05, + "loss": 0.4327, + "loss/crossentropy": 2.269154667854309, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03346337750554085, + "loss/reg": 0.02307521365582943, + "step": 1624 + }, + { + "epoch": 0.8125, + "grad_norm": 1.5247483253479004, + "grad_norm_var": 0.21097195205831215, + "learning_rate": 2e-05, + "loss": 0.4599, + "loss/crossentropy": 2.123014211654663, + "loss/hidden": 0.197265625, + "loss/logits": 0.03189415484666824, + "loss/reg": 0.02307269349694252, + "step": 1625 + }, + { + "epoch": 0.813, + "grad_norm": 1.631244421005249, + "grad_norm_var": 0.20505121684640598, + "learning_rate": 2e-05, + "loss": 0.4684, + "loss/crossentropy": 2.1642907857894897, + "loss/hidden": 0.19677734375, + "loss/logits": 0.040870968252420425, + "loss/reg": 0.02307022735476494, + "step": 1626 + }, + { + "epoch": 0.8135, + "grad_norm": 1.8798401355743408, + "grad_norm_var": 0.19910183072842996, + "learning_rate": 2e-05, + "loss": 0.4351, + "loss/crossentropy": 2.588270664215088, + "loss/hidden": 0.17333984375, + "loss/logits": 0.031081863678991795, + "loss/reg": 0.023067684844136238, + "step": 1627 + }, + { + "epoch": 0.814, + "grad_norm": 1.2950395345687866, + "grad_norm_var": 0.15180106705406657, + "learning_rate": 2e-05, + "loss": 0.4603, + "loss/crossentropy": 2.419018030166626, + "loss/hidden": 0.17138671875, + "loss/logits": 0.05826069973409176, + "loss/reg": 0.02306544780731201, + "step": 1628 + }, + { + "epoch": 0.8145, + "grad_norm": 24.923316955566406, + "grad_norm_var": 34.04542175114692, + "learning_rate": 2e-05, + "loss": 0.7786, + "loss/crossentropy": 2.3988600969314575, + "loss/hidden": 0.49560546875, + "loss/logits": 0.0523617435246706, + "loss/reg": 0.023063141852617264, + "step": 1629 + }, + { + "epoch": 0.815, + "grad_norm": 2.5214052200317383, + "grad_norm_var": 33.933755082592214, + "learning_rate": 2e-05, + "loss": 0.5531, + "loss/crossentropy": 2.5637893676757812, + "loss/hidden": 0.275390625, + "loss/logits": 0.04713789001107216, + "loss/reg": 0.023060709238052368, + "step": 1630 + }, + { + "epoch": 0.8155, + "grad_norm": 1.2581124305725098, + "grad_norm_var": 33.92913341630277, + "learning_rate": 2e-05, + "loss": 0.4302, + "loss/crossentropy": 2.1320899724960327, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03112439066171646, + "loss/reg": 0.023058375343680382, + "step": 1631 + }, + { + "epoch": 0.816, + "grad_norm": 2.5482232570648193, + "grad_norm_var": 33.94348873438556, + "learning_rate": 2e-05, + "loss": 0.6642, + "loss/crossentropy": 1.9117431640625, + "loss/hidden": 0.37109375, + "loss/logits": 0.0625968836247921, + "loss/reg": 0.02305583469569683, + "step": 1632 + }, + { + "epoch": 0.8165, + "grad_norm": 1.597198486328125, + "grad_norm_var": 34.008253404182256, + "learning_rate": 2e-05, + "loss": 0.4802, + "loss/crossentropy": 2.3248562812805176, + "loss/hidden": 0.2060546875, + "loss/logits": 0.04361774958670139, + "loss/reg": 0.023053383454680443, + "step": 1633 + }, + { + "epoch": 0.817, + "grad_norm": 1.1865489482879639, + "grad_norm_var": 34.018377825217904, + "learning_rate": 2e-05, + "loss": 0.4076, + "loss/crossentropy": 2.3980835676193237, + "loss/hidden": 0.1484375, + "loss/logits": 0.028623439371585846, + "loss/reg": 0.023050816729664803, + "step": 1634 + }, + { + "epoch": 0.8175, + "grad_norm": 1.4046670198440552, + "grad_norm_var": 34.09393493073639, + "learning_rate": 2e-05, + "loss": 0.4186, + "loss/crossentropy": 2.367344379425049, + "loss/hidden": 0.15771484375, + "loss/logits": 0.030404978431761265, + "loss/reg": 0.023048415780067444, + "step": 1635 + }, + { + "epoch": 0.818, + "grad_norm": 1.5358824729919434, + "grad_norm_var": 34.04713949789893, + "learning_rate": 2e-05, + "loss": 0.3885, + "loss/crossentropy": 2.635706663131714, + "loss/hidden": 0.1318359375, + "loss/logits": 0.026234203949570656, + "loss/reg": 0.02304593101143837, + "step": 1636 + }, + { + "epoch": 0.8185, + "grad_norm": 1.2111634016036987, + "grad_norm_var": 34.174986550380915, + "learning_rate": 2e-05, + "loss": 0.4149, + "loss/crossentropy": 2.4690955877304077, + "loss/hidden": 0.15283203125, + "loss/logits": 0.0316432137042284, + "loss/reg": 0.023043323308229446, + "step": 1637 + }, + { + "epoch": 0.819, + "grad_norm": 1.3363852500915527, + "grad_norm_var": 34.20825665031358, + "learning_rate": 2e-05, + "loss": 0.4009, + "loss/crossentropy": 2.242287516593933, + "loss/hidden": 0.14501953125, + "loss/logits": 0.025482993572950363, + "loss/reg": 0.023040831089019775, + "step": 1638 + }, + { + "epoch": 0.8195, + "grad_norm": 1.8490867614746094, + "grad_norm_var": 34.16469112797808, + "learning_rate": 2e-05, + "loss": 0.4946, + "loss/crossentropy": 2.4505655765533447, + "loss/hidden": 0.21337890625, + "loss/logits": 0.0508628049865365, + "loss/reg": 0.023038217797875404, + "step": 1639 + }, + { + "epoch": 0.82, + "grad_norm": 1.4550219774246216, + "grad_norm_var": 34.12341073128782, + "learning_rate": 2e-05, + "loss": 0.4708, + "loss/crossentropy": 2.149672269821167, + "loss/hidden": 0.19970703125, + "loss/logits": 0.040755780413746834, + "loss/reg": 0.023035811260342598, + "step": 1640 + }, + { + "epoch": 0.8205, + "grad_norm": 1.5251140594482422, + "grad_norm_var": 34.12333527068636, + "learning_rate": 2e-05, + "loss": 0.4627, + "loss/crossentropy": 2.3447247743606567, + "loss/hidden": 0.197265625, + "loss/logits": 0.03514695540070534, + "loss/reg": 0.023033197969198227, + "step": 1641 + }, + { + "epoch": 0.821, + "grad_norm": 13.460094451904297, + "grad_norm_var": 40.59549407786183, + "learning_rate": 2e-05, + "loss": 0.5042, + "loss/crossentropy": 2.601618528366089, + "loss/hidden": 0.23779296875, + "loss/logits": 0.03612148202955723, + "loss/reg": 0.023030424490571022, + "step": 1642 + }, + { + "epoch": 0.8215, + "grad_norm": 1.2327735424041748, + "grad_norm_var": 40.78833425322313, + "learning_rate": 2e-05, + "loss": 0.4008, + "loss/crossentropy": 2.4731369018554688, + "loss/hidden": 0.140625, + "loss/logits": 0.02993260882794857, + "loss/reg": 0.023027580231428146, + "step": 1643 + }, + { + "epoch": 0.822, + "grad_norm": 1.524492859840393, + "grad_norm_var": 40.715868110383035, + "learning_rate": 2e-05, + "loss": 0.4387, + "loss/crossentropy": 2.4846259355545044, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03361409995704889, + "loss/reg": 0.02302512526512146, + "step": 1644 + }, + { + "epoch": 0.8225, + "grad_norm": 1.2758790254592896, + "grad_norm_var": 9.018881776760608, + "learning_rate": 2e-05, + "loss": 0.434, + "loss/crossentropy": 2.2878633737564087, + "loss/hidden": 0.16796875, + "loss/logits": 0.03582877665758133, + "loss/reg": 0.023022696375846863, + "step": 1645 + }, + { + "epoch": 0.823, + "grad_norm": 1.5986084938049316, + "grad_norm_var": 9.045800842250319, + "learning_rate": 2e-05, + "loss": 0.4196, + "loss/crossentropy": 2.3812626600265503, + "loss/hidden": 0.1591796875, + "loss/logits": 0.03021799586713314, + "loss/reg": 0.02302025444805622, + "step": 1646 + }, + { + "epoch": 0.8235, + "grad_norm": 1.4929598569869995, + "grad_norm_var": 9.018190421650518, + "learning_rate": 2e-05, + "loss": 0.4045, + "loss/crossentropy": 2.4511682987213135, + "loss/hidden": 0.14892578125, + "loss/logits": 0.025395757518708706, + "loss/reg": 0.02301778830587864, + "step": 1647 + }, + { + "epoch": 0.824, + "grad_norm": 1.4948441982269287, + "grad_norm_var": 9.047710234698881, + "learning_rate": 2e-05, + "loss": 0.4522, + "loss/crossentropy": 2.2096160650253296, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03697221539914608, + "loss/reg": 0.023015225306153297, + "step": 1648 + }, + { + "epoch": 0.8245, + "grad_norm": 1.7332985401153564, + "grad_norm_var": 9.0379509596088, + "learning_rate": 2e-05, + "loss": 0.4668, + "loss/crossentropy": 2.3227975368499756, + "loss/hidden": 0.19921875, + "loss/logits": 0.037440777756273746, + "loss/reg": 0.023012757301330566, + "step": 1649 + }, + { + "epoch": 0.825, + "grad_norm": 2.2488136291503906, + "grad_norm_var": 8.963901793690663, + "learning_rate": 2e-05, + "loss": 0.4448, + "loss/crossentropy": 2.352696657180786, + "loss/hidden": 0.18310546875, + "loss/logits": 0.0316165778785944, + "loss/reg": 0.023010345175862312, + "step": 1650 + }, + { + "epoch": 0.8255, + "grad_norm": 1.5257325172424316, + "grad_norm_var": 8.950789974582705, + "learning_rate": 2e-05, + "loss": 0.4114, + "loss/crossentropy": 2.37747323513031, + "loss/hidden": 0.1572265625, + "loss/logits": 0.02410216350108385, + "loss/reg": 0.02300778217613697, + "step": 1651 + }, + { + "epoch": 0.826, + "grad_norm": 1.8591517210006714, + "grad_norm_var": 8.92519375229253, + "learning_rate": 2e-05, + "loss": 0.4952, + "loss/crossentropy": 2.2731701135635376, + "loss/hidden": 0.2119140625, + "loss/logits": 0.05323890969157219, + "loss/reg": 0.023005163297057152, + "step": 1652 + }, + { + "epoch": 0.8265, + "grad_norm": 1.6279247999191284, + "grad_norm_var": 8.875463367206468, + "learning_rate": 2e-05, + "loss": 0.4234, + "loss/crossentropy": 2.2822866439819336, + "loss/hidden": 0.1630859375, + "loss/logits": 0.030256139114499092, + "loss/reg": 0.02300269901752472, + "step": 1653 + }, + { + "epoch": 0.827, + "grad_norm": 1.2949538230895996, + "grad_norm_var": 8.881045821586516, + "learning_rate": 2e-05, + "loss": 0.4295, + "loss/crossentropy": 2.4483895301818848, + "loss/hidden": 0.1630859375, + "loss/logits": 0.0363735593855381, + "loss/reg": 0.023000460118055344, + "step": 1654 + }, + { + "epoch": 0.8275, + "grad_norm": 1.2874404191970825, + "grad_norm_var": 8.936394709625619, + "learning_rate": 2e-05, + "loss": 0.4334, + "loss/crossentropy": 2.3898115158081055, + "loss/hidden": 0.16796875, + "loss/logits": 0.03548043966293335, + "loss/reg": 0.022998474538326263, + "step": 1655 + }, + { + "epoch": 0.828, + "grad_norm": 1.5840253829956055, + "grad_norm_var": 8.923075939282624, + "learning_rate": 2e-05, + "loss": 0.456, + "loss/crossentropy": 2.3553361892700195, + "loss/hidden": 0.1884765625, + "loss/logits": 0.03758828155696392, + "loss/reg": 0.022995930165052414, + "step": 1656 + }, + { + "epoch": 0.8285, + "grad_norm": 1.1669474840164185, + "grad_norm_var": 8.96799758421738, + "learning_rate": 2e-05, + "loss": 0.381, + "loss/crossentropy": 2.548807144165039, + "loss/hidden": 0.12744140625, + "loss/logits": 0.023608416318893433, + "loss/reg": 0.022993767634034157, + "step": 1657 + }, + { + "epoch": 0.829, + "grad_norm": 1.3104734420776367, + "grad_norm_var": 0.07534442062330123, + "learning_rate": 2e-05, + "loss": 0.4122, + "loss/crossentropy": 2.2222912311553955, + "loss/hidden": 0.154296875, + "loss/logits": 0.027987757697701454, + "loss/reg": 0.022991687059402466, + "step": 1658 + }, + { + "epoch": 0.8295, + "grad_norm": 2.410997152328491, + "grad_norm_var": 0.11759094401073747, + "learning_rate": 2e-05, + "loss": 0.6151, + "loss/crossentropy": 2.0733728408813477, + "loss/hidden": 0.333984375, + "loss/logits": 0.05117853730916977, + "loss/reg": 0.022989830002188683, + "step": 1659 + }, + { + "epoch": 0.83, + "grad_norm": 1.55594003200531, + "grad_norm_var": 0.11737898907536574, + "learning_rate": 2e-05, + "loss": 0.4358, + "loss/crossentropy": 2.2041454315185547, + "loss/hidden": 0.16845703125, + "loss/logits": 0.03746410086750984, + "loss/reg": 0.022987263277173042, + "step": 1660 + }, + { + "epoch": 0.8305, + "grad_norm": 1.7716691493988037, + "grad_norm_var": 0.11186125740769033, + "learning_rate": 2e-05, + "loss": 0.3991, + "loss/crossentropy": 2.4366633892059326, + "loss/hidden": 0.14697265625, + "loss/logits": 0.02228802628815174, + "loss/reg": 0.02298472821712494, + "step": 1661 + }, + { + "epoch": 0.831, + "grad_norm": 1.4420490264892578, + "grad_norm_var": 0.11389684457441239, + "learning_rate": 2e-05, + "loss": 0.4123, + "loss/crossentropy": 2.599808931350708, + "loss/hidden": 0.15673828125, + "loss/logits": 0.025708637200295925, + "loss/reg": 0.02298245020210743, + "step": 1662 + }, + { + "epoch": 0.8315, + "grad_norm": 1.126309871673584, + "grad_norm_var": 0.12816484039347759, + "learning_rate": 2e-05, + "loss": 0.3985, + "loss/crossentropy": 2.2323700189590454, + "loss/hidden": 0.1396484375, + "loss/logits": 0.029093537479639053, + "loss/reg": 0.022979876026511192, + "step": 1663 + }, + { + "epoch": 0.832, + "grad_norm": 1.8172188997268677, + "grad_norm_var": 0.13056853667108398, + "learning_rate": 2e-05, + "loss": 0.4711, + "loss/crossentropy": 2.5063730478286743, + "loss/hidden": 0.19873046875, + "loss/logits": 0.0425629410892725, + "loss/reg": 0.022977303713560104, + "step": 1664 + }, + { + "epoch": 0.8325, + "grad_norm": 3.0363452434539795, + "grad_norm_var": 0.2580790516701178, + "learning_rate": 2e-05, + "loss": 0.5382, + "loss/crossentropy": 2.3657928705215454, + "loss/hidden": 0.268310546875, + "loss/logits": 0.04010665416717529, + "loss/reg": 0.022974872961640358, + "step": 1665 + }, + { + "epoch": 0.833, + "grad_norm": 1.2701658010482788, + "grad_norm_var": 0.24523293891671988, + "learning_rate": 2e-05, + "loss": 0.3718, + "loss/crossentropy": 2.386319637298584, + "loss/hidden": 0.124267578125, + "loss/logits": 0.01782753597944975, + "loss/reg": 0.022972485050559044, + "step": 1666 + }, + { + "epoch": 0.8335, + "grad_norm": 2.1191272735595703, + "grad_norm_var": 0.2589543825866409, + "learning_rate": 2e-05, + "loss": 0.4094, + "loss/crossentropy": 2.2309489250183105, + "loss/hidden": 0.1494140625, + "loss/logits": 0.030267059803009033, + "loss/reg": 0.02296994999051094, + "step": 1667 + }, + { + "epoch": 0.834, + "grad_norm": 1.3244271278381348, + "grad_norm_var": 0.26316420886106257, + "learning_rate": 2e-05, + "loss": 0.4091, + "loss/crossentropy": 2.2874940633773804, + "loss/hidden": 0.1494140625, + "loss/logits": 0.03000403381884098, + "loss/reg": 0.02296753227710724, + "step": 1668 + }, + { + "epoch": 0.8345, + "grad_norm": 1.4092116355895996, + "grad_norm_var": 0.2663347603033822, + "learning_rate": 2e-05, + "loss": 0.4036, + "loss/crossentropy": 2.384338140487671, + "loss/hidden": 0.1474609375, + "loss/logits": 0.026476314291357994, + "loss/reg": 0.022965088486671448, + "step": 1669 + }, + { + "epoch": 0.835, + "grad_norm": 1.62082040309906, + "grad_norm_var": 0.2588288547408162, + "learning_rate": 2e-05, + "loss": 0.4793, + "loss/crossentropy": 2.1965416073799133, + "loss/hidden": 0.20751953125, + "loss/logits": 0.042130330577492714, + "loss/reg": 0.02296249382197857, + "step": 1670 + }, + { + "epoch": 0.8355, + "grad_norm": 2.5013601779937744, + "grad_norm_var": 0.29373184542219466, + "learning_rate": 2e-05, + "loss": 0.4534, + "loss/crossentropy": 2.3103402853012085, + "loss/hidden": 0.18896484375, + "loss/logits": 0.03484947420656681, + "loss/reg": 0.022959880530834198, + "step": 1671 + }, + { + "epoch": 0.836, + "grad_norm": 1.700095295906067, + "grad_norm_var": 0.2925206968647416, + "learning_rate": 2e-05, + "loss": 0.4456, + "loss/crossentropy": 2.3426826000213623, + "loss/hidden": 0.1748046875, + "loss/logits": 0.04122760146856308, + "loss/reg": 0.02295738458633423, + "step": 1672 + }, + { + "epoch": 0.8365, + "grad_norm": 2.7791571617126465, + "grad_norm_var": 0.3352385341546844, + "learning_rate": 2e-05, + "loss": 0.5213, + "loss/crossentropy": 2.452883005142212, + "loss/hidden": 0.248046875, + "loss/logits": 0.04374842904508114, + "loss/reg": 0.022954750806093216, + "step": 1673 + }, + { + "epoch": 0.837, + "grad_norm": 1.3409507274627686, + "grad_norm_var": 0.33320691501421645, + "learning_rate": 2e-05, + "loss": 0.4048, + "loss/crossentropy": 2.351631283760071, + "loss/hidden": 0.14892578125, + "loss/logits": 0.0263042114675045, + "loss/reg": 0.022952163591980934, + "step": 1674 + }, + { + "epoch": 0.8375, + "grad_norm": 1.8099993467330933, + "grad_norm_var": 0.3089535187739005, + "learning_rate": 2e-05, + "loss": 0.4723, + "loss/crossentropy": 2.3619593381881714, + "loss/hidden": 0.20703125, + "loss/logits": 0.03580853994935751, + "loss/reg": 0.02294965460896492, + "step": 1675 + }, + { + "epoch": 0.838, + "grad_norm": 1.2641894817352295, + "grad_norm_var": 0.3233415272972024, + "learning_rate": 2e-05, + "loss": 0.4738, + "loss/crossentropy": 2.2749900817871094, + "loss/hidden": 0.197265625, + "loss/logits": 0.047040607780218124, + "loss/reg": 0.022947140038013458, + "step": 1676 + }, + { + "epoch": 0.8385, + "grad_norm": 1.43521249294281, + "grad_norm_var": 0.3303785607627444, + "learning_rate": 2e-05, + "loss": 0.4134, + "loss/crossentropy": 2.2958513498306274, + "loss/hidden": 0.15576171875, + "loss/logits": 0.02822498418390751, + "loss/reg": 0.02294457145035267, + "step": 1677 + }, + { + "epoch": 0.839, + "grad_norm": 1.880581259727478, + "grad_norm_var": 0.32440405684143336, + "learning_rate": 2e-05, + "loss": 0.5203, + "loss/crossentropy": 2.5314308404922485, + "loss/hidden": 0.23583984375, + "loss/logits": 0.05500957649201155, + "loss/reg": 0.022941984236240387, + "step": 1678 + }, + { + "epoch": 0.8395, + "grad_norm": 1.4752358198165894, + "grad_norm_var": 0.3017318093173941, + "learning_rate": 2e-05, + "loss": 0.4043, + "loss/crossentropy": 2.373136043548584, + "loss/hidden": 0.14794921875, + "loss/logits": 0.026981882750988007, + "loss/reg": 0.022939518094062805, + "step": 1679 + }, + { + "epoch": 0.84, + "grad_norm": 1.6214615106582642, + "grad_norm_var": 0.3036514979065638, + "learning_rate": 2e-05, + "loss": 0.4526, + "loss/crossentropy": 2.3645347356796265, + "loss/hidden": 0.1845703125, + "loss/logits": 0.038687046617269516, + "loss/reg": 0.022937096655368805, + "step": 1680 + }, + { + "epoch": 0.8405, + "grad_norm": 2.2418832778930664, + "grad_norm_var": 0.2107344148931207, + "learning_rate": 2e-05, + "loss": 0.4214, + "loss/crossentropy": 2.4823267459869385, + "loss/hidden": 0.1640625, + "loss/logits": 0.02803431637585163, + "loss/reg": 0.022934794425964355, + "step": 1681 + }, + { + "epoch": 0.841, + "grad_norm": 2.971312999725342, + "grad_norm_var": 0.2856894840213674, + "learning_rate": 2e-05, + "loss": 0.6283, + "loss/crossentropy": 2.1208351850509644, + "loss/hidden": 0.37158203125, + "loss/logits": 0.02735395822674036, + "loss/reg": 0.02293219417333603, + "step": 1682 + }, + { + "epoch": 0.8415, + "grad_norm": 1.5801922082901, + "grad_norm_var": 0.28403227039487244, + "learning_rate": 2e-05, + "loss": 0.3926, + "loss/crossentropy": 2.56937313079834, + "loss/hidden": 0.13916015625, + "loss/logits": 0.02419054415076971, + "loss/reg": 0.022929731756448746, + "step": 1683 + }, + { + "epoch": 0.842, + "grad_norm": 2.2374184131622314, + "grad_norm_var": 0.2770492394930005, + "learning_rate": 2e-05, + "loss": 0.5308, + "loss/crossentropy": 2.353764295578003, + "loss/hidden": 0.25732421875, + "loss/logits": 0.044213516637682915, + "loss/reg": 0.02292727865278721, + "step": 1684 + }, + { + "epoch": 0.8425, + "grad_norm": 1.4383546113967896, + "grad_norm_var": 0.2753241881358552, + "learning_rate": 2e-05, + "loss": 0.4721, + "loss/crossentropy": 2.1140084862709045, + "loss/hidden": 0.20703125, + "loss/logits": 0.03579618874937296, + "loss/reg": 0.02292483299970627, + "step": 1685 + }, + { + "epoch": 0.843, + "grad_norm": 1.4581494331359863, + "grad_norm_var": 0.2823531072303079, + "learning_rate": 2e-05, + "loss": 0.4387, + "loss/crossentropy": 2.34401535987854, + "loss/hidden": 0.1787109375, + "loss/logits": 0.030774756334722042, + "loss/reg": 0.022922255098819733, + "step": 1686 + }, + { + "epoch": 0.8435, + "grad_norm": 1.4093376398086548, + "grad_norm_var": 0.263278753257605, + "learning_rate": 2e-05, + "loss": 0.4136, + "loss/crossentropy": 2.465882182121277, + "loss/hidden": 0.15234375, + "loss/logits": 0.032055970281362534, + "loss/reg": 0.022919660434126854, + "step": 1687 + }, + { + "epoch": 0.844, + "grad_norm": 1.5240459442138672, + "grad_norm_var": 0.2673313757129769, + "learning_rate": 2e-05, + "loss": 0.398, + "loss/crossentropy": 2.320050001144409, + "loss/hidden": 0.141845703125, + "loss/logits": 0.026945553719997406, + "loss/reg": 0.022917049005627632, + "step": 1688 + }, + { + "epoch": 0.8445, + "grad_norm": 7.262933731079102, + "grad_norm_var": 2.121647862424502, + "learning_rate": 2e-05, + "loss": 0.9268, + "loss/crossentropy": 2.2761436700820923, + "loss/hidden": 0.48876953125, + "loss/logits": 0.20890014059841633, + "loss/reg": 0.022914528846740723, + "step": 1689 + }, + { + "epoch": 0.845, + "grad_norm": 1.4794998168945312, + "grad_norm_var": 2.1095745457299615, + "learning_rate": 2e-05, + "loss": 0.4308, + "loss/crossentropy": 2.334906578063965, + "loss/hidden": 0.171875, + "loss/logits": 0.029798144474625587, + "loss/reg": 0.022911950945854187, + "step": 1690 + }, + { + "epoch": 0.8455, + "grad_norm": 1.4296311140060425, + "grad_norm_var": 2.1317074764367883, + "learning_rate": 2e-05, + "loss": 0.4302, + "loss/crossentropy": 2.331926465034485, + "loss/hidden": 0.16552734375, + "loss/logits": 0.0355403907597065, + "loss/reg": 0.02290956676006317, + "step": 1691 + }, + { + "epoch": 0.846, + "grad_norm": 2.1096997261047363, + "grad_norm_var": 2.0884379174542818, + "learning_rate": 2e-05, + "loss": 0.4455, + "loss/crossentropy": 2.557571768760681, + "loss/hidden": 0.177734375, + "loss/logits": 0.03866210114210844, + "loss/reg": 0.022907033562660217, + "step": 1692 + }, + { + "epoch": 0.8465, + "grad_norm": 1.6497454643249512, + "grad_norm_var": 2.072379136217235, + "learning_rate": 2e-05, + "loss": 0.4447, + "loss/crossentropy": 2.39884877204895, + "loss/hidden": 0.18359375, + "loss/logits": 0.03204050101339817, + "loss/reg": 0.022904478013515472, + "step": 1693 + }, + { + "epoch": 0.847, + "grad_norm": 3.0329835414886475, + "grad_norm_var": 2.120038982631581, + "learning_rate": 2e-05, + "loss": 0.6206, + "loss/crossentropy": 2.4439542293548584, + "loss/hidden": 0.294921875, + "loss/logits": 0.09666961058974266, + "loss/reg": 0.022901998832821846, + "step": 1694 + }, + { + "epoch": 0.8475, + "grad_norm": 1.6038068532943726, + "grad_norm_var": 2.1089456280954626, + "learning_rate": 2e-05, + "loss": 0.4652, + "loss/crossentropy": 2.129871666431427, + "loss/hidden": 0.19921875, + "loss/logits": 0.03698125295341015, + "loss/reg": 0.022899584844708443, + "step": 1695 + }, + { + "epoch": 0.848, + "grad_norm": 1.957082986831665, + "grad_norm_var": 2.0905146641594694, + "learning_rate": 2e-05, + "loss": 0.4753, + "loss/crossentropy": 2.3653637170791626, + "loss/hidden": 0.19970703125, + "loss/logits": 0.04664120636880398, + "loss/reg": 0.02289716713130474, + "step": 1696 + }, + { + "epoch": 0.8485, + "grad_norm": 1.42384672164917, + "grad_norm_var": 2.129038865225132, + "learning_rate": 2e-05, + "loss": 0.3962, + "loss/crossentropy": 2.5141024589538574, + "loss/hidden": 0.140625, + "loss/logits": 0.026606767438352108, + "loss/reg": 0.022894656285643578, + "step": 1697 + }, + { + "epoch": 0.849, + "grad_norm": 1.4435638189315796, + "grad_norm_var": 2.1097529678037024, + "learning_rate": 2e-05, + "loss": 0.4624, + "loss/crossentropy": 2.211042284965515, + "loss/hidden": 0.19580078125, + "loss/logits": 0.0376845495775342, + "loss/reg": 0.02289220504462719, + "step": 1698 + }, + { + "epoch": 0.8495, + "grad_norm": 1.3736546039581299, + "grad_norm_var": 2.125770387110932, + "learning_rate": 2e-05, + "loss": 0.4857, + "loss/crossentropy": 2.224379062652588, + "loss/hidden": 0.21533203125, + "loss/logits": 0.041461410000920296, + "loss/reg": 0.02288985066115856, + "step": 1699 + }, + { + "epoch": 0.85, + "grad_norm": 1.2207798957824707, + "grad_norm_var": 2.1652485676396744, + "learning_rate": 2e-05, + "loss": 0.4432, + "loss/crossentropy": 2.3198060989379883, + "loss/hidden": 0.17919921875, + "loss/logits": 0.03512590378522873, + "loss/reg": 0.022887248545885086, + "step": 1700 + }, + { + "epoch": 0.8505, + "grad_norm": 3.1013894081115723, + "grad_norm_var": 2.2161002754379138, + "learning_rate": 2e-05, + "loss": 0.4683, + "loss/crossentropy": 2.369223117828369, + "loss/hidden": 0.2021484375, + "loss/logits": 0.03727924171835184, + "loss/reg": 0.02288457751274109, + "step": 1701 + }, + { + "epoch": 0.851, + "grad_norm": 1.519582986831665, + "grad_norm_var": 2.2111400237679426, + "learning_rate": 2e-05, + "loss": 0.4225, + "loss/crossentropy": 2.1703940629959106, + "loss/hidden": 0.1669921875, + "loss/logits": 0.02673946786671877, + "loss/reg": 0.022881818935275078, + "step": 1702 + }, + { + "epoch": 0.8515, + "grad_norm": 1.3196645975112915, + "grad_norm_var": 2.219856788865909, + "learning_rate": 2e-05, + "loss": 0.3877, + "loss/crossentropy": 2.4375799894332886, + "loss/hidden": 0.1357421875, + "loss/logits": 0.02319456171244383, + "loss/reg": 0.02287893183529377, + "step": 1703 + }, + { + "epoch": 0.852, + "grad_norm": 1.607952356338501, + "grad_norm_var": 2.2139568549493474, + "learning_rate": 2e-05, + "loss": 0.4585, + "loss/crossentropy": 2.397140145301819, + "loss/hidden": 0.18994140625, + "loss/logits": 0.039843300357460976, + "loss/reg": 0.022875996306538582, + "step": 1704 + }, + { + "epoch": 0.8525, + "grad_norm": 1.3758463859558105, + "grad_norm_var": 0.32430155493763096, + "learning_rate": 2e-05, + "loss": 0.4734, + "loss/crossentropy": 2.4673901796340942, + "loss/hidden": 0.193359375, + "loss/logits": 0.05126242712140083, + "loss/reg": 0.022873075678944588, + "step": 1705 + }, + { + "epoch": 0.853, + "grad_norm": 1.749706506729126, + "grad_norm_var": 0.31991028408618616, + "learning_rate": 2e-05, + "loss": 0.4687, + "loss/crossentropy": 2.74143385887146, + "loss/hidden": 0.193359375, + "loss/logits": 0.046618303284049034, + "loss/reg": 0.022870399057865143, + "step": 1706 + }, + { + "epoch": 0.8535, + "grad_norm": 2.9021739959716797, + "grad_norm_var": 0.3935280096896221, + "learning_rate": 2e-05, + "loss": 0.61, + "loss/crossentropy": 2.360695719718933, + "loss/hidden": 0.33447265625, + "loss/logits": 0.046809954568743706, + "loss/reg": 0.022867854684591293, + "step": 1707 + }, + { + "epoch": 0.854, + "grad_norm": 1.5873744487762451, + "grad_norm_var": 0.3915854985762199, + "learning_rate": 2e-05, + "loss": 0.5241, + "loss/crossentropy": 2.2110289335250854, + "loss/hidden": 0.240234375, + "loss/logits": 0.05524888634681702, + "loss/reg": 0.022865328937768936, + "step": 1708 + }, + { + "epoch": 0.8545, + "grad_norm": 2.6934444904327393, + "grad_norm_var": 0.4381563541382609, + "learning_rate": 2e-05, + "loss": 0.4892, + "loss/crossentropy": 1.9379909038543701, + "loss/hidden": 0.23095703125, + "loss/logits": 0.029659108258783817, + "loss/reg": 0.02286284975707531, + "step": 1709 + }, + { + "epoch": 0.855, + "grad_norm": 1.629195213317871, + "grad_norm_var": 0.3435589120556684, + "learning_rate": 2e-05, + "loss": 0.4169, + "loss/crossentropy": 2.4776086807250977, + "loss/hidden": 0.15576171875, + "loss/logits": 0.032544512301683426, + "loss/reg": 0.022860383614897728, + "step": 1710 + }, + { + "epoch": 0.8555, + "grad_norm": 2.148226022720337, + "grad_norm_var": 0.3491618389264744, + "learning_rate": 2e-05, + "loss": 0.485, + "loss/crossentropy": 2.5047671794891357, + "loss/hidden": 0.205078125, + "loss/logits": 0.05138644762337208, + "loss/reg": 0.02285795472562313, + "step": 1711 + }, + { + "epoch": 0.856, + "grad_norm": 1.456741452217102, + "grad_norm_var": 0.35538574638478854, + "learning_rate": 2e-05, + "loss": 0.404, + "loss/crossentropy": 2.458739399909973, + "loss/hidden": 0.14892578125, + "loss/logits": 0.0265263793990016, + "loss/reg": 0.02285546064376831, + "step": 1712 + }, + { + "epoch": 0.8565, + "grad_norm": 1.3901511430740356, + "grad_norm_var": 0.3570773520934078, + "learning_rate": 2e-05, + "loss": 0.4505, + "loss/crossentropy": 2.324171304702759, + "loss/hidden": 0.18505859375, + "loss/logits": 0.03695343807339668, + "loss/reg": 0.022852910682559013, + "step": 1713 + }, + { + "epoch": 0.857, + "grad_norm": 1.6300503015518188, + "grad_norm_var": 0.35082418432478046, + "learning_rate": 2e-05, + "loss": 0.4168, + "loss/crossentropy": 2.6420832872390747, + "loss/hidden": 0.15771484375, + "loss/logits": 0.030615486204624176, + "loss/reg": 0.02285032905638218, + "step": 1714 + }, + { + "epoch": 0.8575, + "grad_norm": 1.3466908931732178, + "grad_norm_var": 0.35238126851175644, + "learning_rate": 2e-05, + "loss": 0.3992, + "loss/crossentropy": 2.43982470035553, + "loss/hidden": 0.1455078125, + "loss/logits": 0.025227680802345276, + "loss/reg": 0.022847697138786316, + "step": 1715 + }, + { + "epoch": 0.858, + "grad_norm": 2.9759249687194824, + "grad_norm_var": 0.41113615805510123, + "learning_rate": 2e-05, + "loss": 0.3993, + "loss/crossentropy": 2.382121205329895, + "loss/hidden": 0.148681640625, + "loss/logits": 0.02220490388572216, + "loss/reg": 0.022845016792416573, + "step": 1716 + }, + { + "epoch": 0.8585, + "grad_norm": 1.4040857553482056, + "grad_norm_var": 0.3197881529322027, + "learning_rate": 2e-05, + "loss": 0.4771, + "loss/crossentropy": 2.0994767546653748, + "loss/hidden": 0.20849609375, + "loss/logits": 0.04016950540244579, + "loss/reg": 0.022842474281787872, + "step": 1717 + }, + { + "epoch": 0.859, + "grad_norm": 1.6990382671356201, + "grad_norm_var": 0.3151857693459073, + "learning_rate": 2e-05, + "loss": 0.5367, + "loss/crossentropy": 2.323665142059326, + "loss/hidden": 0.27392578125, + "loss/logits": 0.034341275691986084, + "loss/reg": 0.022839896380901337, + "step": 1718 + }, + { + "epoch": 0.8595, + "grad_norm": 1.2173619270324707, + "grad_norm_var": 0.3224909500736409, + "learning_rate": 2e-05, + "loss": 0.4331, + "loss/crossentropy": 2.1206226348876953, + "loss/hidden": 0.17333984375, + "loss/logits": 0.031357141211628914, + "loss/reg": 0.02283727563917637, + "step": 1719 + }, + { + "epoch": 0.86, + "grad_norm": 2.2023870944976807, + "grad_norm_var": 0.3292850127322119, + "learning_rate": 2e-05, + "loss": 0.5505, + "loss/crossentropy": 2.327507257461548, + "loss/hidden": 0.27001953125, + "loss/logits": 0.052125243470072746, + "loss/reg": 0.022834734991192818, + "step": 1720 + }, + { + "epoch": 0.8605, + "grad_norm": 1.5860549211502075, + "grad_norm_var": 0.319092889556803, + "learning_rate": 2e-05, + "loss": 0.4335, + "loss/crossentropy": 2.394924759864807, + "loss/hidden": 0.173828125, + "loss/logits": 0.031398216262459755, + "loss/reg": 0.022832229733467102, + "step": 1721 + }, + { + "epoch": 0.861, + "grad_norm": 2.052626132965088, + "grad_norm_var": 0.3207301547447267, + "learning_rate": 2e-05, + "loss": 0.4896, + "loss/crossentropy": 2.6150401830673218, + "loss/hidden": 0.21728515625, + "loss/logits": 0.04404893517494202, + "loss/reg": 0.02282971516251564, + "step": 1722 + }, + { + "epoch": 0.8615, + "grad_norm": 1.9768868684768677, + "grad_norm_var": 0.246910721101532, + "learning_rate": 2e-05, + "loss": 0.5072, + "loss/crossentropy": 2.564804196357727, + "loss/hidden": 0.23876953125, + "loss/logits": 0.040187520906329155, + "loss/reg": 0.022827180102467537, + "step": 1723 + }, + { + "epoch": 0.862, + "grad_norm": 1.905512809753418, + "grad_norm_var": 0.2436969642283363, + "learning_rate": 2e-05, + "loss": 0.4706, + "loss/crossentropy": 2.4824973344802856, + "loss/hidden": 0.21337890625, + "loss/logits": 0.028959065675735474, + "loss/reg": 0.02282462827861309, + "step": 1724 + }, + { + "epoch": 0.8625, + "grad_norm": 1.9956358671188354, + "grad_norm_var": 0.19399456280608826, + "learning_rate": 2e-05, + "loss": 0.4786, + "loss/crossentropy": 2.287666082382202, + "loss/hidden": 0.20751953125, + "loss/logits": 0.042856570333242416, + "loss/reg": 0.022822000086307526, + "step": 1725 + }, + { + "epoch": 0.863, + "grad_norm": 1.5660924911499023, + "grad_norm_var": 0.19558407654289164, + "learning_rate": 2e-05, + "loss": 0.4187, + "loss/crossentropy": 2.3453445434570312, + "loss/hidden": 0.16455078125, + "loss/logits": 0.025976940989494324, + "loss/reg": 0.022819381207227707, + "step": 1726 + }, + { + "epoch": 0.8635, + "grad_norm": 1.2870204448699951, + "grad_norm_var": 0.2001835773595658, + "learning_rate": 2e-05, + "loss": 0.4523, + "loss/crossentropy": 2.185749650001526, + "loss/hidden": 0.185546875, + "loss/logits": 0.038570983335375786, + "loss/reg": 0.022816654294729233, + "step": 1727 + }, + { + "epoch": 0.864, + "grad_norm": 1.5679943561553955, + "grad_norm_var": 0.19689234439128783, + "learning_rate": 2e-05, + "loss": 0.48, + "loss/crossentropy": 2.1167298555374146, + "loss/hidden": 0.21630859375, + "loss/logits": 0.0355659443885088, + "loss/reg": 0.022814186289906502, + "step": 1728 + }, + { + "epoch": 0.8645, + "grad_norm": 1.7449185848236084, + "grad_norm_var": 0.1883177922944227, + "learning_rate": 2e-05, + "loss": 0.4252, + "loss/crossentropy": 2.49346387386322, + "loss/hidden": 0.162109375, + "loss/logits": 0.03502298891544342, + "loss/reg": 0.022811725735664368, + "step": 1729 + }, + { + "epoch": 0.865, + "grad_norm": 1.6579017639160156, + "grad_norm_var": 0.18788410072038247, + "learning_rate": 2e-05, + "loss": 0.52, + "loss/crossentropy": 2.4262338876724243, + "loss/hidden": 0.24169921875, + "loss/logits": 0.05023909732699394, + "loss/reg": 0.0228092223405838, + "step": 1730 + }, + { + "epoch": 0.8655, + "grad_norm": 1.4867043495178223, + "grad_norm_var": 0.18136299973852205, + "learning_rate": 2e-05, + "loss": 0.4134, + "loss/crossentropy": 2.3994067907333374, + "loss/hidden": 0.1552734375, + "loss/logits": 0.03004833124577999, + "loss/reg": 0.022806638851761818, + "step": 1731 + }, + { + "epoch": 0.866, + "grad_norm": 1.3810359239578247, + "grad_norm_var": 0.08398193136193673, + "learning_rate": 2e-05, + "loss": 0.4072, + "loss/crossentropy": 2.3403743505477905, + "loss/hidden": 0.1533203125, + "loss/logits": 0.025856359861791134, + "loss/reg": 0.02280416525900364, + "step": 1732 + }, + { + "epoch": 0.8665, + "grad_norm": 1.6184099912643433, + "grad_norm_var": 0.0792338392069519, + "learning_rate": 2e-05, + "loss": 0.416, + "loss/crossentropy": 2.6625880002975464, + "loss/hidden": 0.15234375, + "loss/logits": 0.03561602905392647, + "loss/reg": 0.022801598533988, + "step": 1733 + }, + { + "epoch": 0.867, + "grad_norm": 2.438000202178955, + "grad_norm_var": 0.11483483909980136, + "learning_rate": 2e-05, + "loss": 0.4855, + "loss/crossentropy": 2.478938102722168, + "loss/hidden": 0.20556640625, + "loss/logits": 0.05192135088145733, + "loss/reg": 0.022799065336585045, + "step": 1734 + }, + { + "epoch": 0.8675, + "grad_norm": 1.7405439615249634, + "grad_norm_var": 0.09616209020188246, + "learning_rate": 2e-05, + "loss": 0.5137, + "loss/crossentropy": 2.30058753490448, + "loss/hidden": 0.23095703125, + "loss/logits": 0.05478241667151451, + "loss/reg": 0.022796491160988808, + "step": 1735 + }, + { + "epoch": 0.868, + "grad_norm": 1.431205153465271, + "grad_norm_var": 0.08815077463142741, + "learning_rate": 2e-05, + "loss": 0.4983, + "loss/crossentropy": 2.2017308473587036, + "loss/hidden": 0.2216796875, + "loss/logits": 0.0487048402428627, + "loss/reg": 0.022793902084231377, + "step": 1736 + }, + { + "epoch": 0.8685, + "grad_norm": 9.956767082214355, + "grad_norm_var": 4.3237782917981535, + "learning_rate": 2e-05, + "loss": 1.0532, + "loss/crossentropy": 3.484397292137146, + "loss/hidden": 0.634765625, + "loss/logits": 0.19047586619853973, + "loss/reg": 0.022791236639022827, + "step": 1737 + }, + { + "epoch": 0.869, + "grad_norm": 1.2082791328430176, + "grad_norm_var": 4.389199988572325, + "learning_rate": 2e-05, + "loss": 0.4212, + "loss/crossentropy": 2.3602211475372314, + "loss/hidden": 0.162109375, + "loss/logits": 0.03119662031531334, + "loss/reg": 0.02278871089220047, + "step": 1738 + }, + { + "epoch": 0.8695, + "grad_norm": 1.8653035163879395, + "grad_norm_var": 4.393077132745998, + "learning_rate": 2e-05, + "loss": 0.563, + "loss/crossentropy": 1.9802654385566711, + "loss/hidden": 0.29443359375, + "loss/logits": 0.04070642963051796, + "loss/reg": 0.022786037996411324, + "step": 1739 + }, + { + "epoch": 0.87, + "grad_norm": 1.7705045938491821, + "grad_norm_var": 4.399125143377921, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.4105314016342163, + "loss/hidden": 0.193359375, + "loss/logits": 0.032505772076547146, + "loss/reg": 0.022783316671848297, + "step": 1740 + }, + { + "epoch": 0.8705, + "grad_norm": 1.8441373109817505, + "grad_norm_var": 4.404077104357423, + "learning_rate": 2e-05, + "loss": 0.4757, + "loss/crossentropy": 2.494503378868103, + "loss/hidden": 0.21533203125, + "loss/logits": 0.03259473852813244, + "loss/reg": 0.022780809551477432, + "step": 1741 + }, + { + "epoch": 0.871, + "grad_norm": 1.637903094291687, + "grad_norm_var": 4.3987100041283655, + "learning_rate": 2e-05, + "loss": 0.4373, + "loss/crossentropy": 2.5231049060821533, + "loss/hidden": 0.17529296875, + "loss/logits": 0.03421847615391016, + "loss/reg": 0.022778036072850227, + "step": 1742 + }, + { + "epoch": 0.8715, + "grad_norm": 1.2547270059585571, + "grad_norm_var": 4.402554673430745, + "learning_rate": 2e-05, + "loss": 0.4012, + "loss/crossentropy": 2.4014939069747925, + "loss/hidden": 0.14404296875, + "loss/logits": 0.029431598260998726, + "loss/reg": 0.02277528867125511, + "step": 1743 + }, + { + "epoch": 0.872, + "grad_norm": 1.684816598892212, + "grad_norm_var": 4.394143219321391, + "learning_rate": 2e-05, + "loss": 0.4864, + "loss/crossentropy": 2.567805290222168, + "loss/hidden": 0.212890625, + "loss/logits": 0.04573212191462517, + "loss/reg": 0.022772807627916336, + "step": 1744 + }, + { + "epoch": 0.8725, + "grad_norm": 1.8819024562835693, + "grad_norm_var": 4.387550777046777, + "learning_rate": 2e-05, + "loss": 0.5153, + "loss/crossentropy": 2.206387996673584, + "loss/hidden": 0.23828125, + "loss/logits": 0.049341777339577675, + "loss/reg": 0.022770432755351067, + "step": 1745 + }, + { + "epoch": 0.873, + "grad_norm": 1.7741963863372803, + "grad_norm_var": 4.380321608464935, + "learning_rate": 2e-05, + "loss": 0.4103, + "loss/crossentropy": 2.610047698020935, + "loss/hidden": 0.15966796875, + "loss/logits": 0.022908887825906277, + "loss/reg": 0.022767851129174232, + "step": 1746 + }, + { + "epoch": 0.8735, + "grad_norm": 3.9074506759643555, + "grad_norm_var": 4.520894958490501, + "learning_rate": 2e-05, + "loss": 0.6931, + "loss/crossentropy": 2.1363461017608643, + "loss/hidden": 0.365234375, + "loss/logits": 0.10018501989543438, + "loss/reg": 0.02276543714106083, + "step": 1747 + }, + { + "epoch": 0.874, + "grad_norm": 1.198498249053955, + "grad_norm_var": 4.546248895237187, + "learning_rate": 2e-05, + "loss": 0.4262, + "loss/crossentropy": 2.4276458024978638, + "loss/hidden": 0.16796875, + "loss/logits": 0.030584653839468956, + "loss/reg": 0.022762905806303024, + "step": 1748 + }, + { + "epoch": 0.8745, + "grad_norm": 1.802208662033081, + "grad_norm_var": 4.531024858198469, + "learning_rate": 2e-05, + "loss": 0.4372, + "loss/crossentropy": 2.446824789047241, + "loss/hidden": 0.18017578125, + "loss/logits": 0.029467890039086342, + "loss/reg": 0.02276029624044895, + "step": 1749 + }, + { + "epoch": 0.875, + "grad_norm": 1.8234444856643677, + "grad_norm_var": 4.546376504661152, + "learning_rate": 2e-05, + "loss": 0.4537, + "loss/crossentropy": 2.524027705192566, + "loss/hidden": 0.18603515625, + "loss/logits": 0.04007681459188461, + "loss/reg": 0.022757630795240402, + "step": 1750 + }, + { + "epoch": 0.8755, + "grad_norm": 1.8809150457382202, + "grad_norm_var": 4.537158333397111, + "learning_rate": 2e-05, + "loss": 0.4648, + "loss/crossentropy": 2.4926512241363525, + "loss/hidden": 0.20556640625, + "loss/logits": 0.03168141841888428, + "loss/reg": 0.02275506965816021, + "step": 1751 + }, + { + "epoch": 0.876, + "grad_norm": 1.2176272869110107, + "grad_norm_var": 4.564967615041626, + "learning_rate": 2e-05, + "loss": 0.4126, + "loss/crossentropy": 2.351579189300537, + "loss/hidden": 0.15869140625, + "loss/logits": 0.026421986520290375, + "loss/reg": 0.022752393037080765, + "step": 1752 + }, + { + "epoch": 0.8765, + "grad_norm": 2.546253204345703, + "grad_norm_var": 0.4261500613285089, + "learning_rate": 2e-05, + "loss": 0.4591, + "loss/crossentropy": 2.254626989364624, + "loss/hidden": 0.19189453125, + "loss/logits": 0.039690613746643066, + "loss/reg": 0.022749925032258034, + "step": 1753 + }, + { + "epoch": 0.877, + "grad_norm": 1.4721755981445312, + "grad_norm_var": 0.4085867001765545, + "learning_rate": 2e-05, + "loss": 0.4259, + "loss/crossentropy": 2.4233288764953613, + "loss/hidden": 0.16748046875, + "loss/logits": 0.03093926515430212, + "loss/reg": 0.022747157141566277, + "step": 1754 + }, + { + "epoch": 0.8775, + "grad_norm": 1.2497280836105347, + "grad_norm_var": 0.43081935423290685, + "learning_rate": 2e-05, + "loss": 0.3901, + "loss/crossentropy": 2.395901918411255, + "loss/hidden": 0.138916015625, + "loss/logits": 0.023775647394359112, + "loss/reg": 0.022744452580809593, + "step": 1755 + }, + { + "epoch": 0.878, + "grad_norm": 1.8739466667175293, + "grad_norm_var": 0.4309550360190786, + "learning_rate": 2e-05, + "loss": 0.4859, + "loss/crossentropy": 2.5788021087646484, + "loss/hidden": 0.21923828125, + "loss/logits": 0.03924744948744774, + "loss/reg": 0.02274180017411709, + "step": 1756 + }, + { + "epoch": 0.8785, + "grad_norm": 1.8786344528198242, + "grad_norm_var": 0.43116057997378515, + "learning_rate": 2e-05, + "loss": 0.433, + "loss/crossentropy": 2.2879987955093384, + "loss/hidden": 0.17431640625, + "loss/logits": 0.03129947930574417, + "loss/reg": 0.02273917943239212, + "step": 1757 + }, + { + "epoch": 0.879, + "grad_norm": 1.6786879301071167, + "grad_norm_var": 0.4302863936647914, + "learning_rate": 2e-05, + "loss": 0.507, + "loss/crossentropy": 2.3784435987472534, + "loss/hidden": 0.236328125, + "loss/logits": 0.04326160717755556, + "loss/reg": 0.022736700251698494, + "step": 1758 + }, + { + "epoch": 0.8795, + "grad_norm": 1.2841837406158447, + "grad_norm_var": 0.42811919905549467, + "learning_rate": 2e-05, + "loss": 0.4027, + "loss/crossentropy": 2.5569673776626587, + "loss/hidden": 0.14990234375, + "loss/logits": 0.025482705794274807, + "loss/reg": 0.02273416332900524, + "step": 1759 + }, + { + "epoch": 0.88, + "grad_norm": 1.5086617469787598, + "grad_norm_var": 0.43328459560282606, + "learning_rate": 2e-05, + "loss": 0.5087, + "loss/crossentropy": 2.386002779006958, + "loss/hidden": 0.2529296875, + "loss/logits": 0.028444298543035984, + "loss/reg": 0.02273155376315117, + "step": 1760 + }, + { + "epoch": 0.8805, + "grad_norm": 1.7824084758758545, + "grad_norm_var": 0.4329647889707303, + "learning_rate": 2e-05, + "loss": 0.4786, + "loss/crossentropy": 2.3847200870513916, + "loss/hidden": 0.20849609375, + "loss/logits": 0.04277382045984268, + "loss/reg": 0.02272888645529747, + "step": 1761 + }, + { + "epoch": 0.881, + "grad_norm": 4.4069061279296875, + "grad_norm_var": 0.8553708809071431, + "learning_rate": 2e-05, + "loss": 0.4992, + "loss/crossentropy": 2.23597252368927, + "loss/hidden": 0.2265625, + "loss/logits": 0.04541921988129616, + "loss/reg": 0.02272612974047661, + "step": 1762 + }, + { + "epoch": 0.8815, + "grad_norm": 1.4857374429702759, + "grad_norm_var": 0.596154104293141, + "learning_rate": 2e-05, + "loss": 0.4126, + "loss/crossentropy": 2.1902058124542236, + "loss/hidden": 0.15771484375, + "loss/logits": 0.027693829499185085, + "loss/reg": 0.022723568603396416, + "step": 1763 + }, + { + "epoch": 0.882, + "grad_norm": 1.1129963397979736, + "grad_norm_var": 0.6036749302760668, + "learning_rate": 2e-05, + "loss": 0.404, + "loss/crossentropy": 2.1529598236083984, + "loss/hidden": 0.14794921875, + "loss/logits": 0.028801556676626205, + "loss/reg": 0.022720852866768837, + "step": 1764 + }, + { + "epoch": 0.8825, + "grad_norm": 1.3617935180664062, + "grad_norm_var": 0.6164186737964911, + "learning_rate": 2e-05, + "loss": 0.4394, + "loss/crossentropy": 2.4010159969329834, + "loss/hidden": 0.17578125, + "loss/logits": 0.03646496683359146, + "loss/reg": 0.022718340158462524, + "step": 1765 + }, + { + "epoch": 0.883, + "grad_norm": 1.3944129943847656, + "grad_norm_var": 0.6257383981751897, + "learning_rate": 2e-05, + "loss": 0.4295, + "loss/crossentropy": 2.249167323112488, + "loss/hidden": 0.17236328125, + "loss/logits": 0.029935719445347786, + "loss/reg": 0.022715754806995392, + "step": 1766 + }, + { + "epoch": 0.8835, + "grad_norm": 1.9540674686431885, + "grad_norm_var": 0.6272674150301994, + "learning_rate": 2e-05, + "loss": 0.445, + "loss/crossentropy": 2.6090357303619385, + "loss/hidden": 0.1884765625, + "loss/logits": 0.029395846650004387, + "loss/reg": 0.022712942212820053, + "step": 1767 + }, + { + "epoch": 0.884, + "grad_norm": 1.2891789674758911, + "grad_norm_var": 0.62238428300894, + "learning_rate": 2e-05, + "loss": 0.3992, + "loss/crossentropy": 2.690170645713806, + "loss/hidden": 0.146484375, + "loss/logits": 0.02565884869545698, + "loss/reg": 0.022710150107741356, + "step": 1768 + }, + { + "epoch": 0.8845, + "grad_norm": 1.2826368808746338, + "grad_norm_var": 0.590971243638222, + "learning_rate": 2e-05, + "loss": 0.4288, + "loss/crossentropy": 2.5179227590560913, + "loss/hidden": 0.17236328125, + "loss/logits": 0.029347356408834457, + "loss/reg": 0.022707320749759674, + "step": 1769 + }, + { + "epoch": 0.885, + "grad_norm": 1.7831594944000244, + "grad_norm_var": 0.588045487335727, + "learning_rate": 2e-05, + "loss": 0.4584, + "loss/crossentropy": 2.4644254446029663, + "loss/hidden": 0.197265625, + "loss/logits": 0.03404002822935581, + "loss/reg": 0.022704841569066048, + "step": 1770 + }, + { + "epoch": 0.8855, + "grad_norm": 1.3630361557006836, + "grad_norm_var": 0.5819252647022783, + "learning_rate": 2e-05, + "loss": 0.4236, + "loss/crossentropy": 2.4129849672317505, + "loss/hidden": 0.1630859375, + "loss/logits": 0.03353873174637556, + "loss/reg": 0.022702371701598167, + "step": 1771 + }, + { + "epoch": 0.886, + "grad_norm": 1.395241379737854, + "grad_norm_var": 0.5861043275034279, + "learning_rate": 2e-05, + "loss": 0.452, + "loss/crossentropy": 2.3290704488754272, + "loss/hidden": 0.1845703125, + "loss/logits": 0.04044055938720703, + "loss/reg": 0.022700009867548943, + "step": 1772 + }, + { + "epoch": 0.8865, + "grad_norm": 1.7868410348892212, + "grad_norm_var": 0.584262372098181, + "learning_rate": 2e-05, + "loss": 0.5297, + "loss/crossentropy": 2.2407915592193604, + "loss/hidden": 0.25830078125, + "loss/logits": 0.044432349503040314, + "loss/reg": 0.022697754204273224, + "step": 1773 + }, + { + "epoch": 0.887, + "grad_norm": 1.151076316833496, + "grad_norm_var": 0.6017088609785968, + "learning_rate": 2e-05, + "loss": 0.4063, + "loss/crossentropy": 2.358174681663513, + "loss/hidden": 0.15185546875, + "loss/logits": 0.027467947453260422, + "loss/reg": 0.022695155814290047, + "step": 1774 + }, + { + "epoch": 0.8875, + "grad_norm": 1.4789233207702637, + "grad_norm_var": 0.5946741348237327, + "learning_rate": 2e-05, + "loss": 0.492, + "loss/crossentropy": 2.0869343280792236, + "loss/hidden": 0.21875, + "loss/logits": 0.046349382027983665, + "loss/reg": 0.022692805156111717, + "step": 1775 + }, + { + "epoch": 0.888, + "grad_norm": 1.7221815586090088, + "grad_norm_var": 0.5932558452639825, + "learning_rate": 2e-05, + "loss": 0.4587, + "loss/crossentropy": 2.397850751876831, + "loss/hidden": 0.19140625, + "loss/logits": 0.04042772948741913, + "loss/reg": 0.02269033156335354, + "step": 1776 + }, + { + "epoch": 0.8885, + "grad_norm": 1.2972921133041382, + "grad_norm_var": 0.6008173321053277, + "learning_rate": 2e-05, + "loss": 0.4126, + "loss/crossentropy": 2.3940863609313965, + "loss/hidden": 0.15673828125, + "loss/logits": 0.028958087787032127, + "loss/reg": 0.02268776297569275, + "step": 1777 + }, + { + "epoch": 0.889, + "grad_norm": 1.3768194913864136, + "grad_norm_var": 0.05743777499220073, + "learning_rate": 2e-05, + "loss": 0.4215, + "loss/crossentropy": 2.385925769805908, + "loss/hidden": 0.162109375, + "loss/logits": 0.03253248520195484, + "loss/reg": 0.022685421630740166, + "step": 1778 + }, + { + "epoch": 0.8895, + "grad_norm": 1.526502013206482, + "grad_norm_var": 0.057723853573745744, + "learning_rate": 2e-05, + "loss": 0.4341, + "loss/crossentropy": 2.2163840532302856, + "loss/hidden": 0.17578125, + "loss/logits": 0.03145410865545273, + "loss/reg": 0.022683102637529373, + "step": 1779 + }, + { + "epoch": 0.89, + "grad_norm": 1.6970871686935425, + "grad_norm_var": 0.05243035328896462, + "learning_rate": 2e-05, + "loss": 0.4789, + "loss/crossentropy": 2.2332208156585693, + "loss/hidden": 0.20654296875, + "loss/logits": 0.04551873542368412, + "loss/reg": 0.022680532187223434, + "step": 1780 + }, + { + "epoch": 0.8905, + "grad_norm": 1.4872901439666748, + "grad_norm_var": 0.051248249436363774, + "learning_rate": 2e-05, + "loss": 0.4758, + "loss/crossentropy": 2.301728844642639, + "loss/hidden": 0.2080078125, + "loss/logits": 0.04098478890955448, + "loss/reg": 0.022678013890981674, + "step": 1781 + }, + { + "epoch": 0.891, + "grad_norm": 1.4966388940811157, + "grad_norm_var": 0.050474361598935265, + "learning_rate": 2e-05, + "loss": 0.4928, + "loss/crossentropy": 2.034238338470459, + "loss/hidden": 0.22265625, + "loss/logits": 0.04338419623672962, + "loss/reg": 0.022675424814224243, + "step": 1782 + }, + { + "epoch": 0.8915, + "grad_norm": 1.5417438745498657, + "grad_norm_var": 0.03643927829848502, + "learning_rate": 2e-05, + "loss": 0.4808, + "loss/crossentropy": 2.180203914642334, + "loss/hidden": 0.2060546875, + "loss/logits": 0.048051947727799416, + "loss/reg": 0.02267291769385338, + "step": 1783 + }, + { + "epoch": 0.892, + "grad_norm": 1.3023936748504639, + "grad_norm_var": 0.03611445252943189, + "learning_rate": 2e-05, + "loss": 0.4562, + "loss/crossentropy": 2.403857707977295, + "loss/hidden": 0.19189453125, + "loss/logits": 0.037562835961580276, + "loss/reg": 0.022670235484838486, + "step": 1784 + }, + { + "epoch": 0.8925, + "grad_norm": 1.5010074377059937, + "grad_norm_var": 0.033332240131487785, + "learning_rate": 2e-05, + "loss": 0.4343, + "loss/crossentropy": 2.3974303007125854, + "loss/hidden": 0.17431640625, + "loss/logits": 0.03334982506930828, + "loss/reg": 0.02266768552362919, + "step": 1785 + }, + { + "epoch": 0.893, + "grad_norm": 1.8571279048919678, + "grad_norm_var": 0.036524026921363806, + "learning_rate": 2e-05, + "loss": 0.4137, + "loss/crossentropy": 2.7112414836883545, + "loss/hidden": 0.1552734375, + "loss/logits": 0.03175277356058359, + "loss/reg": 0.022665170952677727, + "step": 1786 + }, + { + "epoch": 0.8935, + "grad_norm": 1.2478469610214233, + "grad_norm_var": 0.0394388347318376, + "learning_rate": 2e-05, + "loss": 0.4274, + "loss/crossentropy": 2.3169617652893066, + "loss/hidden": 0.1708984375, + "loss/logits": 0.02990701049566269, + "loss/reg": 0.02266273833811283, + "step": 1787 + }, + { + "epoch": 0.894, + "grad_norm": 2.265127182006836, + "grad_norm_var": 0.07555353783638541, + "learning_rate": 2e-05, + "loss": 0.4095, + "loss/crossentropy": 2.3823176622390747, + "loss/hidden": 0.16015625, + "loss/logits": 0.02273565251380205, + "loss/reg": 0.02266021817922592, + "step": 1788 + }, + { + "epoch": 0.8945, + "grad_norm": 1.3702856302261353, + "grad_norm_var": 0.07302160323975777, + "learning_rate": 2e-05, + "loss": 0.3994, + "loss/crossentropy": 2.4472192525863647, + "loss/hidden": 0.14794921875, + "loss/logits": 0.02484053187072277, + "loss/reg": 0.02265772968530655, + "step": 1789 + }, + { + "epoch": 0.895, + "grad_norm": 1.5002284049987793, + "grad_norm_var": 0.06346798172954345, + "learning_rate": 2e-05, + "loss": 0.4878, + "loss/crossentropy": 2.291977286338806, + "loss/hidden": 0.2177734375, + "loss/logits": 0.043478766456246376, + "loss/reg": 0.022655179724097252, + "step": 1790 + }, + { + "epoch": 0.8955, + "grad_norm": 1.1956472396850586, + "grad_norm_var": 0.07085745843397048, + "learning_rate": 2e-05, + "loss": 0.421, + "loss/crossentropy": 2.269936203956604, + "loss/hidden": 0.16552734375, + "loss/logits": 0.02890065312385559, + "loss/reg": 0.02265259623527527, + "step": 1791 + }, + { + "epoch": 0.896, + "grad_norm": 1.631693720817566, + "grad_norm_var": 0.068979061781067, + "learning_rate": 2e-05, + "loss": 0.4571, + "loss/crossentropy": 2.319582223892212, + "loss/hidden": 0.193359375, + "loss/logits": 0.0372174559161067, + "loss/reg": 0.02265011891722679, + "step": 1792 + }, + { + "epoch": 0.8965, + "grad_norm": 1.1987940073013306, + "grad_norm_var": 0.07248952922075773, + "learning_rate": 2e-05, + "loss": 0.4329, + "loss/crossentropy": 2.2927104234695435, + "loss/hidden": 0.17431640625, + "loss/logits": 0.03212358243763447, + "loss/reg": 0.022647712379693985, + "step": 1793 + }, + { + "epoch": 0.897, + "grad_norm": 2.1740849018096924, + "grad_norm_var": 0.09781844329650032, + "learning_rate": 2e-05, + "loss": 0.5068, + "loss/crossentropy": 1.931494951248169, + "loss/hidden": 0.24462890625, + "loss/logits": 0.03571862727403641, + "loss/reg": 0.022645175457000732, + "step": 1794 + }, + { + "epoch": 0.8975, + "grad_norm": 2.1789815425872803, + "grad_norm_var": 0.12133015992480196, + "learning_rate": 2e-05, + "loss": 0.4422, + "loss/crossentropy": 2.1458447575569153, + "loss/hidden": 0.1875, + "loss/logits": 0.02825307659804821, + "loss/reg": 0.022642606869339943, + "step": 1795 + }, + { + "epoch": 0.898, + "grad_norm": 1.4859169721603394, + "grad_norm_var": 0.12146453537655641, + "learning_rate": 2e-05, + "loss": 0.458, + "loss/crossentropy": 2.554581642150879, + "loss/hidden": 0.19189453125, + "loss/logits": 0.03974371217191219, + "loss/reg": 0.02264014631509781, + "step": 1796 + }, + { + "epoch": 0.8985, + "grad_norm": 1.7171391248703003, + "grad_norm_var": 0.121628688093484, + "learning_rate": 2e-05, + "loss": 0.4388, + "loss/crossentropy": 2.5617785453796387, + "loss/hidden": 0.17626953125, + "loss/logits": 0.03613244369626045, + "loss/reg": 0.022637590765953064, + "step": 1797 + }, + { + "epoch": 0.899, + "grad_norm": 1.327169418334961, + "grad_norm_var": 0.12585053460300416, + "learning_rate": 2e-05, + "loss": 0.4193, + "loss/crossentropy": 2.2237210273742676, + "loss/hidden": 0.1611328125, + "loss/logits": 0.03177984245121479, + "loss/reg": 0.022635027766227722, + "step": 1798 + }, + { + "epoch": 0.8995, + "grad_norm": 1.7142618894577026, + "grad_norm_var": 0.12652134086684536, + "learning_rate": 2e-05, + "loss": 0.4097, + "loss/crossentropy": 2.3350863456726074, + "loss/hidden": 0.153564453125, + "loss/logits": 0.02983055729418993, + "loss/reg": 0.02263249270617962, + "step": 1799 + }, + { + "epoch": 0.9, + "grad_norm": 1.9557993412017822, + "grad_norm_var": 0.12690867583912677, + "learning_rate": 2e-05, + "loss": 0.4866, + "loss/crossentropy": 2.4636796712875366, + "loss/hidden": 0.21826171875, + "loss/logits": 0.042068254202604294, + "loss/reg": 0.02263004146516323, + "step": 1800 + }, + { + "epoch": 0.9005, + "grad_norm": 1.1416701078414917, + "grad_norm_var": 0.14188113240769837, + "learning_rate": 2e-05, + "loss": 0.4022, + "loss/crossentropy": 2.5174624919891357, + "loss/hidden": 0.14794921875, + "loss/logits": 0.02796847652643919, + "loss/reg": 0.022627437487244606, + "step": 1801 + }, + { + "epoch": 0.901, + "grad_norm": 1.4578477144241333, + "grad_norm_var": 0.13936010822746894, + "learning_rate": 2e-05, + "loss": 0.4365, + "loss/crossentropy": 2.3277111053466797, + "loss/hidden": 0.17578125, + "loss/logits": 0.03451576270163059, + "loss/reg": 0.02262502908706665, + "step": 1802 + }, + { + "epoch": 0.9015, + "grad_norm": 2.0312423706054688, + "grad_norm_var": 0.14117838718365097, + "learning_rate": 2e-05, + "loss": 0.3985, + "loss/crossentropy": 2.541975498199463, + "loss/hidden": 0.1455078125, + "loss/logits": 0.026732699014246464, + "loss/reg": 0.02262257970869541, + "step": 1803 + }, + { + "epoch": 0.902, + "grad_norm": 2.132697820663452, + "grad_norm_var": 0.13135331477077988, + "learning_rate": 2e-05, + "loss": 0.4634, + "loss/crossentropy": 2.439447283744812, + "loss/hidden": 0.2001953125, + "loss/logits": 0.03704650327563286, + "loss/reg": 0.022620007395744324, + "step": 1804 + }, + { + "epoch": 0.9025, + "grad_norm": 1.7248797416687012, + "grad_norm_var": 0.12653841640955357, + "learning_rate": 2e-05, + "loss": 0.4427, + "loss/crossentropy": 2.479643940925598, + "loss/hidden": 0.17822265625, + "loss/logits": 0.038322363048791885, + "loss/reg": 0.02261737734079361, + "step": 1805 + }, + { + "epoch": 0.903, + "grad_norm": 1.5996307134628296, + "grad_norm_var": 0.12503174039449808, + "learning_rate": 2e-05, + "loss": 0.4873, + "loss/crossentropy": 2.3944002389907837, + "loss/hidden": 0.21728515625, + "loss/logits": 0.04385751113295555, + "loss/reg": 0.02261476404964924, + "step": 1806 + }, + { + "epoch": 0.9035, + "grad_norm": 2.0911006927490234, + "grad_norm_var": 0.11890385472204343, + "learning_rate": 2e-05, + "loss": 0.4095, + "loss/crossentropy": 2.336695432662964, + "loss/hidden": 0.158447265625, + "loss/logits": 0.024968229234218597, + "loss/reg": 0.022612126544117928, + "step": 1807 + }, + { + "epoch": 0.904, + "grad_norm": 1.5537153482437134, + "grad_norm_var": 0.12022990836071562, + "learning_rate": 2e-05, + "loss": 0.4726, + "loss/crossentropy": 2.2479125261306763, + "loss/hidden": 0.22021484375, + "loss/logits": 0.026327339932322502, + "loss/reg": 0.022609485313296318, + "step": 1808 + }, + { + "epoch": 0.9045, + "grad_norm": 1.2499186992645264, + "grad_norm_var": 0.11685534109740553, + "learning_rate": 2e-05, + "loss": 0.4078, + "loss/crossentropy": 2.4833693504333496, + "loss/hidden": 0.1494140625, + "loss/logits": 0.03229031339287758, + "loss/reg": 0.02260700799524784, + "step": 1809 + }, + { + "epoch": 0.905, + "grad_norm": 1.5342835187911987, + "grad_norm_var": 0.10378850866723704, + "learning_rate": 2e-05, + "loss": 0.4023, + "loss/crossentropy": 2.1959651708602905, + "loss/hidden": 0.150390625, + "loss/logits": 0.02590491622686386, + "loss/reg": 0.022604528814554214, + "step": 1810 + }, + { + "epoch": 0.9055, + "grad_norm": 1.044647455215454, + "grad_norm_var": 0.10889354132386113, + "learning_rate": 2e-05, + "loss": 0.3867, + "loss/crossentropy": 2.1947706937789917, + "loss/hidden": 0.1357421875, + "loss/logits": 0.0249461866915226, + "loss/reg": 0.022602051496505737, + "step": 1811 + }, + { + "epoch": 0.906, + "grad_norm": 3.011995792388916, + "grad_norm_var": 0.2291783334976803, + "learning_rate": 2e-05, + "loss": 0.5375, + "loss/crossentropy": 2.6274040937423706, + "loss/hidden": 0.26025390625, + "loss/logits": 0.05128267593681812, + "loss/reg": 0.022599538788199425, + "step": 1812 + }, + { + "epoch": 0.9065, + "grad_norm": 1.5391746759414673, + "grad_norm_var": 0.2308816121342284, + "learning_rate": 2e-05, + "loss": 0.436, + "loss/crossentropy": 2.5975476503372192, + "loss/hidden": 0.1787109375, + "loss/logits": 0.031343039125204086, + "loss/reg": 0.022597048431634903, + "step": 1813 + }, + { + "epoch": 0.907, + "grad_norm": 1.6539846658706665, + "grad_norm_var": 0.22155591112929945, + "learning_rate": 2e-05, + "loss": 0.4405, + "loss/crossentropy": 2.389395594596863, + "loss/hidden": 0.1796875, + "loss/logits": 0.03484657034277916, + "loss/reg": 0.022594643756747246, + "step": 1814 + }, + { + "epoch": 0.9075, + "grad_norm": 1.2229188680648804, + "grad_norm_var": 0.23667999380509078, + "learning_rate": 2e-05, + "loss": 0.4212, + "loss/crossentropy": 2.35276997089386, + "loss/hidden": 0.1650390625, + "loss/logits": 0.030212889425456524, + "loss/reg": 0.02259230427443981, + "step": 1815 + }, + { + "epoch": 0.908, + "grad_norm": 1.4916632175445557, + "grad_norm_var": 0.23332946859573647, + "learning_rate": 2e-05, + "loss": 0.4277, + "loss/crossentropy": 2.2875664830207825, + "loss/hidden": 0.16650390625, + "loss/logits": 0.035321952775120735, + "loss/reg": 0.02258998341858387, + "step": 1816 + }, + { + "epoch": 0.9085, + "grad_norm": 1.2740663290023804, + "grad_norm_var": 0.2253617779282422, + "learning_rate": 2e-05, + "loss": 0.4286, + "loss/crossentropy": 2.3370351791381836, + "loss/hidden": 0.16455078125, + "loss/logits": 0.03822075389325619, + "loss/reg": 0.02258743718266487, + "step": 1817 + }, + { + "epoch": 0.909, + "grad_norm": 1.2872508764266968, + "grad_norm_var": 0.23185537664945718, + "learning_rate": 2e-05, + "loss": 0.4221, + "loss/crossentropy": 2.4487507343292236, + "loss/hidden": 0.16357421875, + "loss/logits": 0.03271046280860901, + "loss/reg": 0.02258501760661602, + "step": 1818 + }, + { + "epoch": 0.9095, + "grad_norm": 1.5939817428588867, + "grad_norm_var": 0.2217355171208072, + "learning_rate": 2e-05, + "loss": 0.4138, + "loss/crossentropy": 2.6092634201049805, + "loss/hidden": 0.16015625, + "loss/logits": 0.02780964784324169, + "loss/reg": 0.022582601755857468, + "step": 1819 + }, + { + "epoch": 0.91, + "grad_norm": 16.670120239257812, + "grad_norm_var": 14.413642548278558, + "learning_rate": 2e-05, + "loss": 0.5563, + "loss/crossentropy": 2.3885061740875244, + "loss/hidden": 0.2939453125, + "loss/logits": 0.036548784002661705, + "loss/reg": 0.022580305114388466, + "step": 1820 + }, + { + "epoch": 0.9105, + "grad_norm": 1.2790286540985107, + "grad_norm_var": 14.474163637655296, + "learning_rate": 2e-05, + "loss": 0.373, + "loss/crossentropy": 2.6208510398864746, + "loss/hidden": 0.12646484375, + "loss/logits": 0.020793078001588583, + "loss/reg": 0.022577952593564987, + "step": 1821 + }, + { + "epoch": 0.911, + "grad_norm": 2.2307803630828857, + "grad_norm_var": 14.422778758807375, + "learning_rate": 2e-05, + "loss": 0.4049, + "loss/crossentropy": 2.3613555431365967, + "loss/hidden": 0.1572265625, + "loss/logits": 0.02187713049352169, + "loss/reg": 0.022575698792934418, + "step": 1822 + }, + { + "epoch": 0.9115, + "grad_norm": 1.8077691793441772, + "grad_norm_var": 14.44496363143063, + "learning_rate": 2e-05, + "loss": 0.4596, + "loss/crossentropy": 2.2239835262298584, + "loss/hidden": 0.20068359375, + "loss/logits": 0.03322533704340458, + "loss/reg": 0.022573480382561684, + "step": 1823 + }, + { + "epoch": 0.912, + "grad_norm": 1.8814700841903687, + "grad_norm_var": 14.409108100365703, + "learning_rate": 2e-05, + "loss": 0.4243, + "loss/crossentropy": 2.503218650817871, + "loss/hidden": 0.16748046875, + "loss/logits": 0.031097950413823128, + "loss/reg": 0.022571343928575516, + "step": 1824 + }, + { + "epoch": 0.9125, + "grad_norm": 1.4032585620880127, + "grad_norm_var": 14.384031530190613, + "learning_rate": 2e-05, + "loss": 0.4415, + "loss/crossentropy": 2.498712182044983, + "loss/hidden": 0.18408203125, + "loss/logits": 0.03172986023128033, + "loss/reg": 0.022569168359041214, + "step": 1825 + }, + { + "epoch": 0.913, + "grad_norm": 1.4820616245269775, + "grad_norm_var": 14.391329331953612, + "learning_rate": 2e-05, + "loss": 0.3925, + "loss/crossentropy": 2.2149962186813354, + "loss/hidden": 0.14404296875, + "loss/logits": 0.022828245535492897, + "loss/reg": 0.022566672414541245, + "step": 1826 + }, + { + "epoch": 0.9135, + "grad_norm": 1.4449611902236938, + "grad_norm_var": 14.320749149874818, + "learning_rate": 2e-05, + "loss": 0.4056, + "loss/crossentropy": 2.385851740837097, + "loss/hidden": 0.14794921875, + "loss/logits": 0.03205075114965439, + "loss/reg": 0.022564470767974854, + "step": 1827 + }, + { + "epoch": 0.914, + "grad_norm": 1.4951767921447754, + "grad_norm_var": 14.377107771874954, + "learning_rate": 2e-05, + "loss": 0.4165, + "loss/crossentropy": 2.23067569732666, + "loss/hidden": 0.16015625, + "loss/logits": 0.030674993991851807, + "loss/reg": 0.02256196364760399, + "step": 1828 + }, + { + "epoch": 0.9145, + "grad_norm": 1.4566495418548584, + "grad_norm_var": 14.38793906557842, + "learning_rate": 2e-05, + "loss": 0.4193, + "loss/crossentropy": 2.438162684440613, + "loss/hidden": 0.162109375, + "loss/logits": 0.03160354122519493, + "loss/reg": 0.02255944348871708, + "step": 1829 + }, + { + "epoch": 0.915, + "grad_norm": 1.2917461395263672, + "grad_norm_var": 14.436020724601905, + "learning_rate": 2e-05, + "loss": 0.4317, + "loss/crossentropy": 2.349228262901306, + "loss/hidden": 0.1708984375, + "loss/logits": 0.03521360456943512, + "loss/reg": 0.022557225078344345, + "step": 1830 + }, + { + "epoch": 0.9155, + "grad_norm": 1.3939363956451416, + "grad_norm_var": 14.40970744042121, + "learning_rate": 2e-05, + "loss": 0.376, + "loss/crossentropy": 2.434686541557312, + "loss/hidden": 0.130126953125, + "loss/logits": 0.020286419428884983, + "loss/reg": 0.022554853931069374, + "step": 1831 + }, + { + "epoch": 0.916, + "grad_norm": 1.1982911825180054, + "grad_norm_var": 14.453267319482267, + "learning_rate": 2e-05, + "loss": 0.406, + "loss/crossentropy": 2.2608832120895386, + "loss/hidden": 0.1513671875, + "loss/logits": 0.02907765470445156, + "loss/reg": 0.02255268208682537, + "step": 1832 + }, + { + "epoch": 0.9165, + "grad_norm": 1.3199797868728638, + "grad_norm_var": 14.446203864298448, + "learning_rate": 2e-05, + "loss": 0.4498, + "loss/crossentropy": 2.0910937786102295, + "loss/hidden": 0.1884765625, + "loss/logits": 0.03581584058701992, + "loss/reg": 0.022550417110323906, + "step": 1833 + }, + { + "epoch": 0.917, + "grad_norm": 1.717532753944397, + "grad_norm_var": 14.390936544297686, + "learning_rate": 2e-05, + "loss": 0.4676, + "loss/crossentropy": 2.2329607009887695, + "loss/hidden": 0.20849609375, + "loss/logits": 0.03362170793116093, + "loss/reg": 0.022548070177435875, + "step": 1834 + }, + { + "epoch": 0.9175, + "grad_norm": 1.6943548917770386, + "grad_norm_var": 14.379719646058886, + "learning_rate": 2e-05, + "loss": 0.4554, + "loss/crossentropy": 2.37869393825531, + "loss/hidden": 0.19384765625, + "loss/logits": 0.036072161979973316, + "loss/reg": 0.022545799612998962, + "step": 1835 + }, + { + "epoch": 0.918, + "grad_norm": 2.670581817626953, + "grad_norm_var": 0.15172412365397647, + "learning_rate": 2e-05, + "loss": 0.5334, + "loss/crossentropy": 2.3655420541763306, + "loss/hidden": 0.2705078125, + "loss/logits": 0.03744707256555557, + "loss/reg": 0.022543571889400482, + "step": 1836 + }, + { + "epoch": 0.9185, + "grad_norm": 1.4796888828277588, + "grad_norm_var": 0.14537294518872693, + "learning_rate": 2e-05, + "loss": 0.446, + "loss/crossentropy": 2.2026679515838623, + "loss/hidden": 0.1884765625, + "loss/logits": 0.03215072676539421, + "loss/reg": 0.022541362792253494, + "step": 1837 + }, + { + "epoch": 0.919, + "grad_norm": 1.465540885925293, + "grad_norm_var": 0.11996093294203304, + "learning_rate": 2e-05, + "loss": 0.3955, + "loss/crossentropy": 2.6018182039260864, + "loss/hidden": 0.146484375, + "loss/logits": 0.023670999333262444, + "loss/reg": 0.022538956254720688, + "step": 1838 + }, + { + "epoch": 0.9195, + "grad_norm": 2.6521716117858887, + "grad_norm_var": 0.19071007315725008, + "learning_rate": 2e-05, + "loss": 0.5236, + "loss/crossentropy": 2.290159225463867, + "loss/hidden": 0.25390625, + "loss/logits": 0.044362759217619896, + "loss/reg": 0.022536424919962883, + "step": 1839 + }, + { + "epoch": 0.92, + "grad_norm": 1.2103041410446167, + "grad_norm_var": 0.19617798026988734, + "learning_rate": 2e-05, + "loss": 0.4108, + "loss/crossentropy": 2.3787648677825928, + "loss/hidden": 0.15185546875, + "loss/logits": 0.033578867092728615, + "loss/reg": 0.022533901035785675, + "step": 1840 + }, + { + "epoch": 0.9205, + "grad_norm": 1.3619086742401123, + "grad_norm_var": 0.19729243671530577, + "learning_rate": 2e-05, + "loss": 0.3874, + "loss/crossentropy": 2.400606870651245, + "loss/hidden": 0.138671875, + "loss/logits": 0.023377398028969765, + "loss/reg": 0.022531181573867798, + "step": 1841 + }, + { + "epoch": 0.921, + "grad_norm": 1.7256075143814087, + "grad_norm_var": 0.19770787293851314, + "learning_rate": 2e-05, + "loss": 0.3938, + "loss/crossentropy": 2.6239322423934937, + "loss/hidden": 0.13623046875, + "loss/logits": 0.03229370526969433, + "loss/reg": 0.02252843603491783, + "step": 1842 + }, + { + "epoch": 0.9215, + "grad_norm": 1.4745891094207764, + "grad_norm_var": 0.1971555977191843, + "learning_rate": 2e-05, + "loss": 0.3882, + "loss/crossentropy": 2.5068957805633545, + "loss/hidden": 0.13671875, + "loss/logits": 0.026268533430993557, + "loss/reg": 0.022525638341903687, + "step": 1843 + }, + { + "epoch": 0.922, + "grad_norm": 1.3436163663864136, + "grad_norm_var": 0.20071971118220464, + "learning_rate": 2e-05, + "loss": 0.3943, + "loss/crossentropy": 2.5273977518081665, + "loss/hidden": 0.14453125, + "loss/logits": 0.024529898539185524, + "loss/reg": 0.02252272516489029, + "step": 1844 + }, + { + "epoch": 0.9225, + "grad_norm": 2.0100479125976562, + "grad_norm_var": 0.209944773268783, + "learning_rate": 2e-05, + "loss": 0.4533, + "loss/crossentropy": 2.466127634048462, + "loss/hidden": 0.193359375, + "loss/logits": 0.03471413720399141, + "loss/reg": 0.02252020128071308, + "step": 1845 + }, + { + "epoch": 0.923, + "grad_norm": 1.6746459007263184, + "grad_norm_var": 0.20206274459075116, + "learning_rate": 2e-05, + "loss": 0.4434, + "loss/crossentropy": 2.244032859802246, + "loss/hidden": 0.18359375, + "loss/logits": 0.034620098769664764, + "loss/reg": 0.02251766063272953, + "step": 1846 + }, + { + "epoch": 0.9235, + "grad_norm": 1.7665760517120361, + "grad_norm_var": 0.19804128550095795, + "learning_rate": 2e-05, + "loss": 0.4023, + "loss/crossentropy": 2.1969178915023804, + "loss/hidden": 0.15087890625, + "loss/logits": 0.026284687221050262, + "loss/reg": 0.022515103220939636, + "step": 1847 + }, + { + "epoch": 0.924, + "grad_norm": 2.707465648651123, + "grad_norm_var": 0.24490152911908372, + "learning_rate": 2e-05, + "loss": 0.4971, + "loss/crossentropy": 2.5745354890823364, + "loss/hidden": 0.23193359375, + "loss/logits": 0.0400242879986763, + "loss/reg": 0.022512590512633324, + "step": 1848 + }, + { + "epoch": 0.9245, + "grad_norm": 1.9823905229568481, + "grad_norm_var": 0.23282989475386653, + "learning_rate": 2e-05, + "loss": 0.4681, + "loss/crossentropy": 2.053453028202057, + "loss/hidden": 0.20849609375, + "loss/logits": 0.03454894572496414, + "loss/reg": 0.02251008152961731, + "step": 1849 + }, + { + "epoch": 0.925, + "grad_norm": 1.7281039953231812, + "grad_norm_var": 0.23270857087946387, + "learning_rate": 2e-05, + "loss": 0.4487, + "loss/crossentropy": 2.4333807229995728, + "loss/hidden": 0.18359375, + "loss/logits": 0.04003257304430008, + "loss/reg": 0.022507477551698685, + "step": 1850 + }, + { + "epoch": 0.9255, + "grad_norm": 1.4039613008499146, + "grad_norm_var": 0.24242675596621838, + "learning_rate": 2e-05, + "loss": 0.4234, + "loss/crossentropy": 2.3681410551071167, + "loss/hidden": 0.1640625, + "loss/logits": 0.03433472663164139, + "loss/reg": 0.022504812106490135, + "step": 1851 + }, + { + "epoch": 0.926, + "grad_norm": 1.361372947692871, + "grad_norm_var": 0.196025750965984, + "learning_rate": 2e-05, + "loss": 0.4174, + "loss/crossentropy": 2.1348154544830322, + "loss/hidden": 0.1572265625, + "loss/logits": 0.03517003171145916, + "loss/reg": 0.022502336651086807, + "step": 1852 + }, + { + "epoch": 0.9265, + "grad_norm": 1.5376478433609009, + "grad_norm_var": 0.19446169115935935, + "learning_rate": 2e-05, + "loss": 0.4399, + "loss/crossentropy": 2.3104746341705322, + "loss/hidden": 0.1748046875, + "loss/logits": 0.04014399088919163, + "loss/reg": 0.022499844431877136, + "step": 1853 + }, + { + "epoch": 0.927, + "grad_norm": 1.6215567588806152, + "grad_norm_var": 0.19083799211992075, + "learning_rate": 2e-05, + "loss": 0.4836, + "loss/crossentropy": 2.3783109188079834, + "loss/hidden": 0.2099609375, + "loss/logits": 0.0486428327858448, + "loss/reg": 0.02249729447066784, + "step": 1854 + }, + { + "epoch": 0.9275, + "grad_norm": 1.4279130697250366, + "grad_norm_var": 0.1327791587908031, + "learning_rate": 2e-05, + "loss": 0.3999, + "loss/crossentropy": 2.4454482793807983, + "loss/hidden": 0.146484375, + "loss/logits": 0.02843039110302925, + "loss/reg": 0.022494826465845108, + "step": 1855 + }, + { + "epoch": 0.928, + "grad_norm": 1.3840880393981934, + "grad_norm_var": 0.12456864834301858, + "learning_rate": 2e-05, + "loss": 0.4034, + "loss/crossentropy": 2.2915083169937134, + "loss/hidden": 0.15185546875, + "loss/logits": 0.026664272882044315, + "loss/reg": 0.02249237336218357, + "step": 1856 + }, + { + "epoch": 0.9285, + "grad_norm": 1.6339174509048462, + "grad_norm_var": 0.11849177496741632, + "learning_rate": 2e-05, + "loss": 0.4585, + "loss/crossentropy": 2.2919251918792725, + "loss/hidden": 0.1962890625, + "loss/logits": 0.03729063458740711, + "loss/reg": 0.022490020841360092, + "step": 1857 + }, + { + "epoch": 0.929, + "grad_norm": 1.4191261529922485, + "grad_norm_var": 0.12225227678708066, + "learning_rate": 2e-05, + "loss": 0.3906, + "loss/crossentropy": 2.4586825370788574, + "loss/hidden": 0.13818359375, + "loss/logits": 0.027546225115656853, + "loss/reg": 0.022487731650471687, + "step": 1858 + }, + { + "epoch": 0.9295, + "grad_norm": 1.765230417251587, + "grad_norm_var": 0.12054770545048896, + "learning_rate": 2e-05, + "loss": 0.4006, + "loss/crossentropy": 2.3640472888946533, + "loss/hidden": 0.1484375, + "loss/logits": 0.027359573170542717, + "loss/reg": 0.022485224530100822, + "step": 1859 + }, + { + "epoch": 0.93, + "grad_norm": 1.314341425895691, + "grad_norm_var": 0.12188687798420096, + "learning_rate": 2e-05, + "loss": 0.3958, + "loss/crossentropy": 2.542987108230591, + "loss/hidden": 0.1455078125, + "loss/logits": 0.025465862825512886, + "loss/reg": 0.022482680156826973, + "step": 1860 + }, + { + "epoch": 0.9305, + "grad_norm": 1.4204936027526855, + "grad_norm_var": 0.11697036921642787, + "learning_rate": 2e-05, + "loss": 0.3786, + "loss/crossentropy": 2.3337528705596924, + "loss/hidden": 0.13330078125, + "loss/logits": 0.0205409936606884, + "loss/reg": 0.0224803127348423, + "step": 1861 + }, + { + "epoch": 0.931, + "grad_norm": 1.553285002708435, + "grad_norm_var": 0.11723807462238127, + "learning_rate": 2e-05, + "loss": 0.4207, + "loss/crossentropy": 2.5051403045654297, + "loss/hidden": 0.16357421875, + "loss/logits": 0.032310767099261284, + "loss/reg": 0.022477777674794197, + "step": 1862 + }, + { + "epoch": 0.9315, + "grad_norm": 1.88336980342865, + "grad_norm_var": 0.12026858023636061, + "learning_rate": 2e-05, + "loss": 0.4247, + "loss/crossentropy": 2.7639983892440796, + "loss/hidden": 0.16650390625, + "loss/logits": 0.033397359773516655, + "loss/reg": 0.022475138306617737, + "step": 1863 + }, + { + "epoch": 0.932, + "grad_norm": 1.633898377418518, + "grad_norm_var": 0.03864676483869018, + "learning_rate": 2e-05, + "loss": 0.4802, + "loss/crossentropy": 2.3575611114501953, + "loss/hidden": 0.22021484375, + "loss/logits": 0.035267666913568974, + "loss/reg": 0.022472495213150978, + "step": 1864 + }, + { + "epoch": 0.9325, + "grad_norm": 1.7001293897628784, + "grad_norm_var": 0.02799001185132912, + "learning_rate": 2e-05, + "loss": 0.4325, + "loss/crossentropy": 2.351699948310852, + "loss/hidden": 0.1748046875, + "loss/logits": 0.03303542733192444, + "loss/reg": 0.022469859570264816, + "step": 1865 + }, + { + "epoch": 0.933, + "grad_norm": 2.136624336242676, + "grad_norm_var": 0.04816114932450143, + "learning_rate": 2e-05, + "loss": 0.5564, + "loss/crossentropy": 2.115296185016632, + "loss/hidden": 0.2880859375, + "loss/logits": 0.043633848428726196, + "loss/reg": 0.022467387840151787, + "step": 1866 + }, + { + "epoch": 0.9335, + "grad_norm": 3.2611968517303467, + "grad_norm_var": 0.22143645197999617, + "learning_rate": 2e-05, + "loss": 0.5975, + "loss/crossentropy": 2.6474725008010864, + "loss/hidden": 0.297607421875, + "loss/logits": 0.07519873604178429, + "loss/reg": 0.022464843466877937, + "step": 1867 + }, + { + "epoch": 0.934, + "grad_norm": 1.7594748735427856, + "grad_norm_var": 0.2138510846890559, + "learning_rate": 2e-05, + "loss": 0.4304, + "loss/crossentropy": 2.440946102142334, + "loss/hidden": 0.1806640625, + "loss/logits": 0.025086318142712116, + "loss/reg": 0.02246221713721752, + "step": 1868 + }, + { + "epoch": 0.9345, + "grad_norm": 1.7738037109375, + "grad_norm_var": 0.2117281243319851, + "learning_rate": 2e-05, + "loss": 0.4517, + "loss/crossentropy": 2.58816659450531, + "loss/hidden": 0.18505859375, + "loss/logits": 0.04205223172903061, + "loss/reg": 0.022459525614976883, + "step": 1869 + }, + { + "epoch": 0.935, + "grad_norm": 1.5588525533676147, + "grad_norm_var": 0.21288492425881386, + "learning_rate": 2e-05, + "loss": 0.4193, + "loss/crossentropy": 2.6452767848968506, + "loss/hidden": 0.15966796875, + "loss/logits": 0.03501817770302296, + "loss/reg": 0.022456802427768707, + "step": 1870 + }, + { + "epoch": 0.9355, + "grad_norm": 1.714156150817871, + "grad_norm_var": 0.20660591312481713, + "learning_rate": 2e-05, + "loss": 0.3944, + "loss/crossentropy": 2.630972743034363, + "loss/hidden": 0.142578125, + "loss/logits": 0.02727540396153927, + "loss/reg": 0.022454047575592995, + "step": 1871 + }, + { + "epoch": 0.936, + "grad_norm": 1.4239534139633179, + "grad_norm_var": 0.2047895173630837, + "learning_rate": 2e-05, + "loss": 0.418, + "loss/crossentropy": 2.3476343154907227, + "loss/hidden": 0.16357421875, + "loss/logits": 0.029905791394412518, + "loss/reg": 0.022451288998126984, + "step": 1872 + }, + { + "epoch": 0.9365, + "grad_norm": 1.3508610725402832, + "grad_norm_var": 0.21406456048783054, + "learning_rate": 2e-05, + "loss": 0.4097, + "loss/crossentropy": 2.158454120159149, + "loss/hidden": 0.15869140625, + "loss/logits": 0.026525546796619892, + "loss/reg": 0.02244875766336918, + "step": 1873 + }, + { + "epoch": 0.937, + "grad_norm": 1.8326611518859863, + "grad_norm_var": 0.20765040453607733, + "learning_rate": 2e-05, + "loss": 0.4879, + "loss/crossentropy": 2.358833074569702, + "loss/hidden": 0.23291015625, + "loss/logits": 0.03050221409648657, + "loss/reg": 0.022446228191256523, + "step": 1874 + }, + { + "epoch": 0.9375, + "grad_norm": 4.5147013664245605, + "grad_norm_var": 0.6838218076838546, + "learning_rate": 2e-05, + "loss": 0.556, + "loss/crossentropy": 2.2411223649978638, + "loss/hidden": 0.28271484375, + "loss/logits": 0.048853909596800804, + "loss/reg": 0.022443652153015137, + "step": 1875 + }, + { + "epoch": 0.938, + "grad_norm": 2.6737983226776123, + "grad_norm_var": 0.6882806728767182, + "learning_rate": 2e-05, + "loss": 0.4451, + "loss/crossentropy": 2.5554691553115845, + "loss/hidden": 0.18310546875, + "loss/logits": 0.03760566934943199, + "loss/reg": 0.022440902888774872, + "step": 1876 + }, + { + "epoch": 0.9385, + "grad_norm": 1.9008327722549438, + "grad_norm_var": 0.6648208335261887, + "learning_rate": 2e-05, + "loss": 0.4281, + "loss/crossentropy": 2.85513436794281, + "loss/hidden": 0.17236328125, + "loss/logits": 0.03139635734260082, + "loss/reg": 0.022438300773501396, + "step": 1877 + }, + { + "epoch": 0.939, + "grad_norm": 1.3097538948059082, + "grad_norm_var": 0.6843957065276801, + "learning_rate": 2e-05, + "loss": 0.4314, + "loss/crossentropy": 2.171161413192749, + "loss/hidden": 0.17333984375, + "loss/logits": 0.03369998559355736, + "loss/reg": 0.022435514256358147, + "step": 1878 + }, + { + "epoch": 0.9395, + "grad_norm": 1.6398875713348389, + "grad_norm_var": 0.6927558067930797, + "learning_rate": 2e-05, + "loss": 0.4433, + "loss/crossentropy": 2.272761583328247, + "loss/hidden": 0.1845703125, + "loss/logits": 0.0343943927437067, + "loss/reg": 0.022432943806052208, + "step": 1879 + }, + { + "epoch": 0.94, + "grad_norm": 1.6524351835250854, + "grad_norm_var": 0.6918439217164187, + "learning_rate": 2e-05, + "loss": 0.4265, + "loss/crossentropy": 2.3826204538345337, + "loss/hidden": 0.169921875, + "loss/logits": 0.03229558374732733, + "loss/reg": 0.022430358454585075, + "step": 1880 + }, + { + "epoch": 0.9405, + "grad_norm": 1.5030336380004883, + "grad_norm_var": 0.7024858941629423, + "learning_rate": 2e-05, + "loss": 0.4619, + "loss/crossentropy": 2.4012389183044434, + "loss/hidden": 0.20458984375, + "loss/logits": 0.033006876707077026, + "loss/reg": 0.022427737712860107, + "step": 1881 + }, + { + "epoch": 0.941, + "grad_norm": 1.1325322389602661, + "grad_norm_var": 0.7472577601143559, + "learning_rate": 2e-05, + "loss": 0.3661, + "loss/crossentropy": 2.3976120948791504, + "loss/hidden": 0.123291015625, + "loss/logits": 0.018594788387417793, + "loss/reg": 0.022425329312682152, + "step": 1882 + }, + { + "epoch": 0.9415, + "grad_norm": 1.1853415966033936, + "grad_norm_var": 0.6502409271460311, + "learning_rate": 2e-05, + "loss": 0.4157, + "loss/crossentropy": 2.298361301422119, + "loss/hidden": 0.16015625, + "loss/logits": 0.031314633786678314, + "loss/reg": 0.022422639653086662, + "step": 1883 + }, + { + "epoch": 0.942, + "grad_norm": 1.0973137617111206, + "grad_norm_var": 0.6819181070580886, + "learning_rate": 2e-05, + "loss": 0.3617, + "loss/crossentropy": 2.4975024461746216, + "loss/hidden": 0.117919921875, + "loss/logits": 0.019550339318811893, + "loss/reg": 0.02241992950439453, + "step": 1884 + }, + { + "epoch": 0.9425, + "grad_norm": 1.7438507080078125, + "grad_norm_var": 0.6819449915123499, + "learning_rate": 2e-05, + "loss": 0.4634, + "loss/crossentropy": 2.204440951347351, + "loss/hidden": 0.20458984375, + "loss/logits": 0.03460996691137552, + "loss/reg": 0.022417448461055756, + "step": 1885 + }, + { + "epoch": 0.943, + "grad_norm": 1.2848305702209473, + "grad_norm_var": 0.6941560719689528, + "learning_rate": 2e-05, + "loss": 0.4238, + "loss/crossentropy": 2.4170058965682983, + "loss/hidden": 0.16796875, + "loss/logits": 0.03171874303370714, + "loss/reg": 0.022415155544877052, + "step": 1886 + }, + { + "epoch": 0.9435, + "grad_norm": 1.4587926864624023, + "grad_norm_var": 0.6993669145136687, + "learning_rate": 2e-05, + "loss": 0.4306, + "loss/crossentropy": 2.349593162536621, + "loss/hidden": 0.17431640625, + "loss/logits": 0.032118335366249084, + "loss/reg": 0.022412730380892754, + "step": 1887 + }, + { + "epoch": 0.944, + "grad_norm": 1.3421655893325806, + "grad_norm_var": 0.7031391966357072, + "learning_rate": 2e-05, + "loss": 0.4002, + "loss/crossentropy": 2.1012765169143677, + "loss/hidden": 0.14599609375, + "loss/logits": 0.030093910172581673, + "loss/reg": 0.02241034060716629, + "step": 1888 + }, + { + "epoch": 0.9445, + "grad_norm": 1.568566083908081, + "grad_norm_var": 0.6951998080418774, + "learning_rate": 2e-05, + "loss": 0.4535, + "loss/crossentropy": 2.2762022018432617, + "loss/hidden": 0.193359375, + "loss/logits": 0.03608548082411289, + "loss/reg": 0.022408101707696915, + "step": 1889 + }, + { + "epoch": 0.945, + "grad_norm": 1.2452459335327148, + "grad_norm_var": 0.7095108720725463, + "learning_rate": 2e-05, + "loss": 0.4212, + "loss/crossentropy": 2.4997655153274536, + "loss/hidden": 0.16552734375, + "loss/logits": 0.03162308409810066, + "loss/reg": 0.022405438125133514, + "step": 1890 + }, + { + "epoch": 0.9455, + "grad_norm": 1.5108202695846558, + "grad_norm_var": 0.14745889251719102, + "learning_rate": 2e-05, + "loss": 0.4281, + "loss/crossentropy": 2.2601327896118164, + "loss/hidden": 0.16796875, + "loss/logits": 0.03605945594608784, + "loss/reg": 0.02240295149385929, + "step": 1891 + }, + { + "epoch": 0.946, + "grad_norm": 1.6598831415176392, + "grad_norm_var": 0.05513170444355821, + "learning_rate": 2e-05, + "loss": 0.4369, + "loss/crossentropy": 2.2406809329986572, + "loss/hidden": 0.1787109375, + "loss/logits": 0.034201012924313545, + "loss/reg": 0.02240018919110298, + "step": 1892 + }, + { + "epoch": 0.9465, + "grad_norm": 1.3680381774902344, + "grad_norm_var": 0.041003415881171415, + "learning_rate": 2e-05, + "loss": 0.4007, + "loss/crossentropy": 2.4398785829544067, + "loss/hidden": 0.148193359375, + "loss/logits": 0.028537730686366558, + "loss/reg": 0.022397480905056, + "step": 1893 + }, + { + "epoch": 0.947, + "grad_norm": 1.0278970003128052, + "grad_norm_var": 0.05007064750664269, + "learning_rate": 2e-05, + "loss": 0.3886, + "loss/crossentropy": 2.148539900779724, + "loss/hidden": 0.14208984375, + "loss/logits": 0.022542059421539307, + "loss/reg": 0.022394755855202675, + "step": 1894 + }, + { + "epoch": 0.9475, + "grad_norm": 1.5086561441421509, + "grad_norm_var": 0.0469721299358883, + "learning_rate": 2e-05, + "loss": 0.4413, + "loss/crossentropy": 2.251150965690613, + "loss/hidden": 0.1845703125, + "loss/logits": 0.03278907388448715, + "loss/reg": 0.022392379119992256, + "step": 1895 + }, + { + "epoch": 0.948, + "grad_norm": 1.2127013206481934, + "grad_norm_var": 0.04385164563974554, + "learning_rate": 2e-05, + "loss": 0.368, + "loss/crossentropy": 2.422248601913452, + "loss/hidden": 0.123779296875, + "loss/logits": 0.020290100947022438, + "loss/reg": 0.02238963358104229, + "step": 1896 + }, + { + "epoch": 0.9485, + "grad_norm": 1.2324891090393066, + "grad_norm_var": 0.043468858091787806, + "learning_rate": 2e-05, + "loss": 0.3832, + "loss/crossentropy": 2.0850866436958313, + "loss/hidden": 0.136962890625, + "loss/logits": 0.02234545536339283, + "loss/reg": 0.022387119010090828, + "step": 1897 + }, + { + "epoch": 0.949, + "grad_norm": 1.4261040687561035, + "grad_norm_var": 0.040394134059281585, + "learning_rate": 2e-05, + "loss": 0.4292, + "loss/crossentropy": 2.431095004081726, + "loss/hidden": 0.171875, + "loss/logits": 0.033502984791994095, + "loss/reg": 0.022384393960237503, + "step": 1898 + }, + { + "epoch": 0.9495, + "grad_norm": 1.439329981803894, + "grad_norm_var": 0.038272658552281236, + "learning_rate": 2e-05, + "loss": 0.387, + "loss/crossentropy": 2.3343453407287598, + "loss/hidden": 0.13916015625, + "loss/logits": 0.02406618557870388, + "loss/reg": 0.02238152176141739, + "step": 1899 + }, + { + "epoch": 0.95, + "grad_norm": 1.0969916582107544, + "grad_norm_var": 0.03828493091074415, + "learning_rate": 2e-05, + "loss": 0.3786, + "loss/crossentropy": 2.563263773918152, + "loss/hidden": 0.1318359375, + "loss/logits": 0.02294111903756857, + "loss/reg": 0.022378597408533096, + "step": 1900 + }, + { + "epoch": 0.9505, + "grad_norm": 1.1716290712356567, + "grad_norm_var": 0.031210427928216926, + "learning_rate": 2e-05, + "loss": 0.4071, + "loss/crossentropy": 2.391364336013794, + "loss/hidden": 0.1572265625, + "loss/logits": 0.02612478658556938, + "loss/reg": 0.022375814616680145, + "step": 1901 + }, + { + "epoch": 0.951, + "grad_norm": 1.3930448293685913, + "grad_norm_var": 0.03104337690990106, + "learning_rate": 2e-05, + "loss": 0.3902, + "loss/crossentropy": 2.4488768577575684, + "loss/hidden": 0.1396484375, + "loss/logits": 0.02680811006575823, + "loss/reg": 0.022373300045728683, + "step": 1902 + }, + { + "epoch": 0.9515, + "grad_norm": 1.588849425315857, + "grad_norm_var": 0.03391953124871445, + "learning_rate": 2e-05, + "loss": 0.4344, + "loss/crossentropy": 2.354185700416565, + "loss/hidden": 0.1806640625, + "loss/logits": 0.03003675863146782, + "loss/reg": 0.02237078920006752, + "step": 1903 + }, + { + "epoch": 0.952, + "grad_norm": 1.7344504594802856, + "grad_norm_var": 0.0424987168581661, + "learning_rate": 2e-05, + "loss": 0.4191, + "loss/crossentropy": 2.253276824951172, + "loss/hidden": 0.16455078125, + "loss/logits": 0.030912759713828564, + "loss/reg": 0.02236793003976345, + "step": 1904 + }, + { + "epoch": 0.9525, + "grad_norm": 1.2497073411941528, + "grad_norm_var": 0.0411145507961009, + "learning_rate": 2e-05, + "loss": 0.3802, + "loss/crossentropy": 2.337361454963684, + "loss/hidden": 0.13623046875, + "loss/logits": 0.02029071655124426, + "loss/reg": 0.02236493118107319, + "step": 1905 + }, + { + "epoch": 0.953, + "grad_norm": 1.4437819719314575, + "grad_norm_var": 0.040365271308345045, + "learning_rate": 2e-05, + "loss": 0.3873, + "loss/crossentropy": 2.3124853372573853, + "loss/hidden": 0.1435546875, + "loss/logits": 0.020108817145228386, + "loss/reg": 0.02236202545464039, + "step": 1906 + }, + { + "epoch": 0.9535, + "grad_norm": 1.5539910793304443, + "grad_norm_var": 0.04124039089983468, + "learning_rate": 2e-05, + "loss": 0.3817, + "loss/crossentropy": 2.313749074935913, + "loss/hidden": 0.13671875, + "loss/logits": 0.02142718993127346, + "loss/reg": 0.02235933393239975, + "step": 1907 + }, + { + "epoch": 0.954, + "grad_norm": 1.723780632019043, + "grad_norm_var": 0.04386541517829012, + "learning_rate": 2e-05, + "loss": 0.4172, + "loss/crossentropy": 2.3829362392425537, + "loss/hidden": 0.16455078125, + "loss/logits": 0.02905107382684946, + "loss/reg": 0.022356610745191574, + "step": 1908 + }, + { + "epoch": 0.9545, + "grad_norm": 1.19853937625885, + "grad_norm_var": 0.04606052697454756, + "learning_rate": 2e-05, + "loss": 0.4345, + "loss/crossentropy": 2.2605234384536743, + "loss/hidden": 0.17626953125, + "loss/logits": 0.03473933879286051, + "loss/reg": 0.022353753447532654, + "step": 1909 + }, + { + "epoch": 0.955, + "grad_norm": 2.8696258068084717, + "grad_norm_var": 0.17279256562972117, + "learning_rate": 2e-05, + "loss": 0.5594, + "loss/crossentropy": 2.468735933303833, + "loss/hidden": 0.29638671875, + "loss/logits": 0.0395014937967062, + "loss/reg": 0.022350985556840897, + "step": 1910 + }, + { + "epoch": 0.9555, + "grad_norm": 1.2979068756103516, + "grad_norm_var": 0.17505073259704976, + "learning_rate": 2e-05, + "loss": 0.3883, + "loss/crossentropy": 2.4635722637176514, + "loss/hidden": 0.13720703125, + "loss/logits": 0.027581739239394665, + "loss/reg": 0.02234843373298645, + "step": 1911 + }, + { + "epoch": 0.956, + "grad_norm": 2.820190906524658, + "grad_norm_var": 0.2798921413237015, + "learning_rate": 2e-05, + "loss": 0.6459, + "loss/crossentropy": 2.131369948387146, + "loss/hidden": 0.365234375, + "loss/logits": 0.057183969765901566, + "loss/reg": 0.02234589122235775, + "step": 1912 + }, + { + "epoch": 0.9565, + "grad_norm": 1.3415361642837524, + "grad_norm_var": 0.2756186472645955, + "learning_rate": 2e-05, + "loss": 0.422, + "loss/crossentropy": 2.289743185043335, + "loss/hidden": 0.16650390625, + "loss/logits": 0.03201920446008444, + "loss/reg": 0.022343412041664124, + "step": 1913 + }, + { + "epoch": 0.957, + "grad_norm": 1.1303457021713257, + "grad_norm_var": 0.2873257056445263, + "learning_rate": 2e-05, + "loss": 0.3862, + "loss/crossentropy": 2.3576101064682007, + "loss/hidden": 0.14013671875, + "loss/logits": 0.02269960194826126, + "loss/reg": 0.022340916097164154, + "step": 1914 + }, + { + "epoch": 0.9575, + "grad_norm": 2.158590078353882, + "grad_norm_var": 0.3075251014181994, + "learning_rate": 2e-05, + "loss": 0.4402, + "loss/crossentropy": 2.332666039466858, + "loss/hidden": 0.1787109375, + "loss/logits": 0.038076866418123245, + "loss/reg": 0.022338369861245155, + "step": 1915 + }, + { + "epoch": 0.958, + "grad_norm": 7.00865364074707, + "grad_norm_var": 2.08675653148509, + "learning_rate": 2e-05, + "loss": 0.6937, + "loss/crossentropy": 2.646122097969055, + "loss/hidden": 0.392578125, + "loss/logits": 0.07780970819294453, + "loss/reg": 0.02233590930700302, + "step": 1916 + }, + { + "epoch": 0.9585, + "grad_norm": 1.3123027086257935, + "grad_norm_var": 2.0728257314385186, + "learning_rate": 2e-05, + "loss": 0.4072, + "loss/crossentropy": 2.2954723834991455, + "loss/hidden": 0.1572265625, + "loss/logits": 0.026589620858430862, + "loss/reg": 0.022333433851599693, + "step": 1917 + }, + { + "epoch": 0.959, + "grad_norm": 1.8116488456726074, + "grad_norm_var": 2.050510475959324, + "learning_rate": 2e-05, + "loss": 0.4173, + "loss/crossentropy": 2.6285065412521362, + "loss/hidden": 0.16064453125, + "loss/logits": 0.03334318473935127, + "loss/reg": 0.022331027314066887, + "step": 1918 + }, + { + "epoch": 0.9595, + "grad_norm": 1.3005446195602417, + "grad_norm_var": 2.072096328270597, + "learning_rate": 2e-05, + "loss": 0.3712, + "loss/crossentropy": 2.384632110595703, + "loss/hidden": 0.1279296875, + "loss/logits": 0.019956374540925026, + "loss/reg": 0.022328531369566917, + "step": 1919 + }, + { + "epoch": 0.96, + "grad_norm": 1.381271243095398, + "grad_norm_var": 2.0922664903830954, + "learning_rate": 2e-05, + "loss": 0.3898, + "loss/crossentropy": 2.2269625663757324, + "loss/hidden": 0.146484375, + "loss/logits": 0.020025085657835007, + "loss/reg": 0.022326109930872917, + "step": 1920 + }, + { + "epoch": 0.9605, + "grad_norm": 2.105039596557617, + "grad_norm_var": 2.055258347725847, + "learning_rate": 2e-05, + "loss": 0.4958, + "loss/crossentropy": 2.388508439064026, + "loss/hidden": 0.22509765625, + "loss/logits": 0.047498359344899654, + "loss/reg": 0.02232373133301735, + "step": 1921 + }, + { + "epoch": 0.961, + "grad_norm": 1.0844900608062744, + "grad_norm_var": 2.091343013520689, + "learning_rate": 2e-05, + "loss": 0.3874, + "loss/crossentropy": 2.326627492904663, + "loss/hidden": 0.1396484375, + "loss/logits": 0.02454256359487772, + "loss/reg": 0.022321194410324097, + "step": 1922 + }, + { + "epoch": 0.9615, + "grad_norm": 1.7085936069488525, + "grad_norm_var": 2.0835161560615814, + "learning_rate": 2e-05, + "loss": 0.4864, + "loss/crossentropy": 2.1795098781585693, + "loss/hidden": 0.22021484375, + "loss/logits": 0.04298657365143299, + "loss/reg": 0.022318590432405472, + "step": 1923 + }, + { + "epoch": 0.962, + "grad_norm": 1.0788378715515137, + "grad_norm_var": 2.1346259374470815, + "learning_rate": 2e-05, + "loss": 0.3828, + "loss/crossentropy": 2.402096390724182, + "loss/hidden": 0.13623046875, + "loss/logits": 0.023389977402985096, + "loss/reg": 0.022316064685583115, + "step": 1924 + }, + { + "epoch": 0.9625, + "grad_norm": 1.3279380798339844, + "grad_norm_var": 2.1222672863766183, + "learning_rate": 2e-05, + "loss": 0.4175, + "loss/crossentropy": 2.4705991744995117, + "loss/hidden": 0.1650390625, + "loss/logits": 0.02933623269200325, + "loss/reg": 0.02231350913643837, + "step": 1925 + }, + { + "epoch": 0.963, + "grad_norm": 1.8480850458145142, + "grad_norm_var": 2.066806634795002, + "learning_rate": 2e-05, + "loss": 0.5545, + "loss/crossentropy": 2.2896007299423218, + "loss/hidden": 0.2978515625, + "loss/logits": 0.03350013308227062, + "loss/reg": 0.022310776636004448, + "step": 1926 + }, + { + "epoch": 0.9635, + "grad_norm": 1.2676535844802856, + "grad_norm_var": 2.0693722058326345, + "learning_rate": 2e-05, + "loss": 0.4228, + "loss/crossentropy": 2.229649305343628, + "loss/hidden": 0.16796875, + "loss/logits": 0.03177023585885763, + "loss/reg": 0.022308047860860825, + "step": 1927 + }, + { + "epoch": 0.964, + "grad_norm": 1.5557312965393066, + "grad_norm_var": 2.017172302933795, + "learning_rate": 2e-05, + "loss": 0.443, + "loss/crossentropy": 2.5194848775863647, + "loss/hidden": 0.18408203125, + "loss/logits": 0.035887595266103745, + "loss/reg": 0.02230549044907093, + "step": 1928 + }, + { + "epoch": 0.9645, + "grad_norm": 1.9265832901000977, + "grad_norm_var": 1.9997728547949178, + "learning_rate": 2e-05, + "loss": 0.4194, + "loss/crossentropy": 2.3407788276672363, + "loss/hidden": 0.16455078125, + "loss/logits": 0.03186378628015518, + "loss/reg": 0.02230297587811947, + "step": 1929 + }, + { + "epoch": 0.965, + "grad_norm": 1.2221165895462036, + "grad_norm_var": 1.9911827201257373, + "learning_rate": 2e-05, + "loss": 0.3808, + "loss/crossentropy": 2.2330673933029175, + "loss/hidden": 0.133544921875, + "loss/logits": 0.02422002237290144, + "loss/reg": 0.02230045385658741, + "step": 1930 + }, + { + "epoch": 0.9655, + "grad_norm": 1.4464212656021118, + "grad_norm_var": 1.996535291902523, + "learning_rate": 2e-05, + "loss": 0.431, + "loss/crossentropy": 2.560730218887329, + "loss/hidden": 0.17333984375, + "loss/logits": 0.034654753282666206, + "loss/reg": 0.022297974675893784, + "step": 1931 + }, + { + "epoch": 0.966, + "grad_norm": 1.3387093544006348, + "grad_norm_var": 0.09578263410320928, + "learning_rate": 2e-05, + "loss": 0.4139, + "loss/crossentropy": 2.2281144857406616, + "loss/hidden": 0.1640625, + "loss/logits": 0.026879730634391308, + "loss/reg": 0.02229529432952404, + "step": 1932 + }, + { + "epoch": 0.9665, + "grad_norm": 1.6466373205184937, + "grad_norm_var": 0.09519305136430105, + "learning_rate": 2e-05, + "loss": 0.4525, + "loss/crossentropy": 2.416178584098816, + "loss/hidden": 0.19677734375, + "loss/logits": 0.03280434384942055, + "loss/reg": 0.022292664274573326, + "step": 1933 + }, + { + "epoch": 0.967, + "grad_norm": 1.9065243005752563, + "grad_norm_var": 0.09965824271180447, + "learning_rate": 2e-05, + "loss": 0.398, + "loss/crossentropy": 2.332263708114624, + "loss/hidden": 0.14990234375, + "loss/logits": 0.02516376320272684, + "loss/reg": 0.02228992059826851, + "step": 1934 + }, + { + "epoch": 0.9675, + "grad_norm": 1.438956618309021, + "grad_norm_var": 0.09700722244866876, + "learning_rate": 2e-05, + "loss": 0.4454, + "loss/crossentropy": 2.299056053161621, + "loss/hidden": 0.1787109375, + "loss/logits": 0.0438066478818655, + "loss/reg": 0.022287409752607346, + "step": 1935 + }, + { + "epoch": 0.968, + "grad_norm": 1.3283144235610962, + "grad_norm_var": 0.09814598179248814, + "learning_rate": 2e-05, + "loss": 0.3993, + "loss/crossentropy": 2.705706477165222, + "loss/hidden": 0.150390625, + "loss/logits": 0.026097907684743404, + "loss/reg": 0.022284839302301407, + "step": 1936 + }, + { + "epoch": 0.9685, + "grad_norm": 1.2671499252319336, + "grad_norm_var": 0.07604085535109846, + "learning_rate": 2e-05, + "loss": 0.4034, + "loss/crossentropy": 2.3038320541381836, + "loss/hidden": 0.15283203125, + "loss/logits": 0.027765167877078056, + "loss/reg": 0.022282104939222336, + "step": 1937 + }, + { + "epoch": 0.969, + "grad_norm": 1.7129496335983276, + "grad_norm_var": 0.06908875770655426, + "learning_rate": 2e-05, + "loss": 0.4102, + "loss/crossentropy": 2.409985303878784, + "loss/hidden": 0.16015625, + "loss/logits": 0.027242244221270084, + "loss/reg": 0.022279653698205948, + "step": 1938 + }, + { + "epoch": 0.9695, + "grad_norm": 2.031566619873047, + "grad_norm_var": 0.08453384690603698, + "learning_rate": 2e-05, + "loss": 0.4135, + "loss/crossentropy": 2.453916311264038, + "loss/hidden": 0.162109375, + "loss/logits": 0.028619682416319847, + "loss/reg": 0.022277243435382843, + "step": 1939 + }, + { + "epoch": 0.97, + "grad_norm": 1.2778832912445068, + "grad_norm_var": 0.07526176615922105, + "learning_rate": 2e-05, + "loss": 0.4069, + "loss/crossentropy": 2.225833773612976, + "loss/hidden": 0.15185546875, + "loss/logits": 0.032262424007058144, + "loss/reg": 0.02227473258972168, + "step": 1940 + }, + { + "epoch": 0.9705, + "grad_norm": 1.492366075515747, + "grad_norm_var": 0.07243497295631436, + "learning_rate": 2e-05, + "loss": 0.4036, + "loss/crossentropy": 2.5838898420333862, + "loss/hidden": 0.15087890625, + "loss/logits": 0.029996756464242935, + "loss/reg": 0.022272255271673203, + "step": 1941 + }, + { + "epoch": 0.971, + "grad_norm": 1.3968478441238403, + "grad_norm_var": 0.06687936652998203, + "learning_rate": 2e-05, + "loss": 0.4118, + "loss/crossentropy": 2.50420606136322, + "loss/hidden": 0.15673828125, + "loss/logits": 0.03238658234477043, + "loss/reg": 0.022269796580076218, + "step": 1942 + }, + { + "epoch": 0.9715, + "grad_norm": 1.295305848121643, + "grad_norm_var": 0.06601141679391885, + "learning_rate": 2e-05, + "loss": 0.4081, + "loss/crossentropy": 2.0319120287895203, + "loss/hidden": 0.1572265625, + "loss/logits": 0.028178725391626358, + "loss/reg": 0.02226731739938259, + "step": 1943 + }, + { + "epoch": 0.972, + "grad_norm": 2.280089855194092, + "grad_norm_var": 0.10247276685499802, + "learning_rate": 2e-05, + "loss": 0.4834, + "loss/crossentropy": 2.3557260036468506, + "loss/hidden": 0.22216796875, + "loss/logits": 0.03859390318393707, + "loss/reg": 0.022265000268816948, + "step": 1944 + }, + { + "epoch": 0.9725, + "grad_norm": 1.870656132698059, + "grad_norm_var": 0.0999572300988054, + "learning_rate": 2e-05, + "loss": 0.4063, + "loss/crossentropy": 2.2198551893234253, + "loss/hidden": 0.15283203125, + "loss/logits": 0.03083806298673153, + "loss/reg": 0.022262422367930412, + "step": 1945 + }, + { + "epoch": 0.973, + "grad_norm": 1.223396897315979, + "grad_norm_var": 0.09989973331883183, + "learning_rate": 2e-05, + "loss": 0.3957, + "loss/crossentropy": 2.442264437675476, + "loss/hidden": 0.148681640625, + "loss/logits": 0.024409527890384197, + "loss/reg": 0.02225991152226925, + "step": 1946 + }, + { + "epoch": 0.9735, + "grad_norm": 1.715518593788147, + "grad_norm_var": 0.10036436305615364, + "learning_rate": 2e-05, + "loss": 0.4104, + "loss/crossentropy": 2.298407196998596, + "loss/hidden": 0.1552734375, + "loss/logits": 0.03260168805718422, + "loss/reg": 0.0222572460770607, + "step": 1947 + }, + { + "epoch": 0.974, + "grad_norm": 1.156018853187561, + "grad_norm_var": 0.10824091454887531, + "learning_rate": 2e-05, + "loss": 0.3887, + "loss/crossentropy": 2.604636073112488, + "loss/hidden": 0.13916015625, + "loss/logits": 0.027005971409380436, + "loss/reg": 0.022254537791013718, + "step": 1948 + }, + { + "epoch": 0.9745, + "grad_norm": 1.4995312690734863, + "grad_norm_var": 0.10799240399380565, + "learning_rate": 2e-05, + "loss": 0.4008, + "loss/crossentropy": 2.598103880882263, + "loss/hidden": 0.14794921875, + "loss/logits": 0.030296839773654938, + "loss/reg": 0.022251838818192482, + "step": 1949 + }, + { + "epoch": 0.975, + "grad_norm": 1.2907731533050537, + "grad_norm_var": 0.10289614463530032, + "learning_rate": 2e-05, + "loss": 0.413, + "loss/crossentropy": 2.3919172286987305, + "loss/hidden": 0.15966796875, + "loss/logits": 0.03085498232394457, + "loss/reg": 0.022249221801757812, + "step": 1950 + }, + { + "epoch": 0.9755, + "grad_norm": 1.502394437789917, + "grad_norm_var": 0.10248473161154052, + "learning_rate": 2e-05, + "loss": 0.3982, + "loss/crossentropy": 2.357411503791809, + "loss/hidden": 0.1484375, + "loss/logits": 0.027324603870511055, + "loss/reg": 0.0222467128187418, + "step": 1951 + }, + { + "epoch": 0.976, + "grad_norm": 1.2249350547790527, + "grad_norm_var": 0.1058127524217482, + "learning_rate": 2e-05, + "loss": 0.3818, + "loss/crossentropy": 2.7032934427261353, + "loss/hidden": 0.13525390625, + "loss/logits": 0.024068184196949005, + "loss/reg": 0.022244160994887352, + "step": 1952 + }, + { + "epoch": 0.9765, + "grad_norm": 1.2931334972381592, + "grad_norm_var": 0.10499684489912173, + "learning_rate": 2e-05, + "loss": 0.4009, + "loss/crossentropy": 2.303010582923889, + "loss/hidden": 0.150390625, + "loss/logits": 0.028063518926501274, + "loss/reg": 0.02224154584109783, + "step": 1953 + }, + { + "epoch": 0.977, + "grad_norm": 1.1064225435256958, + "grad_norm_var": 0.11209890357808133, + "learning_rate": 2e-05, + "loss": 0.394, + "loss/crossentropy": 2.4800167083740234, + "loss/hidden": 0.14306640625, + "loss/logits": 0.028574367053806782, + "loss/reg": 0.022238755598664284, + "step": 1954 + }, + { + "epoch": 0.9775, + "grad_norm": 1.7888641357421875, + "grad_norm_var": 0.09788471441156942, + "learning_rate": 2e-05, + "loss": 0.4597, + "loss/crossentropy": 2.269771993160248, + "loss/hidden": 0.19873046875, + "loss/logits": 0.03857916593551636, + "loss/reg": 0.022236214950680733, + "step": 1955 + }, + { + "epoch": 0.978, + "grad_norm": 1.1377007961273193, + "grad_norm_var": 0.1025800961707351, + "learning_rate": 2e-05, + "loss": 0.3758, + "loss/crossentropy": 2.247413754463196, + "loss/hidden": 0.1298828125, + "loss/logits": 0.02357647381722927, + "loss/reg": 0.022233642637729645, + "step": 1956 + }, + { + "epoch": 0.9785, + "grad_norm": 1.4004080295562744, + "grad_norm_var": 0.10264583324120663, + "learning_rate": 2e-05, + "loss": 0.4035, + "loss/crossentropy": 2.3835976123809814, + "loss/hidden": 0.15380859375, + "loss/logits": 0.02738987375050783, + "loss/reg": 0.022231118753552437, + "step": 1957 + }, + { + "epoch": 0.979, + "grad_norm": 1.24699068069458, + "grad_norm_var": 0.10508895477803246, + "learning_rate": 2e-05, + "loss": 0.3889, + "loss/crossentropy": 2.438993811607361, + "loss/hidden": 0.13818359375, + "loss/logits": 0.028424736112356186, + "loss/reg": 0.022228769958019257, + "step": 1958 + }, + { + "epoch": 0.9795, + "grad_norm": 2.308201789855957, + "grad_norm_var": 0.14973633890307708, + "learning_rate": 2e-05, + "loss": 0.4118, + "loss/crossentropy": 2.373353362083435, + "loss/hidden": 0.15673828125, + "loss/logits": 0.032791512086987495, + "loss/reg": 0.022226233035326004, + "step": 1959 + }, + { + "epoch": 0.98, + "grad_norm": 1.3943239450454712, + "grad_norm_var": 0.1069748260107654, + "learning_rate": 2e-05, + "loss": 0.4152, + "loss/crossentropy": 2.4576724767684937, + "loss/hidden": 0.16259765625, + "loss/logits": 0.03036567196249962, + "loss/reg": 0.022223642095923424, + "step": 1960 + }, + { + "epoch": 0.9805, + "grad_norm": 1.372459053993225, + "grad_norm_var": 0.09437562854595664, + "learning_rate": 2e-05, + "loss": 0.4329, + "loss/crossentropy": 2.2118232250213623, + "loss/hidden": 0.18115234375, + "loss/logits": 0.029521776363253593, + "loss/reg": 0.02222101204097271, + "step": 1961 + }, + { + "epoch": 0.981, + "grad_norm": 1.6961666345596313, + "grad_norm_var": 0.09618417236027692, + "learning_rate": 2e-05, + "loss": 0.3974, + "loss/crossentropy": 2.430359721183777, + "loss/hidden": 0.138671875, + "loss/logits": 0.03654170501977205, + "loss/reg": 0.02221854217350483, + "step": 1962 + }, + { + "epoch": 0.9815, + "grad_norm": 1.9904968738555908, + "grad_norm_var": 0.11079650013560964, + "learning_rate": 2e-05, + "loss": 0.3833, + "loss/crossentropy": 2.349852681159973, + "loss/hidden": 0.137939453125, + "loss/logits": 0.023190665990114212, + "loss/reg": 0.02221612073481083, + "step": 1963 + }, + { + "epoch": 0.982, + "grad_norm": 1.5087324380874634, + "grad_norm_var": 0.10413266118218628, + "learning_rate": 2e-05, + "loss": 0.385, + "loss/crossentropy": 2.4656479358673096, + "loss/hidden": 0.138671875, + "loss/logits": 0.024239342659711838, + "loss/reg": 0.02221374586224556, + "step": 1964 + }, + { + "epoch": 0.9825, + "grad_norm": 3.662198781967163, + "grad_norm_var": 0.40061585609098616, + "learning_rate": 2e-05, + "loss": 0.4676, + "loss/crossentropy": 2.5416672229766846, + "loss/hidden": 0.16748046875, + "loss/logits": 0.07797094993293285, + "loss/reg": 0.022211195901036263, + "step": 1965 + }, + { + "epoch": 0.983, + "grad_norm": 3.0709733963012695, + "grad_norm_var": 0.5204777832696872, + "learning_rate": 2e-05, + "loss": 0.446, + "loss/crossentropy": 2.499261498451233, + "loss/hidden": 0.18896484375, + "loss/logits": 0.03491983376443386, + "loss/reg": 0.022208670154213905, + "step": 1966 + }, + { + "epoch": 0.9835, + "grad_norm": 1.299326777458191, + "grad_norm_var": 0.5292589340957948, + "learning_rate": 2e-05, + "loss": 0.4253, + "loss/crossentropy": 2.3157382011413574, + "loss/hidden": 0.1669921875, + "loss/logits": 0.03624746948480606, + "loss/reg": 0.02220613695681095, + "step": 1967 + }, + { + "epoch": 0.984, + "grad_norm": 2.600094795227051, + "grad_norm_var": 0.5568919038873178, + "learning_rate": 2e-05, + "loss": 0.559, + "loss/crossentropy": 2.232682466506958, + "loss/hidden": 0.2373046875, + "loss/logits": 0.09961535781621933, + "loss/reg": 0.022203726693987846, + "step": 1968 + }, + { + "epoch": 0.9845, + "grad_norm": 1.5083189010620117, + "grad_norm_var": 0.5451060779468074, + "learning_rate": 2e-05, + "loss": 0.4475, + "loss/crossentropy": 2.3565086126327515, + "loss/hidden": 0.1845703125, + "loss/logits": 0.04091835021972656, + "loss/reg": 0.022201379761099815, + "step": 1969 + }, + { + "epoch": 0.985, + "grad_norm": 1.5320773124694824, + "grad_norm_var": 0.5160320549007683, + "learning_rate": 2e-05, + "loss": 0.4333, + "loss/crossentropy": 2.222718358039856, + "loss/hidden": 0.16845703125, + "loss/logits": 0.04285791330039501, + "loss/reg": 0.022199101746082306, + "step": 1970 + }, + { + "epoch": 0.9855, + "grad_norm": 1.7056018114089966, + "grad_norm_var": 0.5170866940808054, + "learning_rate": 2e-05, + "loss": 0.4092, + "loss/crossentropy": 2.4271280765533447, + "loss/hidden": 0.15673828125, + "loss/logits": 0.03053828328847885, + "loss/reg": 0.02219672128558159, + "step": 1971 + }, + { + "epoch": 0.986, + "grad_norm": 1.1430257558822632, + "grad_norm_var": 0.5165901006666008, + "learning_rate": 2e-05, + "loss": 0.3962, + "loss/crossentropy": 2.436211943626404, + "loss/hidden": 0.14453125, + "loss/logits": 0.02975220326334238, + "loss/reg": 0.02219444327056408, + "step": 1972 + }, + { + "epoch": 0.9865, + "grad_norm": 1.38370943069458, + "grad_norm_var": 0.5175861871168814, + "learning_rate": 2e-05, + "loss": 0.4147, + "loss/crossentropy": 2.4028401374816895, + "loss/hidden": 0.1611328125, + "loss/logits": 0.03168147522956133, + "loss/reg": 0.022191938012838364, + "step": 1973 + }, + { + "epoch": 0.987, + "grad_norm": 1.449479579925537, + "grad_norm_var": 0.5041676177403838, + "learning_rate": 2e-05, + "loss": 0.4342, + "loss/crossentropy": 2.3942774534225464, + "loss/hidden": 0.1767578125, + "loss/logits": 0.03549867123365402, + "loss/reg": 0.022189509123563766, + "step": 1974 + }, + { + "epoch": 0.9875, + "grad_norm": 2.0965735912323, + "grad_norm_var": 0.49408207054312825, + "learning_rate": 2e-05, + "loss": 0.4188, + "loss/crossentropy": 2.363860249519348, + "loss/hidden": 0.16259765625, + "loss/logits": 0.034327320754528046, + "loss/reg": 0.02218729257583618, + "step": 1975 + }, + { + "epoch": 0.988, + "grad_norm": 1.2674319744110107, + "grad_norm_var": 0.5026008210188041, + "learning_rate": 2e-05, + "loss": 0.4196, + "loss/crossentropy": 2.3103872537612915, + "loss/hidden": 0.16455078125, + "loss/logits": 0.03319397568702698, + "loss/reg": 0.022184785455465317, + "step": 1976 + }, + { + "epoch": 0.9885, + "grad_norm": 1.3519755601882935, + "grad_norm_var": 0.5038777873620819, + "learning_rate": 2e-05, + "loss": 0.3873, + "loss/crossentropy": 2.3832825422286987, + "loss/hidden": 0.140625, + "loss/logits": 0.024883822537958622, + "loss/reg": 0.022182263433933258, + "step": 1977 + }, + { + "epoch": 0.989, + "grad_norm": 1.163076400756836, + "grad_norm_var": 0.5310906853740609, + "learning_rate": 2e-05, + "loss": 0.3687, + "loss/crossentropy": 2.3655554056167603, + "loss/hidden": 0.1279296875, + "loss/logits": 0.018932482227683067, + "loss/reg": 0.02217974327504635, + "step": 1978 + }, + { + "epoch": 0.9895, + "grad_norm": 1.2967028617858887, + "grad_norm_var": 0.543166161422513, + "learning_rate": 2e-05, + "loss": 0.4113, + "loss/crossentropy": 2.2330108880996704, + "loss/hidden": 0.16064453125, + "loss/logits": 0.02884120587259531, + "loss/reg": 0.02217736653983593, + "step": 1979 + }, + { + "epoch": 0.99, + "grad_norm": 0.9606475830078125, + "grad_norm_var": 0.5797518155803723, + "learning_rate": 2e-05, + "loss": 0.3794, + "loss/crossentropy": 2.309578061103821, + "loss/hidden": 0.13427734375, + "loss/logits": 0.023378074169158936, + "loss/reg": 0.022174881771206856, + "step": 1980 + }, + { + "epoch": 0.9905, + "grad_norm": 1.4933451414108276, + "grad_norm_var": 0.3115809486329993, + "learning_rate": 2e-05, + "loss": 0.4478, + "loss/crossentropy": 2.2966067790985107, + "loss/hidden": 0.18994140625, + "loss/logits": 0.036129954271018505, + "loss/reg": 0.022172508761286736, + "step": 1981 + }, + { + "epoch": 0.991, + "grad_norm": 1.1591241359710693, + "grad_norm_var": 0.16063496865446744, + "learning_rate": 2e-05, + "loss": 0.3848, + "loss/crossentropy": 2.3767281770706177, + "loss/hidden": 0.13916015625, + "loss/logits": 0.023951291106641293, + "loss/reg": 0.022169925272464752, + "step": 1982 + }, + { + "epoch": 0.9915, + "grad_norm": 1.2430766820907593, + "grad_norm_var": 0.16206145180208573, + "learning_rate": 2e-05, + "loss": 0.4026, + "loss/crossentropy": 2.3017172813415527, + "loss/hidden": 0.150390625, + "loss/logits": 0.03055698424577713, + "loss/reg": 0.02216746285557747, + "step": 1983 + }, + { + "epoch": 0.992, + "grad_norm": 1.0589817762374878, + "grad_norm_var": 0.07615843072529553, + "learning_rate": 2e-05, + "loss": 0.3757, + "loss/crossentropy": 2.41360604763031, + "loss/hidden": 0.13134765625, + "loss/logits": 0.022694111801683903, + "loss/reg": 0.02216503396630287, + "step": 1984 + }, + { + "epoch": 0.9925, + "grad_norm": 1.074315071105957, + "grad_norm_var": 0.07954031445189572, + "learning_rate": 2e-05, + "loss": 0.3816, + "loss/crossentropy": 2.434372305870056, + "loss/hidden": 0.1376953125, + "loss/logits": 0.02227596938610077, + "loss/reg": 0.02216257154941559, + "step": 1985 + }, + { + "epoch": 0.993, + "grad_norm": 1.1688265800476074, + "grad_norm_var": 0.07830008007563455, + "learning_rate": 2e-05, + "loss": 0.3919, + "loss/crossentropy": 2.4475741386413574, + "loss/hidden": 0.1416015625, + "loss/logits": 0.02868059929460287, + "loss/reg": 0.022160008549690247, + "step": 1986 + }, + { + "epoch": 0.9935, + "grad_norm": 1.343680739402771, + "grad_norm_var": 0.06756511802767377, + "learning_rate": 2e-05, + "loss": 0.3966, + "loss/crossentropy": 2.1644026041030884, + "loss/hidden": 0.15087890625, + "loss/logits": 0.02410024218261242, + "loss/reg": 0.022157687693834305, + "step": 1987 + }, + { + "epoch": 0.994, + "grad_norm": 2.975306272506714, + "grad_norm_var": 0.2412736036708892, + "learning_rate": 2e-05, + "loss": 0.4244, + "loss/crossentropy": 2.527232050895691, + "loss/hidden": 0.17431640625, + "loss/logits": 0.0285196453332901, + "loss/reg": 0.022155148908495903, + "step": 1988 + }, + { + "epoch": 0.9945, + "grad_norm": 1.4210851192474365, + "grad_norm_var": 0.24125286489004902, + "learning_rate": 2e-05, + "loss": 0.4013, + "loss/crossentropy": 2.453064799308777, + "loss/hidden": 0.15283203125, + "loss/logits": 0.026922681368887424, + "loss/reg": 0.02215270884335041, + "step": 1989 + }, + { + "epoch": 0.995, + "grad_norm": 1.1747286319732666, + "grad_norm_var": 0.24444132193735152, + "learning_rate": 2e-05, + "loss": 0.3815, + "loss/crossentropy": 2.421423316001892, + "loss/hidden": 0.138427734375, + "loss/logits": 0.021576720289885998, + "loss/reg": 0.02215024270117283, + "step": 1990 + }, + { + "epoch": 0.9955, + "grad_norm": 1.2867865562438965, + "grad_norm_var": 0.20919603916842147, + "learning_rate": 2e-05, + "loss": 0.3843, + "loss/crossentropy": 2.414217710494995, + "loss/hidden": 0.13623046875, + "loss/logits": 0.02661888301372528, + "loss/reg": 0.022147882729768753, + "step": 1991 + }, + { + "epoch": 0.996, + "grad_norm": 1.7244079113006592, + "grad_norm_var": 0.21782960949894387, + "learning_rate": 2e-05, + "loss": 0.3918, + "loss/crossentropy": 2.4352335929870605, + "loss/hidden": 0.14208984375, + "loss/logits": 0.028264615684747696, + "loss/reg": 0.022145364433526993, + "step": 1992 + }, + { + "epoch": 0.9965, + "grad_norm": 1.0717110633850098, + "grad_norm_var": 0.22335652296934896, + "learning_rate": 2e-05, + "loss": 0.3637, + "loss/crossentropy": 2.3417575359344482, + "loss/hidden": 0.122314453125, + "loss/logits": 0.019959733821451664, + "loss/reg": 0.022142987698316574, + "step": 1993 + }, + { + "epoch": 0.997, + "grad_norm": 0.9572432041168213, + "grad_norm_var": 0.23116159615423915, + "learning_rate": 2e-05, + "loss": 0.3697, + "loss/crossentropy": 2.3081470727920532, + "loss/hidden": 0.128662109375, + "loss/logits": 0.019598262384533882, + "loss/reg": 0.022140614688396454, + "step": 1994 + }, + { + "epoch": 0.9975, + "grad_norm": 1.2059389352798462, + "grad_norm_var": 0.23217773839135317, + "learning_rate": 2e-05, + "loss": 0.3804, + "loss/crossentropy": 2.416892886161804, + "loss/hidden": 0.138671875, + "loss/logits": 0.02030058763921261, + "loss/reg": 0.022138269618153572, + "step": 1995 + }, + { + "epoch": 0.998, + "grad_norm": 1.0577036142349243, + "grad_norm_var": 0.22795505383013293, + "learning_rate": 2e-05, + "loss": 0.3623, + "loss/crossentropy": 2.3178452253341675, + "loss/hidden": 0.12255859375, + "loss/logits": 0.018403733149170876, + "loss/reg": 0.022135984152555466, + "step": 1996 + }, + { + "epoch": 0.9985, + "grad_norm": 1.2266889810562134, + "grad_norm_var": 0.22689434089943936, + "learning_rate": 2e-05, + "loss": 0.3796, + "loss/crossentropy": 2.3155784606933594, + "loss/hidden": 0.1357421875, + "loss/logits": 0.02248302474617958, + "loss/reg": 0.022133611142635345, + "step": 1997 + }, + { + "epoch": 0.999, + "grad_norm": 1.0541908740997314, + "grad_norm_var": 0.2298592464456514, + "learning_rate": 2e-05, + "loss": 0.3634, + "loss/crossentropy": 2.4655479192733765, + "loss/hidden": 0.12255859375, + "loss/logits": 0.01956414245069027, + "loss/reg": 0.02213137224316597, + "step": 1998 + }, + { + "epoch": 0.9995, + "grad_norm": 1.7036995887756348, + "grad_norm_var": 0.2386848838311654, + "learning_rate": 2e-05, + "loss": 0.4205, + "loss/crossentropy": 2.35299813747406, + "loss/hidden": 0.17236328125, + "loss/logits": 0.026862223632633686, + "loss/reg": 0.02212887816131115, + "step": 1999 + }, + { + "epoch": 1.0, + "grad_norm": 1.3390984535217285, + "grad_norm_var": 0.23294083127609208, + "learning_rate": 2e-05, + "loss": 0.3795, + "loss/crossentropy": 2.3336535692214966, + "loss/hidden": 0.1357421875, + "loss/logits": 0.022448450326919556, + "loss/reg": 0.022126398980617523, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.57623446257664e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}