{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 2000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025, "grad_norm": 39.5, "learning_rate": 0.0001, "loss": 7.8298, "loss/crossentropy": 2.313796639442444, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.21518087349832057, "step": 10 }, { "epoch": 0.0005, "grad_norm": 31.5, "grad_norm_var": 5.698893229166667, "learning_rate": 0.0001, "loss": 7.8693, "loss/crossentropy": 2.1564369201660156, "loss/hidden": 3.587109375, "loss/jsd": 0.0, "loss/logits": 0.21401480734348297, "step": 20 }, { "epoch": 0.00075, "grad_norm": 36.0, "grad_norm_var": 6.930143229166666, "learning_rate": 0.0001, "loss": 7.8779, "loss/crossentropy": 2.179039953649044, "loss/hidden": 3.709375, "loss/jsd": 0.0, "loss/logits": 0.22207003347575666, "step": 30 }, { "epoch": 0.001, "grad_norm": 32.75, "grad_norm_var": 40.942708333333336, "learning_rate": 0.0001, "loss": 7.7653, "loss/crossentropy": 2.074952059984207, "loss/hidden": 3.55625, "loss/jsd": 0.0, "loss/logits": 0.20403100922703743, "step": 40 }, { "epoch": 0.00125, "grad_norm": 35.75, "grad_norm_var": 94.25729166666666, "learning_rate": 0.0001, "loss": 7.8641, "loss/crossentropy": 2.087546107172966, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.19412125833332539, "step": 50 }, { "epoch": 0.0015, "grad_norm": 30.25, "grad_norm_var": 110.89140625, "learning_rate": 0.0001, "loss": 7.8652, "loss/crossentropy": 2.2259810894727705, "loss/hidden": 3.528125, "loss/jsd": 0.0, "loss/logits": 0.21063638497143983, "step": 60 }, { "epoch": 0.00175, "grad_norm": 36.0, "grad_norm_var": 62.02805989583333, "learning_rate": 0.0001, "loss": 7.751, "loss/crossentropy": 2.164659637212753, "loss/hidden": 3.47109375, "loss/jsd": 0.0, "loss/logits": 0.19977533183991908, "step": 70 }, { "epoch": 0.002, "grad_norm": 34.25, "grad_norm_var": 8.6759765625, "learning_rate": 0.0001, "loss": 7.7596, "loss/crossentropy": 2.097026476264, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.20290330462157727, "step": 80 }, { "epoch": 0.00225, "grad_norm": 39.75, "grad_norm_var": 71.35598958333334, "learning_rate": 0.0001, "loss": 7.8106, "loss/crossentropy": 2.1291788890957832, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.19771635457873343, "step": 90 }, { "epoch": 0.0025, "grad_norm": 34.25, "grad_norm_var": 9.158072916666667, "learning_rate": 0.0001, "loss": 7.7473, "loss/crossentropy": 2.147798593342304, "loss/hidden": 3.558203125, "loss/jsd": 0.0, "loss/logits": 0.20517258979380132, "step": 100 }, { "epoch": 0.00275, "grad_norm": 31.625, "grad_norm_var": 9.737239583333333, "learning_rate": 0.0001, "loss": 7.7738, "loss/crossentropy": 2.1884776622056963, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.20624704901129007, "step": 110 }, { "epoch": 0.003, "grad_norm": 37.75, "grad_norm_var": 335.18170572916665, "learning_rate": 0.0001, "loss": 7.8546, "loss/crossentropy": 2.2224678859114646, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.22259013392031193, "step": 120 }, { "epoch": 0.00325, "grad_norm": 136.0, "grad_norm_var": 1014.1374348958333, "learning_rate": 0.0001, "loss": 7.7227, "loss/crossentropy": 2.135145714879036, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.21895913481712342, "step": 130 }, { "epoch": 0.0035, "grad_norm": 36.25, "grad_norm_var": 663.2072916666667, "learning_rate": 0.0001, "loss": 7.6794, "loss/crossentropy": 2.2155070066452027, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.18895817659795283, "step": 140 }, { "epoch": 0.00375, "grad_norm": 38.5, "grad_norm_var": 54.0384765625, "learning_rate": 0.0001, "loss": 7.7461, "loss/crossentropy": 2.1793935388326644, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.19897108823060988, "step": 150 }, { "epoch": 0.004, "grad_norm": 62.0, "grad_norm_var": 99.1447265625, "learning_rate": 0.0001, "loss": 7.7956, "loss/crossentropy": 2.194957372546196, "loss/hidden": 3.597265625, "loss/jsd": 0.0, "loss/logits": 0.22534323409199714, "step": 160 }, { "epoch": 0.00425, "grad_norm": 32.5, "grad_norm_var": 62.60390625, "learning_rate": 0.0001, "loss": 7.7866, "loss/crossentropy": 2.1939920127391814, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.2311840608716011, "step": 170 }, { "epoch": 0.0045, "grad_norm": 33.5, "grad_norm_var": 10.049934895833333, "learning_rate": 0.0001, "loss": 7.6691, "loss/crossentropy": 2.1646964073181154, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.2066217228770256, "step": 180 }, { "epoch": 0.00475, "grad_norm": 32.75, "grad_norm_var": 13.889322916666666, "learning_rate": 0.0001, "loss": 7.8529, "loss/crossentropy": 2.135753521323204, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.23793395943939685, "step": 190 }, { "epoch": 0.005, "grad_norm": 30.375, "grad_norm_var": 14.12265625, "learning_rate": 0.0001, "loss": 7.7357, "loss/crossentropy": 2.1783783614635466, "loss/hidden": 3.540234375, "loss/jsd": 0.0, "loss/logits": 0.2111268475651741, "step": 200 }, { "epoch": 0.00525, "grad_norm": 47.75, "grad_norm_var": 167.33014322916668, "learning_rate": 0.0001, "loss": 7.8414, "loss/crossentropy": 2.091558237373829, "loss/hidden": 3.49765625, "loss/jsd": 0.0, "loss/logits": 0.2065280582755804, "step": 210 }, { "epoch": 0.0055, "grad_norm": 30.625, "grad_norm_var": 185.38899739583334, "learning_rate": 0.0001, "loss": 7.7181, "loss/crossentropy": 2.1866263896226883, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.20596572011709213, "step": 220 }, { "epoch": 0.00575, "grad_norm": 30.125, "grad_norm_var": 45.847330729166664, "learning_rate": 0.0001, "loss": 7.6276, "loss/crossentropy": 2.1170753836631775, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.20233637914061547, "step": 230 }, { "epoch": 0.006, "grad_norm": 33.75, "grad_norm_var": 17.477083333333333, "learning_rate": 0.0001, "loss": 7.7487, "loss/crossentropy": 2.1388430804014207, "loss/hidden": 3.55703125, "loss/jsd": 0.0, "loss/logits": 0.20581382531672715, "step": 240 }, { "epoch": 0.00625, "grad_norm": 31.75, "grad_norm_var": 1.54765625, "learning_rate": 0.0001, "loss": 7.6568, "loss/crossentropy": 2.2856020241975785, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2351478708907962, "step": 250 }, { "epoch": 0.0065, "grad_norm": 28.375, "grad_norm_var": 28.54375, "learning_rate": 0.0001, "loss": 7.6993, "loss/crossentropy": 2.0653378486633303, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.19755732025951148, "step": 260 }, { "epoch": 0.00675, "grad_norm": 33.25, "grad_norm_var": 28.384375, "learning_rate": 0.0001, "loss": 7.7075, "loss/crossentropy": 2.1598333328962327, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19795978404581546, "step": 270 }, { "epoch": 0.007, "grad_norm": 32.5, "grad_norm_var": 20.862955729166668, "learning_rate": 0.0001, "loss": 7.6852, "loss/crossentropy": 2.138056221604347, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.1999417580664158, "step": 280 }, { "epoch": 0.00725, "grad_norm": 42.0, "grad_norm_var": 20.856705729166666, "learning_rate": 0.0001, "loss": 7.8621, "loss/crossentropy": 2.1779348880052565, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1947902340441942, "step": 290 }, { "epoch": 0.0075, "grad_norm": 31.75, "grad_norm_var": 8.949739583333333, "learning_rate": 0.0001, "loss": 7.6542, "loss/crossentropy": 2.174051034450531, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.2129627451300621, "step": 300 }, { "epoch": 0.00775, "grad_norm": 33.5, "grad_norm_var": 4.620247395833333, "learning_rate": 0.0001, "loss": 7.6721, "loss/crossentropy": 2.0598735958337784, "loss/hidden": 3.54921875, "loss/jsd": 0.0, "loss/logits": 0.20318429488688708, "step": 310 }, { "epoch": 0.008, "grad_norm": 35.5, "grad_norm_var": 2.059375, "learning_rate": 0.0001, "loss": 7.6655, "loss/crossentropy": 2.1254130959510804, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.19875272288918494, "step": 320 }, { "epoch": 0.00825, "grad_norm": 35.0, "grad_norm_var": 1.9639973958333334, "learning_rate": 0.0001, "loss": 7.7461, "loss/crossentropy": 2.1635933369398117, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.2012931451201439, "step": 330 }, { "epoch": 0.0085, "grad_norm": 33.75, "grad_norm_var": 2.255989583333333, "learning_rate": 0.0001, "loss": 7.6974, "loss/crossentropy": 2.214476653933525, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.1954287003725767, "step": 340 }, { "epoch": 0.00875, "grad_norm": 30.625, "grad_norm_var": 2.4942057291666666, "learning_rate": 0.0001, "loss": 7.6918, "loss/crossentropy": 2.216859245300293, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.20083636604249477, "step": 350 }, { "epoch": 0.009, "grad_norm": 31.625, "grad_norm_var": 1.7905598958333333, "learning_rate": 0.0001, "loss": 7.6896, "loss/crossentropy": 2.2161539107561112, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19574192687869071, "step": 360 }, { "epoch": 0.00925, "grad_norm": 29.75, "grad_norm_var": 7.141080729166666, "learning_rate": 0.0001, "loss": 7.8109, "loss/crossentropy": 2.153403599560261, "loss/hidden": 3.559765625, "loss/jsd": 0.0, "loss/logits": 0.21339697316288947, "step": 370 }, { "epoch": 0.0095, "grad_norm": 37.5, "grad_norm_var": 10.9244140625, "learning_rate": 0.0001, "loss": 7.7615, "loss/crossentropy": 2.253763607144356, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.2074073076248169, "step": 380 }, { "epoch": 0.00975, "grad_norm": 33.75, "grad_norm_var": 13.8400390625, "learning_rate": 0.0001, "loss": 7.7209, "loss/crossentropy": 2.1363648414611816, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.20775138661265374, "step": 390 }, { "epoch": 0.01, "grad_norm": 31.5, "grad_norm_var": 14.397330729166667, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.1789979085326197, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.20811444334685802, "step": 400 }, { "epoch": 0.01025, "grad_norm": 33.0, "grad_norm_var": 9.6837890625, "learning_rate": 0.0001, "loss": 7.7272, "loss/crossentropy": 2.232848098874092, "loss/hidden": 3.5328125, "loss/jsd": 0.0, "loss/logits": 0.21815686002373696, "step": 410 }, { "epoch": 0.0105, "grad_norm": 30.0, "grad_norm_var": 73.00201822916667, "learning_rate": 0.0001, "loss": 7.7767, "loss/crossentropy": 2.064501041173935, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.21888567861169578, "step": 420 }, { "epoch": 0.01075, "grad_norm": 29.625, "grad_norm_var": 73.21015625, "learning_rate": 0.0001, "loss": 7.7084, "loss/crossentropy": 2.1248373448848725, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.20307110175490378, "step": 430 }, { "epoch": 0.011, "grad_norm": 31.875, "grad_norm_var": 8.3259765625, "learning_rate": 0.0001, "loss": 7.6488, "loss/crossentropy": 2.1684874832630157, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.18734447471797466, "step": 440 }, { "epoch": 0.01125, "grad_norm": 40.75, "grad_norm_var": 27.13125, "learning_rate": 0.0001, "loss": 7.831, "loss/crossentropy": 2.1310232520103454, "loss/hidden": 3.631640625, "loss/jsd": 0.0, "loss/logits": 0.20724854618310928, "step": 450 }, { "epoch": 0.0115, "grad_norm": 36.0, "grad_norm_var": 27.680208333333333, "learning_rate": 0.0001, "loss": 7.7446, "loss/crossentropy": 2.134530597925186, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.21913636103272438, "step": 460 }, { "epoch": 0.01175, "grad_norm": 33.0, "grad_norm_var": 7.48125, "learning_rate": 0.0001, "loss": 7.6288, "loss/crossentropy": 2.2641385555267335, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.1849798161536455, "step": 470 }, { "epoch": 0.012, "grad_norm": 31.75, "grad_norm_var": 11.4837890625, "learning_rate": 0.0001, "loss": 7.6493, "loss/crossentropy": 2.2282994374632836, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.2014446135610342, "step": 480 }, { "epoch": 0.01225, "grad_norm": 37.25, "grad_norm_var": 6.677083333333333, "learning_rate": 0.0001, "loss": 7.7612, "loss/crossentropy": 2.1222758114337923, "loss/hidden": 3.5625, "loss/jsd": 0.0, "loss/logits": 0.2013396628201008, "step": 490 }, { "epoch": 0.0125, "grad_norm": 32.5, "grad_norm_var": 4.7744140625, "learning_rate": 0.0001, "loss": 7.6421, "loss/crossentropy": 2.069608175754547, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.1966784244403243, "step": 500 }, { "epoch": 0.01275, "grad_norm": 41.5, "grad_norm_var": 10.3697265625, "learning_rate": 0.0001, "loss": 7.6818, "loss/crossentropy": 2.1589883297681807, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.18451723456382751, "step": 510 }, { "epoch": 0.013, "grad_norm": 31.125, "grad_norm_var": 6.667122395833333, "learning_rate": 0.0001, "loss": 7.7136, "loss/crossentropy": 2.149793979898095, "loss/hidden": 3.598828125, "loss/jsd": 0.0, "loss/logits": 0.2093389181420207, "step": 520 }, { "epoch": 0.01325, "grad_norm": 35.25, "grad_norm_var": 20.768489583333334, "learning_rate": 0.0001, "loss": 7.7677, "loss/crossentropy": 2.195904017984867, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.19832278694957495, "step": 530 }, { "epoch": 0.0135, "grad_norm": 35.5, "grad_norm_var": 17.8619140625, "learning_rate": 0.0001, "loss": 7.6679, "loss/crossentropy": 2.1850160747766494, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.20512696355581284, "step": 540 }, { "epoch": 0.01375, "grad_norm": 35.25, "grad_norm_var": 25.736393229166666, "learning_rate": 0.0001, "loss": 7.6426, "loss/crossentropy": 2.1822438329458236, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.20032773297280074, "step": 550 }, { "epoch": 0.014, "grad_norm": 35.25, "grad_norm_var": 30.77890625, "learning_rate": 0.0001, "loss": 7.7347, "loss/crossentropy": 2.1990185409784315, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.19729668814688922, "step": 560 }, { "epoch": 0.01425, "grad_norm": 41.25, "grad_norm_var": 46.00149739583333, "learning_rate": 0.0001, "loss": 7.6662, "loss/crossentropy": 2.0567948162555694, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.18916799686849117, "step": 570 }, { "epoch": 0.0145, "grad_norm": 31.875, "grad_norm_var": 18.383072916666666, "learning_rate": 0.0001, "loss": 7.662, "loss/crossentropy": 2.1589747786521913, "loss/hidden": 3.6359375, "loss/jsd": 0.0, "loss/logits": 0.21064655482769012, "step": 580 }, { "epoch": 0.01475, "grad_norm": 34.0, "grad_norm_var": 7.5025390625, "learning_rate": 0.0001, "loss": 7.6739, "loss/crossentropy": 2.053135275095701, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.20297051095403731, "step": 590 }, { "epoch": 0.015, "grad_norm": 42.0, "grad_norm_var": 58.718684895833334, "learning_rate": 0.0001, "loss": 7.6386, "loss/crossentropy": 2.0670476451516153, "loss/hidden": 3.590234375, "loss/jsd": 0.0, "loss/logits": 0.208550613373518, "step": 600 }, { "epoch": 0.01525, "grad_norm": 29.75, "grad_norm_var": 57.89993489583333, "learning_rate": 0.0001, "loss": 7.6461, "loss/crossentropy": 2.1219205021858216, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.20205040834844112, "step": 610 }, { "epoch": 0.0155, "grad_norm": 29.25, "grad_norm_var": 10.6228515625, "learning_rate": 0.0001, "loss": 7.5641, "loss/crossentropy": 2.114127852022648, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19217339344322681, "step": 620 }, { "epoch": 0.01575, "grad_norm": 30.375, "grad_norm_var": 14.542643229166666, "learning_rate": 0.0001, "loss": 7.6319, "loss/crossentropy": 2.2160026699304582, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.19281109217554332, "step": 630 }, { "epoch": 0.016, "grad_norm": 34.0, "grad_norm_var": 8.1869140625, "learning_rate": 0.0001, "loss": 7.6546, "loss/crossentropy": 2.1914512276649476, "loss/hidden": 3.47109375, "loss/jsd": 0.0, "loss/logits": 0.2007790008559823, "step": 640 }, { "epoch": 0.01625, "grad_norm": 30.375, "grad_norm_var": 5.4587890625, "learning_rate": 0.0001, "loss": 7.573, "loss/crossentropy": 2.0619212985038757, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19594881720840931, "step": 650 }, { "epoch": 0.0165, "grad_norm": 34.75, "grad_norm_var": 6.121809895833334, "learning_rate": 0.0001, "loss": 7.7448, "loss/crossentropy": 2.1046764492988586, "loss/hidden": 3.5921875, "loss/jsd": 0.0, "loss/logits": 0.20083924774080514, "step": 660 }, { "epoch": 0.01675, "grad_norm": 34.5, "grad_norm_var": 5.715559895833334, "learning_rate": 0.0001, "loss": 7.8327, "loss/crossentropy": 2.2835423797369003, "loss/hidden": 3.604296875, "loss/jsd": 0.0, "loss/logits": 0.23483402598649264, "step": 670 }, { "epoch": 0.017, "grad_norm": 31.875, "grad_norm_var": 10.7650390625, "learning_rate": 0.0001, "loss": 7.8138, "loss/crossentropy": 2.0907129019498827, "loss/hidden": 3.518359375, "loss/jsd": 0.0, "loss/logits": 0.1924523524940014, "step": 680 }, { "epoch": 0.01725, "grad_norm": 33.0, "grad_norm_var": 1.42265625, "learning_rate": 0.0001, "loss": 7.6162, "loss/crossentropy": 2.127697338163853, "loss/hidden": 3.50859375, "loss/jsd": 0.0, "loss/logits": 0.2057236723601818, "step": 690 }, { "epoch": 0.0175, "grad_norm": 31.0, "grad_norm_var": 8.9822265625, "learning_rate": 0.0001, "loss": 7.727, "loss/crossentropy": 2.088846719264984, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.21837750263512135, "step": 700 }, { "epoch": 0.01775, "grad_norm": 31.125, "grad_norm_var": 1.7385416666666667, "learning_rate": 0.0001, "loss": 7.504, "loss/crossentropy": 2.1813240855932237, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18769481666386129, "step": 710 }, { "epoch": 0.018, "grad_norm": 34.0, "grad_norm_var": 2.569205729166667, "learning_rate": 0.0001, "loss": 7.6713, "loss/crossentropy": 2.127785587310791, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.20503429286181926, "step": 720 }, { "epoch": 0.01825, "grad_norm": 29.875, "grad_norm_var": 12.484375, "learning_rate": 0.0001, "loss": 7.6481, "loss/crossentropy": 2.171098938584328, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.20801318623125553, "step": 730 }, { "epoch": 0.0185, "grad_norm": 37.25, "grad_norm_var": 14.9978515625, "learning_rate": 0.0001, "loss": 7.6067, "loss/crossentropy": 2.1487890854477882, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18985433727502823, "step": 740 }, { "epoch": 0.01875, "grad_norm": 32.0, "grad_norm_var": 7.299739583333333, "learning_rate": 0.0001, "loss": 7.4877, "loss/crossentropy": 2.2428383469581603, "loss/hidden": 3.24375, "loss/jsd": 0.0, "loss/logits": 0.18733534589409828, "step": 750 }, { "epoch": 0.019, "grad_norm": 30.0, "grad_norm_var": 3.379622395833333, "learning_rate": 0.0001, "loss": 7.6509, "loss/crossentropy": 2.1872796684503557, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.21135813258588315, "step": 760 }, { "epoch": 0.01925, "grad_norm": 30.5, "grad_norm_var": 46.3072265625, "learning_rate": 0.0001, "loss": 7.7384, "loss/crossentropy": 2.2427982538938522, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.1998496226966381, "step": 770 }, { "epoch": 0.0195, "grad_norm": 41.5, "grad_norm_var": 45.49264322916667, "learning_rate": 0.0001, "loss": 7.6688, "loss/crossentropy": 2.207463192939758, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.18665656447410583, "step": 780 }, { "epoch": 0.01975, "grad_norm": 31.625, "grad_norm_var": 53.53951822916667, "learning_rate": 0.0001, "loss": 7.6327, "loss/crossentropy": 2.075051838159561, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19726874344050885, "step": 790 }, { "epoch": 0.02, "grad_norm": 35.25, "grad_norm_var": 15.802083333333334, "learning_rate": 0.0001, "loss": 7.6833, "loss/crossentropy": 2.0811705768108366, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.19689124524593354, "step": 800 }, { "epoch": 0.02025, "grad_norm": 36.25, "grad_norm_var": 2.7916015625, "learning_rate": 0.0001, "loss": 7.6998, "loss/crossentropy": 2.139931133389473, "loss/hidden": 3.56640625, "loss/jsd": 0.0, "loss/logits": 0.20113225914537908, "step": 810 }, { "epoch": 0.0205, "grad_norm": 33.25, "grad_norm_var": 3.21640625, "learning_rate": 0.0001, "loss": 7.6125, "loss/crossentropy": 2.3070268869400024, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19928287118673324, "step": 820 }, { "epoch": 0.02075, "grad_norm": 31.375, "grad_norm_var": 4.70390625, "learning_rate": 0.0001, "loss": 7.7169, "loss/crossentropy": 2.1359834372997284, "loss/hidden": 3.6421875, "loss/jsd": 0.0, "loss/logits": 0.22523897737264634, "step": 830 }, { "epoch": 0.021, "grad_norm": 33.75, "grad_norm_var": 7.06015625, "learning_rate": 0.0001, "loss": 7.6629, "loss/crossentropy": 2.1498879536986353, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.21073536314070224, "step": 840 }, { "epoch": 0.02125, "grad_norm": 31.125, "grad_norm_var": 11.855143229166666, "learning_rate": 0.0001, "loss": 7.7246, "loss/crossentropy": 2.154731386899948, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.18697260301560165, "step": 850 }, { "epoch": 0.0215, "grad_norm": 28.0, "grad_norm_var": 3.8988932291666667, "learning_rate": 0.0001, "loss": 7.4616, "loss/crossentropy": 2.209418597817421, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.19536950960755348, "step": 860 }, { "epoch": 0.02175, "grad_norm": 36.25, "grad_norm_var": 28.367708333333333, "learning_rate": 0.0001, "loss": 7.6221, "loss/crossentropy": 2.106307029724121, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20511649739928545, "step": 870 }, { "epoch": 0.022, "grad_norm": 30.875, "grad_norm_var": 25.5978515625, "learning_rate": 0.0001, "loss": 7.57, "loss/crossentropy": 2.170385852456093, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.18977888114750385, "step": 880 }, { "epoch": 0.02225, "grad_norm": 31.0, "grad_norm_var": 4.178580729166667, "learning_rate": 0.0001, "loss": 7.659, "loss/crossentropy": 2.022993338108063, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.2017082829028368, "step": 890 }, { "epoch": 0.0225, "grad_norm": 30.25, "grad_norm_var": 4.118684895833334, "learning_rate": 0.0001, "loss": 7.6471, "loss/crossentropy": 2.1982390731573105, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.186830697581172, "step": 900 }, { "epoch": 0.02275, "grad_norm": 36.0, "grad_norm_var": 9.886393229166666, "learning_rate": 0.0001, "loss": 7.5929, "loss/crossentropy": 2.1351534157991408, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.19507032372057437, "step": 910 }, { "epoch": 0.023, "grad_norm": 30.25, "grad_norm_var": 69.10182291666666, "learning_rate": 0.0001, "loss": 7.7006, "loss/crossentropy": 2.1805424720048903, "loss/hidden": 3.60546875, "loss/jsd": 0.0, "loss/logits": 0.2387762701138854, "step": 920 }, { "epoch": 0.02325, "grad_norm": 29.625, "grad_norm_var": 148.09765625, "learning_rate": 0.0001, "loss": 7.603, "loss/crossentropy": 2.1993222564458845, "loss/hidden": 3.5421875, "loss/jsd": 0.0, "loss/logits": 0.2252051206305623, "step": 930 }, { "epoch": 0.0235, "grad_norm": 33.0, "grad_norm_var": 149.05670572916668, "learning_rate": 0.0001, "loss": 7.6101, "loss/crossentropy": 2.132229286432266, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.19997731409966946, "step": 940 }, { "epoch": 0.02375, "grad_norm": 33.25, "grad_norm_var": 2.372330729166667, "learning_rate": 0.0001, "loss": 7.6309, "loss/crossentropy": 2.057620918750763, "loss/hidden": 3.525, "loss/jsd": 0.0, "loss/logits": 0.19510896243155001, "step": 950 }, { "epoch": 0.024, "grad_norm": 30.125, "grad_norm_var": 2.9291666666666667, "learning_rate": 0.0001, "loss": 7.5599, "loss/crossentropy": 2.1666407614946364, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19047823324799537, "step": 960 }, { "epoch": 0.02425, "grad_norm": 34.0, "grad_norm_var": 750.4874348958333, "learning_rate": 0.0001, "loss": 7.667, "loss/crossentropy": 2.199223425984383, "loss/hidden": 3.561328125, "loss/jsd": 0.0, "loss/logits": 0.22954254262149335, "step": 970 }, { "epoch": 0.0245, "grad_norm": 31.0, "grad_norm_var": 736.4518229166666, "learning_rate": 0.0001, "loss": 7.6648, "loss/crossentropy": 2.052691954374313, "loss/hidden": 3.612109375, "loss/jsd": 0.0, "loss/logits": 0.20743414014577866, "step": 980 }, { "epoch": 0.02475, "grad_norm": 29.625, "grad_norm_var": 13.204166666666667, "learning_rate": 0.0001, "loss": 7.587, "loss/crossentropy": 2.2363356560468675, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.19663754627108573, "step": 990 }, { "epoch": 0.025, "grad_norm": 29.5, "grad_norm_var": 8.623958333333333, "learning_rate": 0.0001, "loss": 7.5999, "loss/crossentropy": 2.096450260281563, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.1896633107215166, "step": 1000 }, { "epoch": 0.02525, "grad_norm": 30.0, "grad_norm_var": 26.456184895833335, "learning_rate": 0.0001, "loss": 7.6469, "loss/crossentropy": 2.219489449262619, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.20298976600170135, "step": 1010 }, { "epoch": 0.0255, "grad_norm": 32.75, "grad_norm_var": 22.0525390625, "learning_rate": 0.0001, "loss": 7.6634, "loss/crossentropy": 2.157027468085289, "loss/hidden": 3.641015625, "loss/jsd": 0.0, "loss/logits": 0.21785753238946198, "step": 1020 }, { "epoch": 0.02575, "grad_norm": 33.0, "grad_norm_var": 31.347330729166668, "learning_rate": 0.0001, "loss": 7.6449, "loss/crossentropy": 2.1728423804044725, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.19077841471880674, "step": 1030 }, { "epoch": 0.026, "grad_norm": 74.5, "grad_norm_var": 122.20045572916666, "learning_rate": 0.0001, "loss": 7.6132, "loss/crossentropy": 2.1822386175394057, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.1930880568921566, "step": 1040 }, { "epoch": 0.02625, "grad_norm": 67.5, "grad_norm_var": 178.43125, "learning_rate": 0.0001, "loss": 7.6429, "loss/crossentropy": 2.2600297421216964, "loss/hidden": 3.458203125, "loss/jsd": 0.0, "loss/logits": 0.20767511576414108, "step": 1050 }, { "epoch": 0.0265, "grad_norm": 31.875, "grad_norm_var": 92.06087239583333, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.118990848958492, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19327420592308045, "step": 1060 }, { "epoch": 0.02675, "grad_norm": 30.875, "grad_norm_var": 35.283268229166666, "learning_rate": 0.0001, "loss": 7.603, "loss/crossentropy": 2.2320737928152083, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18573360554873944, "step": 1070 }, { "epoch": 0.027, "grad_norm": 35.25, "grad_norm_var": 3795.8796223958334, "learning_rate": 0.0001, "loss": 7.6632, "loss/crossentropy": 2.1329027831554415, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.25382886435836555, "step": 1080 }, { "epoch": 0.02725, "grad_norm": 41.0, "grad_norm_var": 3810.5869140625, "learning_rate": 0.0001, "loss": 7.591, "loss/crossentropy": 2.147709222137928, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19441522471606731, "step": 1090 }, { "epoch": 0.0275, "grad_norm": 34.0, "grad_norm_var": 10.347916666666666, "learning_rate": 0.0001, "loss": 7.459, "loss/crossentropy": 2.134738603234291, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.18917258959263564, "step": 1100 }, { "epoch": 0.02775, "grad_norm": 30.5, "grad_norm_var": 5.362239583333333, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.072269695997238, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.19563193432986736, "step": 1110 }, { "epoch": 0.028, "grad_norm": 29.375, "grad_norm_var": 14.4478515625, "learning_rate": 0.0001, "loss": 7.514, "loss/crossentropy": 2.131034165620804, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.18194433208554983, "step": 1120 }, { "epoch": 0.02825, "grad_norm": 33.0, "grad_norm_var": 23.925, "learning_rate": 0.0001, "loss": 7.5884, "loss/crossentropy": 2.023801653087139, "loss/hidden": 3.637109375, "loss/jsd": 0.0, "loss/logits": 0.20569879673421382, "step": 1130 }, { "epoch": 0.0285, "grad_norm": 32.0, "grad_norm_var": 7.2744140625, "learning_rate": 0.0001, "loss": 7.6524, "loss/crossentropy": 2.1517456393688916, "loss/hidden": 3.537890625, "loss/jsd": 0.0, "loss/logits": 0.2079196309670806, "step": 1140 }, { "epoch": 0.02875, "grad_norm": 35.75, "grad_norm_var": 7.99140625, "learning_rate": 0.0001, "loss": 7.6074, "loss/crossentropy": 2.004653300344944, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.19787274841219188, "step": 1150 }, { "epoch": 0.029, "grad_norm": 32.75, "grad_norm_var": 4.120572916666666, "learning_rate": 0.0001, "loss": 7.6348, "loss/crossentropy": 2.1528601229190825, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.1927174234762788, "step": 1160 }, { "epoch": 0.02925, "grad_norm": 36.25, "grad_norm_var": 7.069205729166667, "learning_rate": 0.0001, "loss": 7.6205, "loss/crossentropy": 2.05783154964447, "loss/hidden": 3.614453125, "loss/jsd": 0.0, "loss/logits": 0.22374887801706791, "step": 1170 }, { "epoch": 0.0295, "grad_norm": 31.0, "grad_norm_var": 41.484309895833334, "learning_rate": 0.0001, "loss": 7.7356, "loss/crossentropy": 2.126041141152382, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2197611417621374, "step": 1180 }, { "epoch": 0.02975, "grad_norm": 38.5, "grad_norm_var": 43.91223958333333, "learning_rate": 0.0001, "loss": 7.6157, "loss/crossentropy": 2.2476495057344437, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19447711408138274, "step": 1190 }, { "epoch": 0.03, "grad_norm": 47.5, "grad_norm_var": 95.72057291666667, "learning_rate": 0.0001, "loss": 7.7237, "loss/crossentropy": 2.092088536918163, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.19047515615820884, "step": 1200 }, { "epoch": 0.03025, "grad_norm": 31.5, "grad_norm_var": 99.6822265625, "learning_rate": 0.0001, "loss": 7.5678, "loss/crossentropy": 2.1249007523059844, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.19545839354395866, "step": 1210 }, { "epoch": 0.0305, "grad_norm": 30.625, "grad_norm_var": 6.409309895833333, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 2.2506365835666657, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.19677093252539635, "step": 1220 }, { "epoch": 0.03075, "grad_norm": 33.25, "grad_norm_var": 6.676822916666667, "learning_rate": 0.0001, "loss": 7.5504, "loss/crossentropy": 2.133887434005737, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.1896925836801529, "step": 1230 }, { "epoch": 0.031, "grad_norm": 38.0, "grad_norm_var": 5.2337890625, "learning_rate": 0.0001, "loss": 7.5956, "loss/crossentropy": 2.0669597774744033, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.19292720556259155, "step": 1240 }, { "epoch": 0.03125, "grad_norm": 30.625, "grad_norm_var": 3.97265625, "learning_rate": 0.0001, "loss": 7.5722, "loss/crossentropy": 2.261713761091232, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.19570228308439255, "step": 1250 }, { "epoch": 0.0315, "grad_norm": 29.25, "grad_norm_var": 4.916080729166667, "learning_rate": 0.0001, "loss": 7.5566, "loss/crossentropy": 2.0476513862609864, "loss/hidden": 3.668359375, "loss/jsd": 0.0, "loss/logits": 0.2093046260997653, "step": 1260 }, { "epoch": 0.03175, "grad_norm": 34.75, "grad_norm_var": 10.875455729166667, "learning_rate": 0.0001, "loss": 7.6172, "loss/crossentropy": 2.1128817319869997, "loss/hidden": 3.5703125, "loss/jsd": 0.0, "loss/logits": 0.21670667603611946, "step": 1270 }, { "epoch": 0.032, "grad_norm": 33.5, "grad_norm_var": 3.824934895833333, "learning_rate": 0.0001, "loss": 7.6787, "loss/crossentropy": 2.2115501552820205, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.1827129926532507, "step": 1280 }, { "epoch": 0.03225, "grad_norm": 30.375, "grad_norm_var": 13.828125, "learning_rate": 0.0001, "loss": 7.6339, "loss/crossentropy": 2.176504462957382, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2160520726814866, "step": 1290 }, { "epoch": 0.0325, "grad_norm": 32.25, "grad_norm_var": 5.916080729166667, "learning_rate": 0.0001, "loss": 7.6438, "loss/crossentropy": 2.173138880729675, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.2056989949196577, "step": 1300 }, { "epoch": 0.03275, "grad_norm": 32.25, "grad_norm_var": 5.78515625, "learning_rate": 0.0001, "loss": 7.7146, "loss/crossentropy": 2.247766065597534, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.20310762114822864, "step": 1310 }, { "epoch": 0.033, "grad_norm": 32.75, "grad_norm_var": 4.51640625, "learning_rate": 0.0001, "loss": 7.6452, "loss/crossentropy": 2.0862443327903746, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.18912406917661428, "step": 1320 }, { "epoch": 0.03325, "grad_norm": 34.5, "grad_norm_var": 7.220833333333333, "learning_rate": 0.0001, "loss": 7.7715, "loss/crossentropy": 2.093398702144623, "loss/hidden": 3.570703125, "loss/jsd": 0.0, "loss/logits": 0.2104920681566, "step": 1330 }, { "epoch": 0.0335, "grad_norm": 38.0, "grad_norm_var": 9.108268229166667, "learning_rate": 0.0001, "loss": 7.7368, "loss/crossentropy": 2.17246213555336, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.21665989980101585, "step": 1340 }, { "epoch": 0.03375, "grad_norm": 33.5, "grad_norm_var": 4.794205729166666, "learning_rate": 0.0001, "loss": 7.5892, "loss/crossentropy": 2.1238946616649628, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.2172164160758257, "step": 1350 }, { "epoch": 0.034, "grad_norm": 32.5, "grad_norm_var": 101.1431640625, "learning_rate": 0.0001, "loss": 7.6341, "loss/crossentropy": 2.194270025193691, "loss/hidden": 3.4859375, "loss/jsd": 0.0, "loss/logits": 0.19632596522569656, "step": 1360 }, { "epoch": 0.03425, "grad_norm": 33.25, "grad_norm_var": 3.9119140625, "learning_rate": 0.0001, "loss": 7.5496, "loss/crossentropy": 2.1282873928546904, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.17983752395957708, "step": 1370 }, { "epoch": 0.0345, "grad_norm": 34.5, "grad_norm_var": 4.324934895833334, "learning_rate": 0.0001, "loss": 7.6348, "loss/crossentropy": 2.140147662162781, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.20302014388144016, "step": 1380 }, { "epoch": 0.03475, "grad_norm": 30.75, "grad_norm_var": 2.818489583333333, "learning_rate": 0.0001, "loss": 7.7271, "loss/crossentropy": 2.128489089012146, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.19602114744484425, "step": 1390 }, { "epoch": 0.035, "grad_norm": 31.25, "grad_norm_var": 2.1458333333333335, "learning_rate": 0.0001, "loss": 7.6417, "loss/crossentropy": 2.1306474581360817, "loss/hidden": 3.582421875, "loss/jsd": 0.0, "loss/logits": 0.19735200479626655, "step": 1400 }, { "epoch": 0.03525, "grad_norm": 31.25, "grad_norm_var": 9.2009765625, "learning_rate": 0.0001, "loss": 7.7002, "loss/crossentropy": 2.173697289824486, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.20366120263934134, "step": 1410 }, { "epoch": 0.0355, "grad_norm": 31.75, "grad_norm_var": 9.913997395833333, "learning_rate": 0.0001, "loss": 7.6577, "loss/crossentropy": 2.26003720164299, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.20924863480031491, "step": 1420 }, { "epoch": 0.03575, "grad_norm": 30.25, "grad_norm_var": 26.66015625, "learning_rate": 0.0001, "loss": 7.5718, "loss/crossentropy": 2.2352112770080566, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.19357634484767913, "step": 1430 }, { "epoch": 0.036, "grad_norm": 34.0, "grad_norm_var": 30.602083333333333, "learning_rate": 0.0001, "loss": 7.5861, "loss/crossentropy": 2.0770506739616392, "loss/hidden": 3.5453125, "loss/jsd": 0.0, "loss/logits": 0.21514309681951999, "step": 1440 }, { "epoch": 0.03625, "grad_norm": 37.5, "grad_norm_var": 12.3875, "learning_rate": 0.0001, "loss": 7.4986, "loss/crossentropy": 2.0542123883962633, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.19684152901172638, "step": 1450 }, { "epoch": 0.0365, "grad_norm": 32.75, "grad_norm_var": 8.548372395833333, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.175110411643982, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.19031002502888442, "step": 1460 }, { "epoch": 0.03675, "grad_norm": 34.25, "grad_norm_var": 2.8429676028135214e+18, "learning_rate": 0.0001, "loss": 7.7702, "loss/crossentropy": 2.1691948026418686, "loss/hidden": 3.5859375, "loss/jsd": 0.0, "loss/logits": 0.22843880020081997, "step": 1470 }, { "epoch": 0.037, "grad_norm": 36.25, "grad_norm_var": 2.842967603101565e+18, "learning_rate": 0.0001, "loss": 7.6046, "loss/crossentropy": 2.0826233722269536, "loss/hidden": 3.520703125, "loss/jsd": 0.0, "loss/logits": 0.1938928204588592, "step": 1480 }, { "epoch": 0.03725, "grad_norm": 32.75, "grad_norm_var": 8.939518229166667, "learning_rate": 0.0001, "loss": 7.6265, "loss/crossentropy": 2.2077848985791206, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.1943045362830162, "step": 1490 }, { "epoch": 0.0375, "grad_norm": 34.25, "grad_norm_var": 7.7125, "learning_rate": 0.0001, "loss": 7.6217, "loss/crossentropy": 2.1079602181911468, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19446163363754748, "step": 1500 }, { "epoch": 0.03775, "grad_norm": 34.25, "grad_norm_var": 5.7931640625, "learning_rate": 0.0001, "loss": 7.5893, "loss/crossentropy": 2.078600898385048, "loss/hidden": 3.5734375, "loss/jsd": 0.0, "loss/logits": 0.21464722994714974, "step": 1510 }, { "epoch": 0.038, "grad_norm": 35.75, "grad_norm_var": 6.186458333333333, "learning_rate": 0.0001, "loss": 7.6365, "loss/crossentropy": 2.1014960765838624, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.19028044641017913, "step": 1520 }, { "epoch": 0.03825, "grad_norm": 36.0, "grad_norm_var": 4.455989583333333, "learning_rate": 0.0001, "loss": 7.5414, "loss/crossentropy": 2.1112293377518654, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18414278626441954, "step": 1530 }, { "epoch": 0.0385, "grad_norm": 32.25, "grad_norm_var": 3.7249348958333335, "learning_rate": 0.0001, "loss": 7.5681, "loss/crossentropy": 2.166412356495857, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.19420933350920677, "step": 1540 }, { "epoch": 0.03875, "grad_norm": 31.625, "grad_norm_var": 5.330989583333333, "learning_rate": 0.0001, "loss": 7.6663, "loss/crossentropy": 2.0857026129961014, "loss/hidden": 3.574609375, "loss/jsd": 0.0, "loss/logits": 0.2175652377307415, "step": 1550 }, { "epoch": 0.039, "grad_norm": 30.75, "grad_norm_var": 7.401822916666666, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.1806214213371278, "loss/hidden": 3.562890625, "loss/jsd": 0.0, "loss/logits": 0.21507157981395722, "step": 1560 }, { "epoch": 0.03925, "grad_norm": 37.5, "grad_norm_var": 9.043684895833334, "learning_rate": 0.0001, "loss": 7.5649, "loss/crossentropy": 2.073585295677185, "loss/hidden": 3.520703125, "loss/jsd": 0.0, "loss/logits": 0.19737527389079332, "step": 1570 }, { "epoch": 0.0395, "grad_norm": 32.5, "grad_norm_var": 4.607747395833333, "learning_rate": 0.0001, "loss": 7.6893, "loss/crossentropy": 2.262183803319931, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.22450251020491124, "step": 1580 }, { "epoch": 0.03975, "grad_norm": 31.25, "grad_norm_var": 1.3983723958333334, "learning_rate": 0.0001, "loss": 7.7101, "loss/crossentropy": 2.1200410187244416, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.20633359774947166, "step": 1590 }, { "epoch": 0.04, "grad_norm": 32.25, "grad_norm_var": 28.868684895833333, "learning_rate": 0.0001, "loss": 7.6821, "loss/crossentropy": 2.1520946115255355, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.20241751577705144, "step": 1600 }, { "epoch": 0.04025, "grad_norm": 34.5, "grad_norm_var": 24.308072916666667, "learning_rate": 0.0001, "loss": 7.6482, "loss/crossentropy": 2.109182408452034, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19619097150862216, "step": 1610 }, { "epoch": 0.0405, "grad_norm": 33.25, "grad_norm_var": 1.83125, "learning_rate": 0.0001, "loss": 7.614, "loss/crossentropy": 2.220561644434929, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.20333079397678375, "step": 1620 }, { "epoch": 0.04075, "grad_norm": 31.0, "grad_norm_var": 7.183072916666666, "learning_rate": 0.0001, "loss": 7.7748, "loss/crossentropy": 2.2026446878910066, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.20862093791365624, "step": 1630 }, { "epoch": 0.041, "grad_norm": 36.25, "grad_norm_var": 8.080989583333333, "learning_rate": 0.0001, "loss": 7.6592, "loss/crossentropy": 2.2313437908887863, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.19155636206269264, "step": 1640 }, { "epoch": 0.04125, "grad_norm": 31.5, "grad_norm_var": 3.8863932291666665, "learning_rate": 0.0001, "loss": 7.6964, "loss/crossentropy": 2.0529640942811964, "loss/hidden": 3.658984375, "loss/jsd": 0.0, "loss/logits": 0.23474433943629264, "step": 1650 }, { "epoch": 0.0415, "grad_norm": 33.25, "grad_norm_var": 1.0374348958333333, "learning_rate": 0.0001, "loss": 7.6483, "loss/crossentropy": 2.175355441868305, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19280093312263488, "step": 1660 }, { "epoch": 0.04175, "grad_norm": 29.625, "grad_norm_var": 2.4268229166666666, "learning_rate": 0.0001, "loss": 7.6277, "loss/crossentropy": 2.121490114927292, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.20306031554937362, "step": 1670 }, { "epoch": 0.042, "grad_norm": 36.75, "grad_norm_var": 186.4869140625, "learning_rate": 0.0001, "loss": 7.7912, "loss/crossentropy": 2.123810574412346, "loss/hidden": 3.596875, "loss/jsd": 0.0, "loss/logits": 0.20700039602816106, "step": 1680 }, { "epoch": 0.04225, "grad_norm": 33.0, "grad_norm_var": 194.72708333333333, "learning_rate": 0.0001, "loss": 7.5507, "loss/crossentropy": 2.198003688454628, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19452486634254457, "step": 1690 }, { "epoch": 0.0425, "grad_norm": 32.25, "grad_norm_var": 3.842643229166667, "learning_rate": 0.0001, "loss": 7.6641, "loss/crossentropy": 2.1328449815511705, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.20609250776469706, "step": 1700 }, { "epoch": 0.04275, "grad_norm": 36.5, "grad_norm_var": 23.064322916666665, "learning_rate": 0.0001, "loss": 7.5838, "loss/crossentropy": 2.1690568923950195, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.19301791079342365, "step": 1710 }, { "epoch": 0.043, "grad_norm": 30.0, "grad_norm_var": 5.9712890625, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.1149508744478225, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.19253603778779507, "step": 1720 }, { "epoch": 0.04325, "grad_norm": 33.0, "grad_norm_var": 3.82890625, "learning_rate": 0.0001, "loss": 7.6644, "loss/crossentropy": 2.070442554354668, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.19407737776637077, "step": 1730 }, { "epoch": 0.0435, "grad_norm": 30.625, "grad_norm_var": 1.5947265625, "learning_rate": 0.0001, "loss": 7.6468, "loss/crossentropy": 2.2249585568904875, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.20079109650105237, "step": 1740 }, { "epoch": 0.04375, "grad_norm": 32.25, "grad_norm_var": 4.1384765625, "learning_rate": 0.0001, "loss": 7.6522, "loss/crossentropy": 2.1712467283010484, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.21240621842443944, "step": 1750 }, { "epoch": 0.044, "grad_norm": 30.75, "grad_norm_var": 2.7018229166666665, "learning_rate": 0.0001, "loss": 7.6326, "loss/crossentropy": 2.1433209091424943, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.19783576354384422, "step": 1760 }, { "epoch": 0.04425, "grad_norm": 32.25, "grad_norm_var": 10.2634765625, "learning_rate": 0.0001, "loss": 7.7252, "loss/crossentropy": 2.1377856612205504, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19519764352589847, "step": 1770 }, { "epoch": 0.0445, "grad_norm": 32.25, "grad_norm_var": 9.817122395833334, "learning_rate": 0.0001, "loss": 7.6165, "loss/crossentropy": 2.2387378960847855, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.2087532427161932, "step": 1780 }, { "epoch": 0.04475, "grad_norm": 31.0, "grad_norm_var": 1.9650390625, "learning_rate": 0.0001, "loss": 7.6701, "loss/crossentropy": 2.280033028125763, "loss/hidden": 3.31875, "loss/jsd": 0.0, "loss/logits": 0.1907376278191805, "step": 1790 }, { "epoch": 0.045, "grad_norm": 33.75, "grad_norm_var": 2.4197265625, "learning_rate": 0.0001, "loss": 7.6553, "loss/crossentropy": 2.205285739898682, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.1980523556470871, "step": 1800 }, { "epoch": 0.04525, "grad_norm": 31.0, "grad_norm_var": 3.2462890625, "learning_rate": 0.0001, "loss": 7.6001, "loss/crossentropy": 2.047496220469475, "loss/hidden": 3.548046875, "loss/jsd": 0.0, "loss/logits": 0.19389633461833, "step": 1810 }, { "epoch": 0.0455, "grad_norm": 31.125, "grad_norm_var": 2.562239583333333, "learning_rate": 0.0001, "loss": 7.615, "loss/crossentropy": 2.174453580379486, "loss/hidden": 3.516796875, "loss/jsd": 0.0, "loss/logits": 0.20545508041977883, "step": 1820 }, { "epoch": 0.04575, "grad_norm": 34.25, "grad_norm_var": 3.4296223958333334, "learning_rate": 0.0001, "loss": 7.638, "loss/crossentropy": 2.0722746759653092, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.19747158586978913, "step": 1830 }, { "epoch": 0.046, "grad_norm": 34.75, "grad_norm_var": 3.0666015625, "learning_rate": 0.0001, "loss": 7.7087, "loss/crossentropy": 2.1196924835443496, "loss/hidden": 3.622265625, "loss/jsd": 0.0, "loss/logits": 0.20298538953065873, "step": 1840 }, { "epoch": 0.04625, "grad_norm": 29.75, "grad_norm_var": 2.6119140625, "learning_rate": 0.0001, "loss": 7.6036, "loss/crossentropy": 2.1688392132520677, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.17962730433791876, "step": 1850 }, { "epoch": 0.0465, "grad_norm": 33.75, "grad_norm_var": 1.3374348958333333, "learning_rate": 0.0001, "loss": 7.7051, "loss/crossentropy": 2.1148360162973403, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.2195219134911895, "step": 1860 }, { "epoch": 0.04675, "grad_norm": 33.5, "grad_norm_var": 2.45625, "learning_rate": 0.0001, "loss": 7.6397, "loss/crossentropy": 2.016439202427864, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.2061541959643364, "step": 1870 }, { "epoch": 0.047, "grad_norm": 29.75, "grad_norm_var": 4.173893229166667, "learning_rate": 0.0001, "loss": 7.7054, "loss/crossentropy": 2.1470705419778824, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.21371309272944927, "step": 1880 }, { "epoch": 0.04725, "grad_norm": 32.5, "grad_norm_var": 4.054622395833333, "learning_rate": 0.0001, "loss": 7.5959, "loss/crossentropy": 2.265937978029251, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.1929181769490242, "step": 1890 }, { "epoch": 0.0475, "grad_norm": 30.125, "grad_norm_var": 6.6197265625, "learning_rate": 0.0001, "loss": 7.6012, "loss/crossentropy": 2.0853475779294968, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.20215214397758247, "step": 1900 }, { "epoch": 0.04775, "grad_norm": 34.0, "grad_norm_var": 23.695572916666666, "learning_rate": 0.0001, "loss": 7.7544, "loss/crossentropy": 2.162308484315872, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.2013952497392893, "step": 1910 }, { "epoch": 0.048, "grad_norm": 30.75, "grad_norm_var": 4.120768229166667, "learning_rate": 0.0001, "loss": 7.6092, "loss/crossentropy": 2.088267083466053, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2007219024002552, "step": 1920 }, { "epoch": 0.04825, "grad_norm": 35.75, "grad_norm_var": 2.97890625, "learning_rate": 0.0001, "loss": 7.7141, "loss/crossentropy": 2.0617689430713653, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.1928685350343585, "step": 1930 }, { "epoch": 0.0485, "grad_norm": 35.0, "grad_norm_var": 6.073893229166667, "learning_rate": 0.0001, "loss": 7.7068, "loss/crossentropy": 2.1201131522655485, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.20757155679166317, "step": 1940 }, { "epoch": 0.04875, "grad_norm": 31.5, "grad_norm_var": 21.345572916666665, "learning_rate": 0.0001, "loss": 7.6198, "loss/crossentropy": 2.235423868894577, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.1959926813840866, "step": 1950 }, { "epoch": 0.049, "grad_norm": 30.625, "grad_norm_var": 28.99765625, "learning_rate": 0.0001, "loss": 7.6389, "loss/crossentropy": 2.205905148386955, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.2116202499717474, "step": 1960 }, { "epoch": 0.04925, "grad_norm": 33.25, "grad_norm_var": 9.9369140625, "learning_rate": 0.0001, "loss": 7.7121, "loss/crossentropy": 2.163422483205795, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.1981559544801712, "step": 1970 }, { "epoch": 0.0495, "grad_norm": 29.0, "grad_norm_var": 9.118489583333334, "learning_rate": 0.0001, "loss": 7.6772, "loss/crossentropy": 2.1636913806200027, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.1968079771846533, "step": 1980 }, { "epoch": 0.04975, "grad_norm": 35.25, "grad_norm_var": 4.377018229166667, "learning_rate": 0.0001, "loss": 7.5948, "loss/crossentropy": 2.174500140547752, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1923616673797369, "step": 1990 }, { "epoch": 0.05, "grad_norm": 30.75, "grad_norm_var": 6.15625, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 2.1197956264019013, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.20423812307417394, "step": 2000 }, { "epoch": 0.05025, "grad_norm": 33.5, "grad_norm_var": 4.725455729166667, "learning_rate": 0.0001, "loss": 7.6238, "loss/crossentropy": 2.1442053347826002, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.20131820477545262, "step": 2010 }, { "epoch": 0.0505, "grad_norm": 33.0, "grad_norm_var": 3.2212890625, "learning_rate": 0.0001, "loss": 7.6024, "loss/crossentropy": 2.1970301985740663, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.19248049296438693, "step": 2020 }, { "epoch": 0.05075, "grad_norm": 31.5, "grad_norm_var": 2.2853515625, "learning_rate": 0.0001, "loss": 7.6279, "loss/crossentropy": 2.0732986360788344, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19802382439374924, "step": 2030 }, { "epoch": 0.051, "grad_norm": 35.25, "grad_norm_var": 3.2129557291666666, "learning_rate": 0.0001, "loss": 7.643, "loss/crossentropy": 2.196815450489521, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.20899684820324183, "step": 2040 }, { "epoch": 0.05125, "grad_norm": 34.5, "grad_norm_var": 4.093489583333334, "learning_rate": 0.0001, "loss": 7.6321, "loss/crossentropy": 2.083095496892929, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.18292178437113762, "step": 2050 }, { "epoch": 0.0515, "grad_norm": 31.875, "grad_norm_var": 19.478059895833333, "learning_rate": 0.0001, "loss": 7.5882, "loss/crossentropy": 2.2153579622507094, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.19245057981461286, "step": 2060 }, { "epoch": 0.05175, "grad_norm": 34.5, "grad_norm_var": 16.97265625, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.2516845196485518, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1975632380694151, "step": 2070 }, { "epoch": 0.052, "grad_norm": 29.75, "grad_norm_var": 2.71015625, "learning_rate": 0.0001, "loss": 7.6393, "loss/crossentropy": 2.1561204314231874, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.2124529665336013, "step": 2080 }, { "epoch": 0.05225, "grad_norm": 35.75, "grad_norm_var": 36.984375, "learning_rate": 0.0001, "loss": 7.7289, "loss/crossentropy": 2.2129232093691824, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.1984808323904872, "step": 2090 }, { "epoch": 0.0525, "grad_norm": 28.875, "grad_norm_var": 38.076822916666664, "learning_rate": 0.0001, "loss": 7.5446, "loss/crossentropy": 2.281945154070854, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.1915616899728775, "step": 2100 }, { "epoch": 0.05275, "grad_norm": 30.5, "grad_norm_var": 2.10625, "learning_rate": 0.0001, "loss": 7.6215, "loss/crossentropy": 2.0773366719484327, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.18790210355073214, "step": 2110 }, { "epoch": 0.053, "grad_norm": 31.25, "grad_norm_var": 1.24765625, "learning_rate": 0.0001, "loss": 7.4968, "loss/crossentropy": 2.186223568022251, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.1900737203657627, "step": 2120 }, { "epoch": 0.05325, "grad_norm": 30.75, "grad_norm_var": 4.638541666666667, "learning_rate": 0.0001, "loss": 7.6574, "loss/crossentropy": 2.2741902500391005, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1898935280740261, "step": 2130 }, { "epoch": 0.0535, "grad_norm": 33.75, "grad_norm_var": 18.001041666666666, "learning_rate": 0.0001, "loss": 7.6903, "loss/crossentropy": 2.1332941919565203, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.1954928996041417, "step": 2140 }, { "epoch": 0.05375, "grad_norm": 34.5, "grad_norm_var": 17.939583333333335, "learning_rate": 0.0001, "loss": 7.5786, "loss/crossentropy": 2.2076333969831468, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.20088096596300603, "step": 2150 }, { "epoch": 0.054, "grad_norm": 33.5, "grad_norm_var": 8.947916666666666, "learning_rate": 0.0001, "loss": 7.5995, "loss/crossentropy": 2.201739010214806, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.20021349862217902, "step": 2160 }, { "epoch": 0.05425, "grad_norm": 30.0, "grad_norm_var": 185.5900390625, "learning_rate": 0.0001, "loss": 7.6214, "loss/crossentropy": 2.1913442850112914, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.1996122680604458, "step": 2170 }, { "epoch": 0.0545, "grad_norm": 30.625, "grad_norm_var": 186.84166666666667, "learning_rate": 0.0001, "loss": 7.7375, "loss/crossentropy": 2.173484447598457, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.18794310167431832, "step": 2180 }, { "epoch": 0.05475, "grad_norm": 31.375, "grad_norm_var": 8.699739583333333, "learning_rate": 0.0001, "loss": 7.6023, "loss/crossentropy": 2.207549235224724, "loss/hidden": 3.494140625, "loss/jsd": 0.0, "loss/logits": 0.21360519118607044, "step": 2190 }, { "epoch": 0.055, "grad_norm": 32.0, "grad_norm_var": 5.228059895833334, "learning_rate": 0.0001, "loss": 7.5821, "loss/crossentropy": 2.168141430988908, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.1850608481094241, "step": 2200 }, { "epoch": 0.05525, "grad_norm": 29.625, "grad_norm_var": 10.708268229166666, "learning_rate": 0.0001, "loss": 7.6191, "loss/crossentropy": 2.26080215126276, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19979589320719243, "step": 2210 }, { "epoch": 0.0555, "grad_norm": 31.25, "grad_norm_var": 10.148958333333333, "learning_rate": 0.0001, "loss": 7.613, "loss/crossentropy": 2.2105998665094377, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.20264392383396626, "step": 2220 }, { "epoch": 0.05575, "grad_norm": 29.5, "grad_norm_var": 9.128059895833333, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.076467031240463, "loss/hidden": 3.59296875, "loss/jsd": 0.0, "loss/logits": 0.22621012963354586, "step": 2230 }, { "epoch": 0.056, "grad_norm": 34.5, "grad_norm_var": 3.99375, "learning_rate": 0.0001, "loss": 7.6686, "loss/crossentropy": 2.0320577889680864, "loss/hidden": 3.575, "loss/jsd": 0.0, "loss/logits": 0.19981470778584481, "step": 2240 }, { "epoch": 0.05625, "grad_norm": 30.75, "grad_norm_var": 14.473893229166666, "learning_rate": 0.0001, "loss": 7.5723, "loss/crossentropy": 2.084241083264351, "loss/hidden": 3.545703125, "loss/jsd": 0.0, "loss/logits": 0.2100257944315672, "step": 2250 }, { "epoch": 0.0565, "grad_norm": 31.5, "grad_norm_var": 44.7041015625, "learning_rate": 0.0001, "loss": 7.6107, "loss/crossentropy": 2.2695932418107985, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.19781356416642665, "step": 2260 }, { "epoch": 0.05675, "grad_norm": 30.875, "grad_norm_var": 9.897916666666667, "learning_rate": 0.0001, "loss": 7.6187, "loss/crossentropy": 2.188571906089783, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.2015857521444559, "step": 2270 }, { "epoch": 0.057, "grad_norm": 28.375, "grad_norm_var": 3.490625, "learning_rate": 0.0001, "loss": 7.6139, "loss/crossentropy": 2.1134210243821143, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.22417646870017052, "step": 2280 }, { "epoch": 0.05725, "grad_norm": 32.0, "grad_norm_var": 6.479622395833333, "learning_rate": 0.0001, "loss": 7.6591, "loss/crossentropy": 2.1894455403089523, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.19847002141177655, "step": 2290 }, { "epoch": 0.0575, "grad_norm": 31.25, "grad_norm_var": 8.809309895833334, "learning_rate": 0.0001, "loss": 7.657, "loss/crossentropy": 2.1524556159973143, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.1857963975518942, "step": 2300 }, { "epoch": 0.05775, "grad_norm": 32.25, "grad_norm_var": 3.121809895833333, "learning_rate": 0.0001, "loss": 7.6475, "loss/crossentropy": 2.2037901908159254, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.22191528491675855, "step": 2310 }, { "epoch": 0.058, "grad_norm": 32.5, "grad_norm_var": 2.6348307291666666, "learning_rate": 0.0001, "loss": 7.5846, "loss/crossentropy": 2.214809921383858, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.19421134144067764, "step": 2320 }, { "epoch": 0.05825, "grad_norm": 29.375, "grad_norm_var": 2.939322916666667, "learning_rate": 0.0001, "loss": 7.6893, "loss/crossentropy": 2.2204942047595977, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.21439925488084555, "step": 2330 }, { "epoch": 0.0585, "grad_norm": 28.625, "grad_norm_var": 3.8744140625, "learning_rate": 0.0001, "loss": 7.6038, "loss/crossentropy": 2.1540059238672256, "loss/hidden": 3.56953125, "loss/jsd": 0.0, "loss/logits": 0.24175845962017775, "step": 2340 }, { "epoch": 0.05875, "grad_norm": 31.875, "grad_norm_var": 2.1426432291666666, "learning_rate": 0.0001, "loss": 7.6379, "loss/crossentropy": 2.1948168754577635, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.2041913490742445, "step": 2350 }, { "epoch": 0.059, "grad_norm": 32.5, "grad_norm_var": 1.7869140625, "learning_rate": 0.0001, "loss": 7.7135, "loss/crossentropy": 2.1938526153564455, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.1992840923368931, "step": 2360 }, { "epoch": 0.05925, "grad_norm": 33.75, "grad_norm_var": 1.1343098958333333, "learning_rate": 0.0001, "loss": 7.6931, "loss/crossentropy": 2.12769907861948, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18500677905976773, "step": 2370 }, { "epoch": 0.0595, "grad_norm": 33.0, "grad_norm_var": 3.1884765625, "learning_rate": 0.0001, "loss": 7.6297, "loss/crossentropy": 2.1268584340810777, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.20107861533761023, "step": 2380 }, { "epoch": 0.05975, "grad_norm": 31.0, "grad_norm_var": 5.368489583333333, "learning_rate": 0.0001, "loss": 7.555, "loss/crossentropy": 2.198070913553238, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.19437791910022498, "step": 2390 }, { "epoch": 0.06, "grad_norm": 31.0, "grad_norm_var": 3.218489583333333, "learning_rate": 0.0001, "loss": 7.633, "loss/crossentropy": 2.1521017968654634, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.19696612432599067, "step": 2400 }, { "epoch": 0.06025, "grad_norm": 31.625, "grad_norm_var": 1.4098307291666667, "learning_rate": 0.0001, "loss": 7.6634, "loss/crossentropy": 2.0935733556747436, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.19025789983570576, "step": 2410 }, { "epoch": 0.0605, "grad_norm": 31.75, "grad_norm_var": 4.812434895833333, "learning_rate": 0.0001, "loss": 7.6523, "loss/crossentropy": 2.206766763329506, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.19223052635788918, "step": 2420 }, { "epoch": 0.06075, "grad_norm": 32.0, "grad_norm_var": 5.545247395833333, "learning_rate": 0.0001, "loss": 7.6703, "loss/crossentropy": 2.2091148614883425, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.2191623793914914, "step": 2430 }, { "epoch": 0.061, "grad_norm": 31.875, "grad_norm_var": 3.06640625, "learning_rate": 0.0001, "loss": 7.6194, "loss/crossentropy": 2.2076220482587816, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.21331611163914205, "step": 2440 }, { "epoch": 0.06125, "grad_norm": 33.75, "grad_norm_var": 3.753125, "learning_rate": 0.0001, "loss": 7.6143, "loss/crossentropy": 2.1473243802785875, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.20034591071307659, "step": 2450 }, { "epoch": 0.0615, "grad_norm": 31.25, "grad_norm_var": 4.09765625, "learning_rate": 0.0001, "loss": 7.6166, "loss/crossentropy": 2.2176205784082414, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.1890367180109024, "step": 2460 }, { "epoch": 0.06175, "grad_norm": 32.75, "grad_norm_var": 2.56015625, "learning_rate": 0.0001, "loss": 7.5864, "loss/crossentropy": 2.139193335175514, "loss/hidden": 3.55234375, "loss/jsd": 0.0, "loss/logits": 0.19700367711484432, "step": 2470 }, { "epoch": 0.062, "grad_norm": 30.625, "grad_norm_var": 3.5400390625, "learning_rate": 0.0001, "loss": 7.6061, "loss/crossentropy": 2.101886364817619, "loss/hidden": 3.543359375, "loss/jsd": 0.0, "loss/logits": 0.20392275378108024, "step": 2480 }, { "epoch": 0.06225, "grad_norm": 31.625, "grad_norm_var": 2.3353515625, "learning_rate": 0.0001, "loss": 7.5912, "loss/crossentropy": 2.1105535492300986, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20442402064800264, "step": 2490 }, { "epoch": 0.0625, "grad_norm": 32.25, "grad_norm_var": 2.183333333333333, "learning_rate": 0.0001, "loss": 7.6553, "loss/crossentropy": 2.1315447479486465, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.20164060425013303, "step": 2500 }, { "epoch": 0.06275, "grad_norm": 33.25, "grad_norm_var": 147.15149739583333, "learning_rate": 0.0001, "loss": 7.6542, "loss/crossentropy": 2.0641630738973618, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.20385651774704455, "step": 2510 }, { "epoch": 0.063, "grad_norm": 30.5, "grad_norm_var": 150.99166666666667, "learning_rate": 0.0001, "loss": 7.5568, "loss/crossentropy": 2.191535955667496, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19768227599561214, "step": 2520 }, { "epoch": 0.06325, "grad_norm": 29.0, "grad_norm_var": 2.13515625, "learning_rate": 0.0001, "loss": 7.6544, "loss/crossentropy": 2.199158227443695, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.2097537014633417, "step": 2530 }, { "epoch": 0.0635, "grad_norm": 31.5, "grad_norm_var": 2.731705729166667, "learning_rate": 0.0001, "loss": 7.5063, "loss/crossentropy": 2.1456793427467344, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19175102189183235, "step": 2540 }, { "epoch": 0.06375, "grad_norm": 32.5, "grad_norm_var": 6.859830729166666, "learning_rate": 0.0001, "loss": 7.5828, "loss/crossentropy": 2.255453732609749, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.19693338237702845, "step": 2550 }, { "epoch": 0.064, "grad_norm": 31.625, "grad_norm_var": 5.178125, "learning_rate": 0.0001, "loss": 7.5702, "loss/crossentropy": 2.2270909011363984, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.20889390334486962, "step": 2560 }, { "epoch": 0.06425, "grad_norm": 33.5, "grad_norm_var": 3.6372395833333333, "learning_rate": 0.0001, "loss": 7.5904, "loss/crossentropy": 2.190132850408554, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.20386817157268525, "step": 2570 }, { "epoch": 0.0645, "grad_norm": 34.25, "grad_norm_var": 10.79765625, "learning_rate": 0.0001, "loss": 7.5854, "loss/crossentropy": 2.07715407460928, "loss/hidden": 3.581640625, "loss/jsd": 0.0, "loss/logits": 0.20797281824052333, "step": 2580 }, { "epoch": 0.06475, "grad_norm": 33.5, "grad_norm_var": 12.35625, "learning_rate": 0.0001, "loss": 7.6279, "loss/crossentropy": 2.1247923612594604, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.21620508767664431, "step": 2590 }, { "epoch": 0.065, "grad_norm": 32.75, "grad_norm_var": 5.094791666666667, "learning_rate": 0.0001, "loss": 7.5996, "loss/crossentropy": 2.087959203124046, "loss/hidden": 3.521875, "loss/jsd": 0.0, "loss/logits": 0.19923710729926825, "step": 2600 }, { "epoch": 0.06525, "grad_norm": 30.0, "grad_norm_var": 7.6150390625, "learning_rate": 0.0001, "loss": 7.682, "loss/crossentropy": 2.1805250465869905, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.2021130472421646, "step": 2610 }, { "epoch": 0.0655, "grad_norm": 34.75, "grad_norm_var": 7.06015625, "learning_rate": 0.0001, "loss": 7.7244, "loss/crossentropy": 2.1178730964660644, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19405451826751233, "step": 2620 }, { "epoch": 0.06575, "grad_norm": 29.75, "grad_norm_var": 3.065559895833333, "learning_rate": 0.0001, "loss": 7.6483, "loss/crossentropy": 2.1593512505292893, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.2096536297351122, "step": 2630 }, { "epoch": 0.066, "grad_norm": 33.25, "grad_norm_var": 4.623372395833333, "learning_rate": 0.0001, "loss": 7.5982, "loss/crossentropy": 2.159625916182995, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.18939675595611333, "step": 2640 }, { "epoch": 0.06625, "grad_norm": 53.0, "grad_norm_var": 49.99524739583333, "learning_rate": 0.0001, "loss": 7.6918, "loss/crossentropy": 2.114516945183277, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.2009023107588291, "step": 2650 }, { "epoch": 0.0665, "grad_norm": 30.125, "grad_norm_var": 38.81399739583333, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.132766366004944, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.1965734062716365, "step": 2660 }, { "epoch": 0.06675, "grad_norm": 30.875, "grad_norm_var": 1.9559895833333334, "learning_rate": 0.0001, "loss": 7.6338, "loss/crossentropy": 2.1092610150575637, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.17978871315717698, "step": 2670 }, { "epoch": 0.067, "grad_norm": 29.875, "grad_norm_var": 4.47890625, "learning_rate": 0.0001, "loss": 7.617, "loss/crossentropy": 2.2271903961896897, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.2073811784386635, "step": 2680 }, { "epoch": 0.06725, "grad_norm": 30.125, "grad_norm_var": 3.2249348958333335, "learning_rate": 0.0001, "loss": 7.641, "loss/crossentropy": 2.0155764549970625, "loss/hidden": 3.603515625, "loss/jsd": 0.0, "loss/logits": 0.2049756994470954, "step": 2690 }, { "epoch": 0.0675, "grad_norm": 33.5, "grad_norm_var": 3.0061848958333335, "learning_rate": 0.0001, "loss": 7.6253, "loss/crossentropy": 2.221065053343773, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.2112014289945364, "step": 2700 }, { "epoch": 0.06775, "grad_norm": 32.25, "grad_norm_var": 18.753125, "learning_rate": 0.0001, "loss": 7.6427, "loss/crossentropy": 2.180001160502434, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.193130424618721, "step": 2710 }, { "epoch": 0.068, "grad_norm": 32.5, "grad_norm_var": 20.773893229166667, "learning_rate": 0.0001, "loss": 7.6163, "loss/crossentropy": 2.283226564526558, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.19388929307460784, "step": 2720 }, { "epoch": 0.06825, "grad_norm": 31.25, "grad_norm_var": 1.61640625, "learning_rate": 0.0001, "loss": 7.587, "loss/crossentropy": 2.1378406554460527, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2115953892469406, "step": 2730 }, { "epoch": 0.0685, "grad_norm": 32.5, "grad_norm_var": 1.8684895833333333, "learning_rate": 0.0001, "loss": 7.6011, "loss/crossentropy": 2.0985760882496836, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.19036055766046048, "step": 2740 }, { "epoch": 0.06875, "grad_norm": 32.25, "grad_norm_var": 2.9535807291666667, "learning_rate": 0.0001, "loss": 7.6583, "loss/crossentropy": 2.1665745437145234, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.18649150040000678, "step": 2750 }, { "epoch": 0.069, "grad_norm": 34.5, "grad_norm_var": 6.343489583333334, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.176983141899109, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.20285341441631316, "step": 2760 }, { "epoch": 0.06925, "grad_norm": 32.5, "grad_norm_var": 4.972916666666666, "learning_rate": 0.0001, "loss": 7.6597, "loss/crossentropy": 2.1060123026371, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.19327255934476853, "step": 2770 }, { "epoch": 0.0695, "grad_norm": 31.625, "grad_norm_var": 32.33639322916667, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 2.1663936868309976, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.22126073129475116, "step": 2780 }, { "epoch": 0.06975, "grad_norm": 32.5, "grad_norm_var": 5.694791666666666, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 2.225750984251499, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19473073966801166, "step": 2790 }, { "epoch": 0.07, "grad_norm": 31.5, "grad_norm_var": 4.237434895833333, "learning_rate": 0.0001, "loss": 7.6174, "loss/crossentropy": 2.0647315263748167, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.2136565549299121, "step": 2800 }, { "epoch": 0.07025, "grad_norm": 36.0, "grad_norm_var": 4.792708333333334, "learning_rate": 0.0001, "loss": 7.6789, "loss/crossentropy": 2.1971701353788378, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.21544951274991037, "step": 2810 }, { "epoch": 0.0705, "grad_norm": 31.125, "grad_norm_var": 11.145247395833334, "learning_rate": 0.0001, "loss": 7.7043, "loss/crossentropy": 2.2537077218294144, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19961411394178868, "step": 2820 }, { "epoch": 0.07075, "grad_norm": 30.5, "grad_norm_var": 85.65182291666666, "learning_rate": 0.0001, "loss": 7.6427, "loss/crossentropy": 2.0513558954000475, "loss/hidden": 3.615234375, "loss/jsd": 0.0, "loss/logits": 0.23545071221888064, "step": 2830 }, { "epoch": 0.071, "grad_norm": 31.375, "grad_norm_var": 64.77180989583333, "learning_rate": 0.0001, "loss": 7.6378, "loss/crossentropy": 2.186201846599579, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.20769538041204214, "step": 2840 }, { "epoch": 0.07125, "grad_norm": 32.25, "grad_norm_var": 2.0268229166666667, "learning_rate": 0.0001, "loss": 7.5525, "loss/crossentropy": 2.161085495352745, "loss/hidden": 3.27421875, "loss/jsd": 0.0, "loss/logits": 0.18488222286105155, "step": 2850 }, { "epoch": 0.0715, "grad_norm": 33.0, "grad_norm_var": 10.437434895833333, "learning_rate": 0.0001, "loss": 7.6376, "loss/crossentropy": 2.09626332372427, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.179809108376503, "step": 2860 }, { "epoch": 0.07175, "grad_norm": 33.25, "grad_norm_var": 9.233072916666666, "learning_rate": 0.0001, "loss": 7.62, "loss/crossentropy": 2.2382488936185836, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.20872681811451912, "step": 2870 }, { "epoch": 0.072, "grad_norm": 28.5, "grad_norm_var": 8.784375, "learning_rate": 0.0001, "loss": 7.6516, "loss/crossentropy": 2.1699771240353583, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.2100867312401533, "step": 2880 }, { "epoch": 0.07225, "grad_norm": 33.75, "grad_norm_var": 9.269791666666666, "learning_rate": 0.0001, "loss": 7.5839, "loss/crossentropy": 2.1368533104658125, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.21750828213989734, "step": 2890 }, { "epoch": 0.0725, "grad_norm": 36.0, "grad_norm_var": 5.518489583333333, "learning_rate": 0.0001, "loss": 7.6849, "loss/crossentropy": 2.12222815155983, "loss/hidden": 3.580859375, "loss/jsd": 0.0, "loss/logits": 0.21114687696099282, "step": 2900 }, { "epoch": 0.07275, "grad_norm": 31.125, "grad_norm_var": 5.622916666666667, "learning_rate": 0.0001, "loss": 7.5109, "loss/crossentropy": 2.171084225177765, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.1979156408458948, "step": 2910 }, { "epoch": 0.073, "grad_norm": 31.125, "grad_norm_var": 1.5619140625, "learning_rate": 0.0001, "loss": 7.6895, "loss/crossentropy": 2.164732736349106, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.20426477529108525, "step": 2920 }, { "epoch": 0.07325, "grad_norm": 29.375, "grad_norm_var": 1.7854166666666667, "learning_rate": 0.0001, "loss": 7.5573, "loss/crossentropy": 2.1073058575391768, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.2097570365294814, "step": 2930 }, { "epoch": 0.0735, "grad_norm": 30.875, "grad_norm_var": 2.4955729166666667, "learning_rate": 0.0001, "loss": 7.5697, "loss/crossentropy": 2.153279659152031, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.1900124330073595, "step": 2940 }, { "epoch": 0.07375, "grad_norm": 49.5, "grad_norm_var": 22.75390625, "learning_rate": 0.0001, "loss": 7.671, "loss/crossentropy": 2.2612457245588304, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18990697022527456, "step": 2950 }, { "epoch": 0.074, "grad_norm": 32.0, "grad_norm_var": 24.510416666666668, "learning_rate": 0.0001, "loss": 7.5814, "loss/crossentropy": 2.123460465669632, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.20658994875848294, "step": 2960 }, { "epoch": 0.07425, "grad_norm": 30.125, "grad_norm_var": 118.29837239583334, "learning_rate": 0.0001, "loss": 7.5481, "loss/crossentropy": 2.2275219768285752, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.18647960387170315, "step": 2970 }, { "epoch": 0.0745, "grad_norm": 29.375, "grad_norm_var": 21.989322916666666, "learning_rate": 0.0001, "loss": 7.5209, "loss/crossentropy": 2.1731285482645033, "loss/hidden": 3.4171875, "loss/jsd": 0.0, "loss/logits": 0.1889862149953842, "step": 2980 }, { "epoch": 0.07475, "grad_norm": 31.125, "grad_norm_var": 4.253059895833333, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.2546483501791954, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.18856723569333553, "step": 2990 }, { "epoch": 0.075, "grad_norm": 32.0, "grad_norm_var": 5.730143229166667, "learning_rate": 0.0001, "loss": 7.5835, "loss/crossentropy": 2.092367857694626, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.20502115599811077, "step": 3000 }, { "epoch": 0.07525, "grad_norm": 29.5, "grad_norm_var": 15.926822916666667, "learning_rate": 0.0001, "loss": 7.5849, "loss/crossentropy": 2.109961675107479, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.20341113824397325, "step": 3010 }, { "epoch": 0.0755, "grad_norm": 32.5, "grad_norm_var": 3.314322916666667, "learning_rate": 0.0001, "loss": 7.5896, "loss/crossentropy": 2.1648701071739196, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.20436643473803998, "step": 3020 }, { "epoch": 0.07575, "grad_norm": 32.75, "grad_norm_var": 1.4754557291666666, "learning_rate": 0.0001, "loss": 7.5959, "loss/crossentropy": 2.2054502993822096, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.20024821683764457, "step": 3030 }, { "epoch": 0.076, "grad_norm": 30.875, "grad_norm_var": 6.4931640625, "learning_rate": 0.0001, "loss": 7.6957, "loss/crossentropy": 2.166448511183262, "loss/hidden": 3.484765625, "loss/jsd": 0.0, "loss/logits": 0.20373598877340554, "step": 3040 }, { "epoch": 0.07625, "grad_norm": 36.5, "grad_norm_var": 9.442122395833334, "learning_rate": 0.0001, "loss": 7.5957, "loss/crossentropy": 2.218970799446106, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.20884830448776484, "step": 3050 }, { "epoch": 0.0765, "grad_norm": 32.25, "grad_norm_var": 6.062955729166666, "learning_rate": 0.0001, "loss": 7.5458, "loss/crossentropy": 2.080473840236664, "loss/hidden": 3.57109375, "loss/jsd": 0.0, "loss/logits": 0.20190774220973254, "step": 3060 }, { "epoch": 0.07675, "grad_norm": 29.5, "grad_norm_var": 2.6093098958333334, "learning_rate": 0.0001, "loss": 7.5646, "loss/crossentropy": 2.1775156021118165, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.19825822599232196, "step": 3070 }, { "epoch": 0.077, "grad_norm": 30.125, "grad_norm_var": 2.690625, "learning_rate": 0.0001, "loss": 7.7174, "loss/crossentropy": 2.246141794323921, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.20639744736254215, "step": 3080 }, { "epoch": 0.07725, "grad_norm": 30.875, "grad_norm_var": 4.271809895833333, "learning_rate": 0.0001, "loss": 7.6196, "loss/crossentropy": 2.060313332080841, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.21316638588905334, "step": 3090 }, { "epoch": 0.0775, "grad_norm": 34.0, "grad_norm_var": 2.873893229166667, "learning_rate": 0.0001, "loss": 7.5637, "loss/crossentropy": 2.153154730796814, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.20764457508921624, "step": 3100 }, { "epoch": 0.07775, "grad_norm": 33.25, "grad_norm_var": 2.2708333333333335, "learning_rate": 0.0001, "loss": 7.5924, "loss/crossentropy": 2.2558963537216186, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.19444480016827584, "step": 3110 }, { "epoch": 0.078, "grad_norm": 29.5, "grad_norm_var": 2.2301432291666665, "learning_rate": 0.0001, "loss": 7.7202, "loss/crossentropy": 2.190881980955601, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.2061827789992094, "step": 3120 }, { "epoch": 0.07825, "grad_norm": 31.375, "grad_norm_var": 2.991080729166667, "learning_rate": 0.0001, "loss": 7.5479, "loss/crossentropy": 2.1357465982437134, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19807269163429736, "step": 3130 }, { "epoch": 0.0785, "grad_norm": 29.125, "grad_norm_var": 5.457291666666666, "learning_rate": 0.0001, "loss": 7.641, "loss/crossentropy": 2.166859371960163, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.1900594387203455, "step": 3140 }, { "epoch": 0.07875, "grad_norm": 32.75, "grad_norm_var": 27.958072916666666, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.1595762044191362, "loss/hidden": 3.551171875, "loss/jsd": 0.0, "loss/logits": 0.22254167906939984, "step": 3150 }, { "epoch": 0.079, "grad_norm": 29.75, "grad_norm_var": 3.4056640625, "learning_rate": 0.0001, "loss": 7.6761, "loss/crossentropy": 2.156154304742813, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.2026256375014782, "step": 3160 }, { "epoch": 0.07925, "grad_norm": 32.0, "grad_norm_var": 7.246875, "learning_rate": 0.0001, "loss": 7.4683, "loss/crossentropy": 2.1108324408531187, "loss/hidden": 3.5015625, "loss/jsd": 0.0, "loss/logits": 0.18734413515776396, "step": 3170 }, { "epoch": 0.0795, "grad_norm": 28.375, "grad_norm_var": 3.9212890625, "learning_rate": 0.0001, "loss": 7.5591, "loss/crossentropy": 2.1108986347913743, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.19466390162706376, "step": 3180 }, { "epoch": 0.07975, "grad_norm": 32.5, "grad_norm_var": 15.6962890625, "learning_rate": 0.0001, "loss": 7.6375, "loss/crossentropy": 2.1180114537477492, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.21690767258405685, "step": 3190 }, { "epoch": 0.08, "grad_norm": 31.875, "grad_norm_var": 12.757747395833333, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.1203838691115378, "loss/hidden": 3.5421875, "loss/jsd": 0.0, "loss/logits": 0.20782926268875598, "step": 3200 }, { "epoch": 0.08025, "grad_norm": 28.375, "grad_norm_var": 3.3218098958333333, "learning_rate": 0.0001, "loss": 7.6303, "loss/crossentropy": 2.1929849207401277, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.19219291880726813, "step": 3210 }, { "epoch": 0.0805, "grad_norm": 30.0, "grad_norm_var": 3.1958333333333333, "learning_rate": 0.0001, "loss": 7.5282, "loss/crossentropy": 2.2013367488980293, "loss/hidden": 3.6, "loss/jsd": 0.0, "loss/logits": 0.21040805242955685, "step": 3220 }, { "epoch": 0.08075, "grad_norm": 32.25, "grad_norm_var": 1.8684895833333333, "learning_rate": 0.0001, "loss": 7.5331, "loss/crossentropy": 2.184007254242897, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.19979026056826116, "step": 3230 }, { "epoch": 0.081, "grad_norm": 30.75, "grad_norm_var": 2.130989583333333, "learning_rate": 0.0001, "loss": 7.5964, "loss/crossentropy": 2.2199858695268633, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.21446770764887332, "step": 3240 }, { "epoch": 0.08125, "grad_norm": 32.75, "grad_norm_var": 2.6483723958333334, "learning_rate": 0.0001, "loss": 7.602, "loss/crossentropy": 2.1263694643974302, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19737922623753548, "step": 3250 }, { "epoch": 0.0815, "grad_norm": 30.75, "grad_norm_var": 3.207291666666667, "learning_rate": 0.0001, "loss": 7.5927, "loss/crossentropy": 2.184669151902199, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18790993094444275, "step": 3260 }, { "epoch": 0.08175, "grad_norm": 29.875, "grad_norm_var": 2.857291666666667, "learning_rate": 0.0001, "loss": 7.615, "loss/crossentropy": 2.0831361666321753, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.19330178536474704, "step": 3270 }, { "epoch": 0.082, "grad_norm": 30.625, "grad_norm_var": 15.702018229166667, "learning_rate": 0.0001, "loss": 7.6216, "loss/crossentropy": 2.158697286248207, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18888361509889365, "step": 3280 }, { "epoch": 0.08225, "grad_norm": 32.75, "grad_norm_var": 18.211393229166667, "learning_rate": 0.0001, "loss": 7.564, "loss/crossentropy": 2.2913430631160736, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.2058469709008932, "step": 3290 }, { "epoch": 0.0825, "grad_norm": 35.5, "grad_norm_var": 4.24140625, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.2065580666065214, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18786473274230958, "step": 3300 }, { "epoch": 0.08275, "grad_norm": 32.5, "grad_norm_var": 3.692643229166667, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.1358665406703947, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.18491616416722537, "step": 3310 }, { "epoch": 0.083, "grad_norm": 31.75, "grad_norm_var": 2.50390625, "learning_rate": 0.0001, "loss": 7.6092, "loss/crossentropy": 2.206757593154907, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.20058272033929825, "step": 3320 }, { "epoch": 0.08325, "grad_norm": 34.5, "grad_norm_var": 1.7497395833333333, "learning_rate": 0.0001, "loss": 7.5493, "loss/crossentropy": 2.0709328591823577, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.1953151250258088, "step": 3330 }, { "epoch": 0.0835, "grad_norm": 34.5, "grad_norm_var": 2.9395182291666666, "learning_rate": 0.0001, "loss": 7.7584, "loss/crossentropy": 2.1559954971075057, "loss/hidden": 3.5484375, "loss/jsd": 0.0, "loss/logits": 0.20604321975260972, "step": 3340 }, { "epoch": 0.08375, "grad_norm": 33.75, "grad_norm_var": 17.864322916666666, "learning_rate": 0.0001, "loss": 7.6969, "loss/crossentropy": 2.1975975424051284, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.2027706265449524, "step": 3350 }, { "epoch": 0.084, "grad_norm": 33.25, "grad_norm_var": 2.7997395833333334, "learning_rate": 0.0001, "loss": 7.61, "loss/crossentropy": 2.018556122481823, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.18029189426451922, "step": 3360 }, { "epoch": 0.08425, "grad_norm": 33.0, "grad_norm_var": 2.5994140625, "learning_rate": 0.0001, "loss": 7.5397, "loss/crossentropy": 2.1838466703891752, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.2002351511269808, "step": 3370 }, { "epoch": 0.0845, "grad_norm": 32.75, "grad_norm_var": 2.912955729166667, "learning_rate": 0.0001, "loss": 7.5982, "loss/crossentropy": 2.184953287243843, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.19311312437057496, "step": 3380 }, { "epoch": 0.08475, "grad_norm": 34.25, "grad_norm_var": 3.309375, "learning_rate": 0.0001, "loss": 7.5841, "loss/crossentropy": 2.2160476714372637, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1966065490618348, "step": 3390 }, { "epoch": 0.085, "grad_norm": 31.75, "grad_norm_var": 2.1936848958333335, "learning_rate": 0.0001, "loss": 7.588, "loss/crossentropy": 2.2071674168109894, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.19414376243948936, "step": 3400 }, { "epoch": 0.08525, "grad_norm": 33.0, "grad_norm_var": 1.6301432291666667, "learning_rate": 0.0001, "loss": 7.5966, "loss/crossentropy": 2.117925961315632, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.21105701606720687, "step": 3410 }, { "epoch": 0.0855, "grad_norm": 32.25, "grad_norm_var": 5.82265625, "learning_rate": 0.0001, "loss": 7.62, "loss/crossentropy": 2.0512605965137483, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.20283049941062928, "step": 3420 }, { "epoch": 0.08575, "grad_norm": 31.75, "grad_norm_var": 6.539583333333334, "learning_rate": 0.0001, "loss": 7.5429, "loss/crossentropy": 2.074887050688267, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.18658901005983353, "step": 3430 }, { "epoch": 0.086, "grad_norm": 33.5, "grad_norm_var": 8.426497395833334, "learning_rate": 0.0001, "loss": 7.5759, "loss/crossentropy": 2.1776267290115356, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.20649599879980088, "step": 3440 }, { "epoch": 0.08625, "grad_norm": 43.5, "grad_norm_var": 14.962434895833333, "learning_rate": 0.0001, "loss": 7.6429, "loss/crossentropy": 2.12281953394413, "loss/hidden": 3.5265625, "loss/jsd": 0.0, "loss/logits": 0.20702828094363213, "step": 3450 }, { "epoch": 0.0865, "grad_norm": 31.0, "grad_norm_var": 196.96555989583334, "learning_rate": 0.0001, "loss": 7.7919, "loss/crossentropy": 2.1028707295656206, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.22825684808194638, "step": 3460 }, { "epoch": 0.08675, "grad_norm": 32.25, "grad_norm_var": 206.46979166666668, "learning_rate": 0.0001, "loss": 7.5698, "loss/crossentropy": 2.1032180160284044, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.1899772472679615, "step": 3470 }, { "epoch": 0.087, "grad_norm": 37.0, "grad_norm_var": 15.6322265625, "learning_rate": 0.0001, "loss": 7.5884, "loss/crossentropy": 2.0837722390890123, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.20534363873302935, "step": 3480 }, { "epoch": 0.08725, "grad_norm": 30.0, "grad_norm_var": 16.8806640625, "learning_rate": 0.0001, "loss": 7.5719, "loss/crossentropy": 2.1673771381378173, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.21180946305394172, "step": 3490 }, { "epoch": 0.0875, "grad_norm": 33.0, "grad_norm_var": 16.212955729166666, "learning_rate": 0.0001, "loss": 7.5171, "loss/crossentropy": 2.2269717276096346, "loss/hidden": 3.294140625, "loss/jsd": 0.0, "loss/logits": 0.18251859862357378, "step": 3500 }, { "epoch": 0.08775, "grad_norm": 33.5, "grad_norm_var": 395.25390625, "learning_rate": 0.0001, "loss": 7.6825, "loss/crossentropy": 2.2768601924180984, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.17959882766008378, "step": 3510 }, { "epoch": 0.088, "grad_norm": 31.875, "grad_norm_var": 400.7280598958333, "learning_rate": 0.0001, "loss": 7.5387, "loss/crossentropy": 2.174117147922516, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.1909211568534374, "step": 3520 }, { "epoch": 0.08825, "grad_norm": 34.0, "grad_norm_var": 3.2527951689747005e+18, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.102574473619461, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.18131749220192434, "step": 3530 }, { "epoch": 0.0885, "grad_norm": 34.75, "grad_norm_var": 3.252795168997245e+18, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.121107617020607, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.18711038120090961, "step": 3540 }, { "epoch": 0.08875, "grad_norm": 35.25, "grad_norm_var": 26.5384765625, "learning_rate": 0.0001, "loss": 7.5577, "loss/crossentropy": 2.1354643225669863, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18915031235665083, "step": 3550 }, { "epoch": 0.089, "grad_norm": 29.25, "grad_norm_var": 39.177083333333336, "learning_rate": 0.0001, "loss": 7.5603, "loss/crossentropy": 2.111011874675751, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.20049102939665317, "step": 3560 }, { "epoch": 0.08925, "grad_norm": 30.5, "grad_norm_var": 24.4916015625, "learning_rate": 0.0001, "loss": 7.5407, "loss/crossentropy": 2.091498665511608, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19228591658174993, "step": 3570 }, { "epoch": 0.0895, "grad_norm": 30.0, "grad_norm_var": 21.582291666666666, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.1603414684534075, "loss/hidden": 3.586328125, "loss/jsd": 0.0, "loss/logits": 0.22721791528165342, "step": 3580 }, { "epoch": 0.08975, "grad_norm": 29.25, "grad_norm_var": 18.798893229166666, "learning_rate": 0.0001, "loss": 7.5322, "loss/crossentropy": 2.1110543325543403, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.19287437647581102, "step": 3590 }, { "epoch": 0.09, "grad_norm": 40.75, "grad_norm_var": 15.987955729166666, "learning_rate": 0.0001, "loss": 7.55, "loss/crossentropy": 2.211816768348217, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.19003268536180257, "step": 3600 }, { "epoch": 0.09025, "grad_norm": 29.75, "grad_norm_var": 14.333268229166666, "learning_rate": 0.0001, "loss": 7.6044, "loss/crossentropy": 2.2199724197387694, "loss/hidden": 3.45234375, "loss/jsd": 0.0, "loss/logits": 0.19937946014106273, "step": 3610 }, { "epoch": 0.0905, "grad_norm": 29.5, "grad_norm_var": 7.994205729166667, "learning_rate": 0.0001, "loss": 7.5672, "loss/crossentropy": 2.1754990458488463, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.1872939633205533, "step": 3620 }, { "epoch": 0.09075, "grad_norm": 29.625, "grad_norm_var": 8.087955729166667, "learning_rate": 0.0001, "loss": 7.443, "loss/crossentropy": 2.2714238941669462, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.1929878756403923, "step": 3630 }, { "epoch": 0.091, "grad_norm": 30.25, "grad_norm_var": 6.550455729166667, "learning_rate": 0.0001, "loss": 7.5694, "loss/crossentropy": 2.1840985506772994, "loss/hidden": 3.504296875, "loss/jsd": 0.0, "loss/logits": 0.19485698137432336, "step": 3640 }, { "epoch": 0.09125, "grad_norm": 32.25, "grad_norm_var": 7.7494140625, "learning_rate": 0.0001, "loss": 7.5571, "loss/crossentropy": 2.1566817820072175, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.21431526727974415, "step": 3650 }, { "epoch": 0.0915, "grad_norm": 34.75, "grad_norm_var": 5.333268229166666, "learning_rate": 0.0001, "loss": 7.5377, "loss/crossentropy": 2.0471107825636863, "loss/hidden": 3.497265625, "loss/jsd": 0.0, "loss/logits": 0.20124074276536702, "step": 3660 }, { "epoch": 0.09175, "grad_norm": 33.0, "grad_norm_var": 7.4353515625, "learning_rate": 0.0001, "loss": 7.6125, "loss/crossentropy": 2.1804106384515762, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.22469761371612548, "step": 3670 }, { "epoch": 0.092, "grad_norm": 33.5, "grad_norm_var": 4.3572265625, "learning_rate": 0.0001, "loss": 7.6491, "loss/crossentropy": 2.2595307737588883, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.1894306108355522, "step": 3680 }, { "epoch": 0.09225, "grad_norm": 36.25, "grad_norm_var": 8.666666666666666, "learning_rate": 0.0001, "loss": 7.5857, "loss/crossentropy": 2.0454846382141114, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.1902542944997549, "step": 3690 }, { "epoch": 0.0925, "grad_norm": 28.625, "grad_norm_var": 6.204166666666667, "learning_rate": 0.0001, "loss": 7.6065, "loss/crossentropy": 2.1835698932409286, "loss/hidden": 3.4828125, "loss/jsd": 0.0, "loss/logits": 0.20310410112142563, "step": 3700 }, { "epoch": 0.09275, "grad_norm": 35.25, "grad_norm_var": 7.305989583333333, "learning_rate": 0.0001, "loss": 7.6004, "loss/crossentropy": 2.0759357810020447, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.21512960288673638, "step": 3710 }, { "epoch": 0.093, "grad_norm": 38.25, "grad_norm_var": 19.737239583333334, "learning_rate": 0.0001, "loss": 7.6564, "loss/crossentropy": 2.2961436778306963, "loss/hidden": 3.318359375, "loss/jsd": 0.0, "loss/logits": 0.198493617400527, "step": 3720 }, { "epoch": 0.09325, "grad_norm": 30.5, "grad_norm_var": 17.01015625, "learning_rate": 0.0001, "loss": 7.6998, "loss/crossentropy": 2.1192551463842393, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.19986802861094474, "step": 3730 }, { "epoch": 0.0935, "grad_norm": 36.25, "grad_norm_var": 10.20625, "learning_rate": 0.0001, "loss": 7.4855, "loss/crossentropy": 2.0999212980270388, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.19102167561650277, "step": 3740 }, { "epoch": 0.09375, "grad_norm": 33.75, "grad_norm_var": 7.556705729166667, "learning_rate": 0.0001, "loss": 7.6165, "loss/crossentropy": 2.1783443093299866, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.1862858783453703, "step": 3750 }, { "epoch": 0.094, "grad_norm": 28.125, "grad_norm_var": 5.3603515625, "learning_rate": 0.0001, "loss": 7.5372, "loss/crossentropy": 2.0993641003966332, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.17717746701091527, "step": 3760 }, { "epoch": 0.09425, "grad_norm": 30.625, "grad_norm_var": 5.297330729166666, "learning_rate": 0.0001, "loss": 7.614, "loss/crossentropy": 2.1238688945770265, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.20928110517561435, "step": 3770 }, { "epoch": 0.0945, "grad_norm": 29.75, "grad_norm_var": 4.573893229166667, "learning_rate": 0.0001, "loss": 7.5672, "loss/crossentropy": 2.203542584180832, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19581303521990776, "step": 3780 }, { "epoch": 0.09475, "grad_norm": 38.0, "grad_norm_var": 5.322330729166667, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.159272998571396, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.1973846558481455, "step": 3790 }, { "epoch": 0.095, "grad_norm": 30.75, "grad_norm_var": 5.291666666666667, "learning_rate": 0.0001, "loss": 7.5495, "loss/crossentropy": 2.188914805650711, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.2011850569397211, "step": 3800 }, { "epoch": 0.09525, "grad_norm": 31.125, "grad_norm_var": 6.112434895833333, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.1599004954099654, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.21134469993412494, "step": 3810 }, { "epoch": 0.0955, "grad_norm": 35.25, "grad_norm_var": 29.9431640625, "learning_rate": 0.0001, "loss": 7.6779, "loss/crossentropy": 2.1196573287248612, "loss/hidden": 3.576171875, "loss/jsd": 0.0, "loss/logits": 0.2261866919696331, "step": 3820 }, { "epoch": 0.09575, "grad_norm": 37.5, "grad_norm_var": 11.21640625, "learning_rate": 0.0001, "loss": 7.4703, "loss/crossentropy": 2.149521693587303, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.1926643056795001, "step": 3830 }, { "epoch": 0.096, "grad_norm": 32.0, "grad_norm_var": 4.3494140625, "learning_rate": 0.0001, "loss": 7.5513, "loss/crossentropy": 2.1707202911376955, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.19597616009414195, "step": 3840 }, { "epoch": 0.09625, "grad_norm": 29.5, "grad_norm_var": 630.7197916666667, "learning_rate": 0.0001, "loss": 7.6191, "loss/crossentropy": 2.0859180808067324, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.20257378201931714, "step": 3850 }, { "epoch": 0.0965, "grad_norm": 59.5, "grad_norm_var": 100.66223958333333, "learning_rate": 0.0001, "loss": 7.5774, "loss/crossentropy": 2.1815837740898134, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.18438388928771018, "step": 3860 }, { "epoch": 0.09675, "grad_norm": 36.75, "grad_norm_var": 66.29212239583333, "learning_rate": 0.0001, "loss": 7.5808, "loss/crossentropy": 2.0505243610590695, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.18692483827471734, "step": 3870 }, { "epoch": 0.097, "grad_norm": 30.375, "grad_norm_var": 4.266080729166666, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.2033773183822634, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19963842574507, "step": 3880 }, { "epoch": 0.09725, "grad_norm": 32.25, "grad_norm_var": 5.0025390625, "learning_rate": 0.0001, "loss": 7.59, "loss/crossentropy": 2.1328989803791045, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19634215533733368, "step": 3890 }, { "epoch": 0.0975, "grad_norm": 34.0, "grad_norm_var": 2.1322916666666667, "learning_rate": 0.0001, "loss": 7.6407, "loss/crossentropy": 2.170455330610275, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19931643791496753, "step": 3900 }, { "epoch": 0.09775, "grad_norm": 34.75, "grad_norm_var": 3.558333333333333, "learning_rate": 0.0001, "loss": 7.5899, "loss/crossentropy": 2.1301774442195893, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.19248898830264807, "step": 3910 }, { "epoch": 0.098, "grad_norm": 33.75, "grad_norm_var": 3.3478515625, "learning_rate": 0.0001, "loss": 7.6526, "loss/crossentropy": 2.1600559651851654, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.22827934101223946, "step": 3920 }, { "epoch": 0.09825, "grad_norm": 30.75, "grad_norm_var": 6.117643229166666, "learning_rate": 0.0001, "loss": 7.5635, "loss/crossentropy": 2.0728287249803543, "loss/hidden": 3.534765625, "loss/jsd": 0.0, "loss/logits": 0.20216128267347813, "step": 3930 }, { "epoch": 0.0985, "grad_norm": 33.25, "grad_norm_var": 7.5087890625, "learning_rate": 0.0001, "loss": 7.7253, "loss/crossentropy": 2.1514860481023788, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.21096254773437978, "step": 3940 }, { "epoch": 0.09875, "grad_norm": 34.25, "grad_norm_var": 2.6207682291666665, "learning_rate": 0.0001, "loss": 7.538, "loss/crossentropy": 2.169696259498596, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.21556729041039943, "step": 3950 }, { "epoch": 0.099, "grad_norm": 31.25, "grad_norm_var": 5.3150390625, "learning_rate": 0.0001, "loss": 7.54, "loss/crossentropy": 2.1874313950538635, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.19146509394049643, "step": 3960 }, { "epoch": 0.09925, "grad_norm": 32.0, "grad_norm_var": 23.437239583333334, "learning_rate": 0.0001, "loss": 7.6292, "loss/crossentropy": 2.165771406888962, "loss/hidden": 3.592578125, "loss/jsd": 0.0, "loss/logits": 0.20717886611819267, "step": 3970 }, { "epoch": 0.0995, "grad_norm": 31.375, "grad_norm_var": 407.4603515625, "learning_rate": 0.0001, "loss": 7.7177, "loss/crossentropy": 2.1150890797376634, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.19676875434815883, "step": 3980 }, { "epoch": 0.09975, "grad_norm": 33.5, "grad_norm_var": 8.463997395833333, "learning_rate": 0.0001, "loss": 7.6397, "loss/crossentropy": 2.13585125207901, "loss/hidden": 3.597265625, "loss/jsd": 0.0, "loss/logits": 0.2018281053751707, "step": 3990 }, { "epoch": 0.1, "grad_norm": 36.0, "grad_norm_var": 8.787434895833334, "learning_rate": 0.0001, "loss": 7.6957, "loss/crossentropy": 2.062576304376125, "loss/hidden": 3.556640625, "loss/jsd": 0.0, "loss/logits": 0.20351322293281554, "step": 4000 }, { "epoch": 0.10025, "grad_norm": 32.25, "grad_norm_var": 2.6931640625, "learning_rate": 0.0001, "loss": 7.5171, "loss/crossentropy": 2.1093045681715012, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.19258121848106385, "step": 4010 }, { "epoch": 0.1005, "grad_norm": 38.5, "grad_norm_var": 6.976497395833333, "learning_rate": 0.0001, "loss": 7.7046, "loss/crossentropy": 2.1054726734757425, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.18498583231121302, "step": 4020 }, { "epoch": 0.10075, "grad_norm": 32.25, "grad_norm_var": 16.50390625, "learning_rate": 0.0001, "loss": 7.6242, "loss/crossentropy": 2.0566830962896345, "loss/hidden": 3.537890625, "loss/jsd": 0.0, "loss/logits": 0.21257028207182885, "step": 4030 }, { "epoch": 0.101, "grad_norm": 30.125, "grad_norm_var": 21.61640625, "learning_rate": 0.0001, "loss": 7.5611, "loss/crossentropy": 2.0847130313515665, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19317954257130623, "step": 4040 }, { "epoch": 0.10125, "grad_norm": 31.0, "grad_norm_var": 16.408268229166666, "learning_rate": 0.0001, "loss": 7.5456, "loss/crossentropy": 2.116552269458771, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.1943613938987255, "step": 4050 }, { "epoch": 0.1015, "grad_norm": 31.25, "grad_norm_var": 17.984375, "learning_rate": 0.0001, "loss": 7.6259, "loss/crossentropy": 2.2868128657341003, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.2375142715871334, "step": 4060 }, { "epoch": 0.10175, "grad_norm": 29.625, "grad_norm_var": 2.2025390625, "learning_rate": 0.0001, "loss": 7.5637, "loss/crossentropy": 2.092506285011768, "loss/hidden": 3.466796875, "loss/jsd": 0.0, "loss/logits": 0.1903899708762765, "step": 4070 }, { "epoch": 0.102, "grad_norm": 30.25, "grad_norm_var": 55.06764322916667, "learning_rate": 0.0001, "loss": 7.6365, "loss/crossentropy": 2.2538520216941835, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2204372201114893, "step": 4080 }, { "epoch": 0.10225, "grad_norm": 48.0, "grad_norm_var": 66.06555989583333, "learning_rate": 0.0001, "loss": 7.6539, "loss/crossentropy": 2.198161965608597, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18744452036917209, "step": 4090 }, { "epoch": 0.1025, "grad_norm": 31.625, "grad_norm_var": 25.937239583333334, "learning_rate": 0.0001, "loss": 7.5872, "loss/crossentropy": 2.161240801215172, "loss/hidden": 3.548046875, "loss/jsd": 0.0, "loss/logits": 0.19324529767036439, "step": 4100 }, { "epoch": 0.10275, "grad_norm": 31.125, "grad_norm_var": 2.9613932291666667, "learning_rate": 0.0001, "loss": 7.5854, "loss/crossentropy": 2.185439817607403, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.19476189762353896, "step": 4110 }, { "epoch": 0.103, "grad_norm": 29.0, "grad_norm_var": 5.4556640625, "learning_rate": 0.0001, "loss": 7.6728, "loss/crossentropy": 2.1513148337602614, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.20135847330093384, "step": 4120 }, { "epoch": 0.10325, "grad_norm": 36.25, "grad_norm_var": 4.3994140625, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.0776968479156492, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.19831380508840085, "step": 4130 }, { "epoch": 0.1035, "grad_norm": 33.0, "grad_norm_var": 4.205143229166667, "learning_rate": 0.0001, "loss": 7.6157, "loss/crossentropy": 2.0971890702843665, "loss/hidden": 3.64140625, "loss/jsd": 0.0, "loss/logits": 0.2015662420541048, "step": 4140 }, { "epoch": 0.10375, "grad_norm": 35.5, "grad_norm_var": 23.512239583333333, "learning_rate": 0.0001, "loss": 7.6638, "loss/crossentropy": 2.128816670179367, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19698726907372474, "step": 4150 }, { "epoch": 0.104, "grad_norm": 30.75, "grad_norm_var": 22.026822916666667, "learning_rate": 0.0001, "loss": 7.6175, "loss/crossentropy": 2.0965539067983627, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.22271894477307796, "step": 4160 }, { "epoch": 0.10425, "grad_norm": 32.5, "grad_norm_var": 2.426041666666667, "learning_rate": 0.0001, "loss": 7.555, "loss/crossentropy": 2.215752348303795, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.1962002281099558, "step": 4170 }, { "epoch": 0.1045, "grad_norm": 39.5, "grad_norm_var": 23.042708333333334, "learning_rate": 0.0001, "loss": 7.6246, "loss/crossentropy": 2.0542988061904905, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19908196646720172, "step": 4180 }, { "epoch": 0.10475, "grad_norm": 34.75, "grad_norm_var": 6.342643229166667, "learning_rate": 0.0001, "loss": 7.487, "loss/crossentropy": 2.2133218079805372, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.18823296912014484, "step": 4190 }, { "epoch": 0.105, "grad_norm": 31.125, "grad_norm_var": 173.98430989583332, "learning_rate": 0.0001, "loss": 7.643, "loss/crossentropy": 2.1089532509446145, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.20591201409697532, "step": 4200 }, { "epoch": 0.10525, "grad_norm": 36.75, "grad_norm_var": 7.351041666666666, "learning_rate": 0.0001, "loss": 7.5643, "loss/crossentropy": 2.289369744062424, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19611021652817726, "step": 4210 }, { "epoch": 0.1055, "grad_norm": 35.75, "grad_norm_var": 6.539322916666666, "learning_rate": 0.0001, "loss": 7.6756, "loss/crossentropy": 2.1963788866996765, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.19495316371321678, "step": 4220 }, { "epoch": 0.10575, "grad_norm": 31.875, "grad_norm_var": 6.276041666666667, "learning_rate": 0.0001, "loss": 7.5552, "loss/crossentropy": 2.078681927919388, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.1941295877099037, "step": 4230 }, { "epoch": 0.106, "grad_norm": 44.5, "grad_norm_var": 34.80520833333333, "learning_rate": 0.0001, "loss": 7.6072, "loss/crossentropy": 2.222626182436943, "loss/hidden": 3.2828125, "loss/jsd": 0.0, "loss/logits": 0.1848284311592579, "step": 4240 }, { "epoch": 0.10625, "grad_norm": 31.375, "grad_norm_var": 35.084375, "learning_rate": 0.0001, "loss": 7.6435, "loss/crossentropy": 2.152433153986931, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.1829435657709837, "step": 4250 }, { "epoch": 0.1065, "grad_norm": 31.625, "grad_norm_var": 4.431705729166667, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.1493207842111586, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.19450047723948954, "step": 4260 }, { "epoch": 0.10675, "grad_norm": 30.0, "grad_norm_var": 8.024739583333334, "learning_rate": 0.0001, "loss": 7.5423, "loss/crossentropy": 2.0623584628105163, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.19236240349709988, "step": 4270 }, { "epoch": 0.107, "grad_norm": 51.75, "grad_norm_var": 105.76145833333334, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.103591626882553, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.18201838787645103, "step": 4280 }, { "epoch": 0.10725, "grad_norm": 33.25, "grad_norm_var": 144.54973958333332, "learning_rate": 0.0001, "loss": 7.7078, "loss/crossentropy": 2.1076686546206473, "loss/hidden": 3.56328125, "loss/jsd": 0.0, "loss/logits": 0.19488887619227171, "step": 4290 }, { "epoch": 0.1075, "grad_norm": 32.0, "grad_norm_var": 190.62057291666667, "learning_rate": 0.0001, "loss": 7.5811, "loss/crossentropy": 2.168550156056881, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.1908732896670699, "step": 4300 }, { "epoch": 0.10775, "grad_norm": 28.875, "grad_norm_var": 149.1822265625, "learning_rate": 0.0001, "loss": 7.5999, "loss/crossentropy": 2.100285217165947, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.19184609185904264, "step": 4310 }, { "epoch": 0.108, "grad_norm": 44.25, "grad_norm_var": 12.502018229166667, "learning_rate": 0.0001, "loss": 7.7075, "loss/crossentropy": 2.0968768775463102, "loss/hidden": 3.51796875, "loss/jsd": 0.0, "loss/logits": 0.20168912429362534, "step": 4320 }, { "epoch": 0.10825, "grad_norm": 31.25, "grad_norm_var": 12.81875, "learning_rate": 0.0001, "loss": 7.5576, "loss/crossentropy": 2.1037441343069077, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.17891897186636924, "step": 4330 }, { "epoch": 0.1085, "grad_norm": 33.5, "grad_norm_var": 2.700455729166667, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 2.210876139998436, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.19316814988851547, "step": 4340 }, { "epoch": 0.10875, "grad_norm": 31.125, "grad_norm_var": 17.4666015625, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.1331328481435774, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.20047767795622348, "step": 4350 }, { "epoch": 0.109, "grad_norm": 32.75, "grad_norm_var": 3.198372395833333, "learning_rate": 0.0001, "loss": 7.5385, "loss/crossentropy": 2.153067779541016, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20173794813454152, "step": 4360 }, { "epoch": 0.10925, "grad_norm": 32.0, "grad_norm_var": 4.010416666666667, "learning_rate": 0.0001, "loss": 7.5426, "loss/crossentropy": 2.165090653300285, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18712956104427575, "step": 4370 }, { "epoch": 0.1095, "grad_norm": 30.0, "grad_norm_var": 1.77265625, "learning_rate": 0.0001, "loss": 7.605, "loss/crossentropy": 2.217823189496994, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.20003505125641824, "step": 4380 }, { "epoch": 0.10975, "grad_norm": 29.875, "grad_norm_var": 1.7603515625, "learning_rate": 0.0001, "loss": 7.507, "loss/crossentropy": 2.138795481622219, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.1892871480435133, "step": 4390 }, { "epoch": 0.11, "grad_norm": 30.375, "grad_norm_var": 22.449739583333333, "learning_rate": 0.0001, "loss": 7.6965, "loss/crossentropy": 2.22740375995636, "loss/hidden": 3.5578125, "loss/jsd": 0.0, "loss/logits": 0.2264870759099722, "step": 4400 }, { "epoch": 0.11025, "grad_norm": 29.5, "grad_norm_var": 37.71666666666667, "learning_rate": 0.0001, "loss": 7.604, "loss/crossentropy": 2.1781785815954207, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.2012148156762123, "step": 4410 }, { "epoch": 0.1105, "grad_norm": 33.0, "grad_norm_var": 25.305989583333332, "learning_rate": 0.0001, "loss": 7.5767, "loss/crossentropy": 2.0333445832133292, "loss/hidden": 3.55703125, "loss/jsd": 0.0, "loss/logits": 0.1852614250034094, "step": 4420 }, { "epoch": 0.11075, "grad_norm": 32.0, "grad_norm_var": 0.9760416666666667, "learning_rate": 0.0001, "loss": 7.6218, "loss/crossentropy": 2.2101993292570112, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.2069159124046564, "step": 4430 }, { "epoch": 0.111, "grad_norm": 30.5, "grad_norm_var": 7.8837890625, "learning_rate": 0.0001, "loss": 7.6543, "loss/crossentropy": 2.0182371377944945, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.19055260960012674, "step": 4440 }, { "epoch": 0.11125, "grad_norm": 29.0, "grad_norm_var": 18.167643229166668, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 2.209046494960785, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.1926161792129278, "step": 4450 }, { "epoch": 0.1115, "grad_norm": 30.375, "grad_norm_var": 19.9634765625, "learning_rate": 0.0001, "loss": 7.5527, "loss/crossentropy": 2.2265418380498887, "loss/hidden": 3.28671875, "loss/jsd": 0.0, "loss/logits": 0.17907681576907636, "step": 4460 }, { "epoch": 0.11175, "grad_norm": 35.25, "grad_norm_var": 3.0434895833333333, "learning_rate": 0.0001, "loss": 7.6046, "loss/crossentropy": 2.1534146428108216, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.19181067440658808, "step": 4470 }, { "epoch": 0.112, "grad_norm": 31.375, "grad_norm_var": 2.161393229166667, "learning_rate": 0.0001, "loss": 7.5151, "loss/crossentropy": 2.2303753718733788, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19312014058232307, "step": 4480 }, { "epoch": 0.11225, "grad_norm": 31.25, "grad_norm_var": 2.071875, "learning_rate": 0.0001, "loss": 7.5733, "loss/crossentropy": 2.2565354451537134, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.19555974584072827, "step": 4490 }, { "epoch": 0.1125, "grad_norm": 30.125, "grad_norm_var": 6.21015625, "learning_rate": 0.0001, "loss": 7.5721, "loss/crossentropy": 2.1691703468561174, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19526711832731963, "step": 4500 }, { "epoch": 0.11275, "grad_norm": 30.75, "grad_norm_var": 34.985416666666666, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.191486781835556, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.2022854283452034, "step": 4510 }, { "epoch": 0.113, "grad_norm": 32.75, "grad_norm_var": 34.91041666666667, "learning_rate": 0.0001, "loss": 7.566, "loss/crossentropy": 2.07875557243824, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.20517632961273194, "step": 4520 }, { "epoch": 0.11325, "grad_norm": 30.25, "grad_norm_var": 3.1869140625, "learning_rate": 0.0001, "loss": 7.6204, "loss/crossentropy": 2.1490323692560196, "loss/hidden": 3.55390625, "loss/jsd": 0.0, "loss/logits": 0.20650937724858523, "step": 4530 }, { "epoch": 0.1135, "grad_norm": 31.625, "grad_norm_var": 3.6639973958333334, "learning_rate": 0.0001, "loss": 7.5605, "loss/crossentropy": 2.19907369017601, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.18378095962107183, "step": 4540 }, { "epoch": 0.11375, "grad_norm": 33.0, "grad_norm_var": 3.34140625, "learning_rate": 0.0001, "loss": 7.5943, "loss/crossentropy": 2.0509427756071092, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.2012764386832714, "step": 4550 }, { "epoch": 0.114, "grad_norm": 32.25, "grad_norm_var": 55.52916666666667, "learning_rate": 0.0001, "loss": 7.5349, "loss/crossentropy": 2.247987303137779, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.2166461084038019, "step": 4560 }, { "epoch": 0.11425, "grad_norm": 31.0, "grad_norm_var": 59.064518229166666, "learning_rate": 0.0001, "loss": 7.5699, "loss/crossentropy": 2.256947749853134, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.19454225115478038, "step": 4570 }, { "epoch": 0.1145, "grad_norm": 30.5, "grad_norm_var": 12.089583333333334, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.230518189072609, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19083393104374408, "step": 4580 }, { "epoch": 0.11475, "grad_norm": 31.5, "grad_norm_var": 18.3056640625, "learning_rate": 0.0001, "loss": 7.664, "loss/crossentropy": 2.113222661614418, "loss/hidden": 3.51640625, "loss/jsd": 0.0, "loss/logits": 0.20051947552710772, "step": 4590 }, { "epoch": 0.115, "grad_norm": 29.25, "grad_norm_var": 26.7837890625, "learning_rate": 0.0001, "loss": 7.6508, "loss/crossentropy": 2.2963487923145296, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.21824662014842033, "step": 4600 }, { "epoch": 0.11525, "grad_norm": 31.75, "grad_norm_var": 3.249739583333333, "learning_rate": 0.0001, "loss": 7.5491, "loss/crossentropy": 2.2380147099494936, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.2021485272794962, "step": 4610 }, { "epoch": 0.1155, "grad_norm": 34.0, "grad_norm_var": 3.2072265625, "learning_rate": 0.0001, "loss": 7.6035, "loss/crossentropy": 2.1933206588029863, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.2073045803233981, "step": 4620 }, { "epoch": 0.11575, "grad_norm": 30.375, "grad_norm_var": 25.005989583333335, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.3104471057653426, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.20087463557720184, "step": 4630 }, { "epoch": 0.116, "grad_norm": 33.5, "grad_norm_var": 539.2056640625, "learning_rate": 0.0001, "loss": 7.5764, "loss/crossentropy": 2.1786745607852938, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.19096632562577726, "step": 4640 }, { "epoch": 0.11625, "grad_norm": 30.5, "grad_norm_var": 132.26139322916666, "learning_rate": 0.0001, "loss": 7.7424, "loss/crossentropy": 2.1628643572330475, "loss/hidden": 3.526171875, "loss/jsd": 0.0, "loss/logits": 0.1994694285094738, "step": 4650 }, { "epoch": 0.1165, "grad_norm": 40.25, "grad_norm_var": 12.672330729166667, "learning_rate": 0.0001, "loss": 7.6456, "loss/crossentropy": 2.0927803248167036, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.20613454841077328, "step": 4660 }, { "epoch": 0.11675, "grad_norm": 37.25, "grad_norm_var": 7.458072916666667, "learning_rate": 0.0001, "loss": 7.622, "loss/crossentropy": 2.311227411031723, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.21675110273063183, "step": 4670 }, { "epoch": 0.117, "grad_norm": 34.5, "grad_norm_var": 138.99524739583333, "learning_rate": 0.0001, "loss": 7.6633, "loss/crossentropy": 2.1860357582569123, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.20624178424477577, "step": 4680 }, { "epoch": 0.11725, "grad_norm": 38.25, "grad_norm_var": 11.161393229166666, "learning_rate": 0.0001, "loss": 7.654, "loss/crossentropy": 2.246461641788483, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19816880766302347, "step": 4690 }, { "epoch": 0.1175, "grad_norm": 98.5, "grad_norm_var": 275.63645833333334, "learning_rate": 0.0001, "loss": 7.6871, "loss/crossentropy": 2.1662321478128432, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.23473294898867608, "step": 4700 }, { "epoch": 0.11775, "grad_norm": 32.25, "grad_norm_var": 273.496875, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.1411470264196395, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.20816716887056827, "step": 4710 }, { "epoch": 0.118, "grad_norm": 30.625, "grad_norm_var": 1.4697916666666666, "learning_rate": 0.0001, "loss": 7.5106, "loss/crossentropy": 2.184460151195526, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19597234334796668, "step": 4720 }, { "epoch": 0.11825, "grad_norm": 32.75, "grad_norm_var": 151.39368489583333, "learning_rate": 0.0001, "loss": 7.7143, "loss/crossentropy": 2.18603872358799, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.21333505641669034, "step": 4730 }, { "epoch": 0.1185, "grad_norm": 31.125, "grad_norm_var": 41.80774739583333, "learning_rate": 0.0001, "loss": 7.5158, "loss/crossentropy": 2.038896057009697, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18354782909154893, "step": 4740 }, { "epoch": 0.11875, "grad_norm": 31.5, "grad_norm_var": 20.727083333333333, "learning_rate": 0.0001, "loss": 7.6044, "loss/crossentropy": 2.07361024916172, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.2144785810261965, "step": 4750 }, { "epoch": 0.119, "grad_norm": 32.0, "grad_norm_var": 14.9994140625, "learning_rate": 0.0001, "loss": 7.5936, "loss/crossentropy": 2.172766661643982, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.1904382836073637, "step": 4760 }, { "epoch": 0.11925, "grad_norm": 33.0, "grad_norm_var": 11.213541666666666, "learning_rate": 0.0001, "loss": 7.6024, "loss/crossentropy": 2.2863214761018753, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.20077989026904106, "step": 4770 }, { "epoch": 0.1195, "grad_norm": 32.0, "grad_norm_var": 23.308268229166668, "learning_rate": 0.0001, "loss": 7.5743, "loss/crossentropy": 2.172411371767521, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.1879224268719554, "step": 4780 }, { "epoch": 0.11975, "grad_norm": 29.625, "grad_norm_var": 27.662239583333335, "learning_rate": 0.0001, "loss": 7.6003, "loss/crossentropy": 2.061740070581436, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.196690865047276, "step": 4790 }, { "epoch": 0.12, "grad_norm": 34.5, "grad_norm_var": 9.820768229166667, "learning_rate": 0.0001, "loss": 7.5893, "loss/crossentropy": 2.1725818127393723, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18248203694820403, "step": 4800 }, { "epoch": 0.12025, "grad_norm": 31.375, "grad_norm_var": 5.5478515625, "learning_rate": 0.0001, "loss": 7.5542, "loss/crossentropy": 2.0823758363723757, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.19437449853867292, "step": 4810 }, { "epoch": 0.1205, "grad_norm": 31.125, "grad_norm_var": 2.8754557291666667, "learning_rate": 0.0001, "loss": 7.7018, "loss/crossentropy": 2.220066267251968, "loss/hidden": 3.52734375, "loss/jsd": 0.0, "loss/logits": 0.20239269211888314, "step": 4820 }, { "epoch": 0.12075, "grad_norm": 33.25, "grad_norm_var": 6.868489583333333, "learning_rate": 0.0001, "loss": 7.5765, "loss/crossentropy": 2.0765403911471365, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.19827509336173535, "step": 4830 }, { "epoch": 0.121, "grad_norm": 34.0, "grad_norm_var": 27.168489583333333, "learning_rate": 0.0001, "loss": 7.7248, "loss/crossentropy": 2.1397932201623915, "loss/hidden": 3.50546875, "loss/jsd": 0.0, "loss/logits": 0.204028557613492, "step": 4840 }, { "epoch": 0.12125, "grad_norm": 38.0, "grad_norm_var": 22.633072916666666, "learning_rate": 0.0001, "loss": 7.6658, "loss/crossentropy": 2.3317618519067764, "loss/hidden": 3.33046875, "loss/jsd": 0.0, "loss/logits": 0.19565313905477524, "step": 4850 }, { "epoch": 0.1215, "grad_norm": 29.875, "grad_norm_var": 4.6041015625, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.060061091184616, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19083615019917488, "step": 4860 }, { "epoch": 0.12175, "grad_norm": 33.75, "grad_norm_var": 5.06875, "learning_rate": 0.0001, "loss": 7.651, "loss/crossentropy": 2.158045071363449, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.2148456061258912, "step": 4870 }, { "epoch": 0.122, "grad_norm": 30.625, "grad_norm_var": 16.091666666666665, "learning_rate": 0.0001, "loss": 7.5793, "loss/crossentropy": 2.0583921030163763, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.19111349806189537, "step": 4880 }, { "epoch": 0.12225, "grad_norm": 33.25, "grad_norm_var": 17.422330729166667, "learning_rate": 0.0001, "loss": 7.6588, "loss/crossentropy": 2.1186117827892303, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.19436944983899593, "step": 4890 }, { "epoch": 0.1225, "grad_norm": 36.75, "grad_norm_var": 3.2676432291666666, "learning_rate": 0.0001, "loss": 7.4748, "loss/crossentropy": 2.2382855489850044, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.19216692261397839, "step": 4900 }, { "epoch": 0.12275, "grad_norm": 29.375, "grad_norm_var": 31.048958333333335, "learning_rate": 0.0001, "loss": 7.5018, "loss/crossentropy": 2.1136436641216276, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19329534620046615, "step": 4910 }, { "epoch": 0.123, "grad_norm": 40.5, "grad_norm_var": 8.3572265625, "learning_rate": 0.0001, "loss": 7.5661, "loss/crossentropy": 2.043731611967087, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.18812808189541103, "step": 4920 }, { "epoch": 0.12325, "grad_norm": 31.25, "grad_norm_var": 16.280989583333334, "learning_rate": 0.0001, "loss": 7.5565, "loss/crossentropy": 2.129016649723053, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.1941742904484272, "step": 4930 }, { "epoch": 0.1235, "grad_norm": 30.125, "grad_norm_var": 1.4504557291666667, "learning_rate": 0.0001, "loss": 7.5356, "loss/crossentropy": 2.1981059461832047, "loss/hidden": 3.533203125, "loss/jsd": 0.0, "loss/logits": 0.19572316966950892, "step": 4940 }, { "epoch": 0.12375, "grad_norm": 54.5, "grad_norm_var": 36.024739583333336, "learning_rate": 0.0001, "loss": 7.5463, "loss/crossentropy": 2.107844803482294, "loss/hidden": 3.551953125, "loss/jsd": 0.0, "loss/logits": 0.18902508020401002, "step": 4950 }, { "epoch": 0.124, "grad_norm": 33.0, "grad_norm_var": 64.778125, "learning_rate": 0.0001, "loss": 7.6053, "loss/crossentropy": 2.1164773657917975, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.19264463931322098, "step": 4960 }, { "epoch": 0.12425, "grad_norm": 31.5, "grad_norm_var": 42.0666015625, "learning_rate": 0.0001, "loss": 7.5774, "loss/crossentropy": 2.176864555478096, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.20114411041140556, "step": 4970 }, { "epoch": 0.1245, "grad_norm": 32.25, "grad_norm_var": 33.07180989583333, "learning_rate": 0.0001, "loss": 7.5698, "loss/crossentropy": 2.1456878036260605, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.19218399338424205, "step": 4980 }, { "epoch": 0.12475, "grad_norm": 33.25, "grad_norm_var": 9.938997395833333, "learning_rate": 0.0001, "loss": 7.5467, "loss/crossentropy": 2.3402266025543215, "loss/hidden": 3.290625, "loss/jsd": 0.0, "loss/logits": 0.18540082685649395, "step": 4990 }, { "epoch": 0.125, "grad_norm": 32.5, "grad_norm_var": 1.6580729166666666, "learning_rate": 0.0001, "loss": 7.5883, "loss/crossentropy": 2.186999189853668, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.21516974158585073, "step": 5000 }, { "epoch": 0.12525, "grad_norm": 33.5, "grad_norm_var": 4.194205729166667, "learning_rate": 0.0001, "loss": 7.5936, "loss/crossentropy": 2.171919286251068, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19737605061382055, "step": 5010 }, { "epoch": 0.1255, "grad_norm": 32.0, "grad_norm_var": 0.67265625, "learning_rate": 0.0001, "loss": 7.509, "loss/crossentropy": 2.2188323110342028, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.18663883544504642, "step": 5020 }, { "epoch": 0.12575, "grad_norm": 29.75, "grad_norm_var": 10.642643229166667, "learning_rate": 0.0001, "loss": 7.6301, "loss/crossentropy": 2.191204625368118, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.1946074590086937, "step": 5030 }, { "epoch": 0.126, "grad_norm": 31.125, "grad_norm_var": 1.6541015625, "learning_rate": 0.0001, "loss": 7.502, "loss/crossentropy": 2.1200972706079484, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.19346977435052395, "step": 5040 }, { "epoch": 0.12625, "grad_norm": 44.75, "grad_norm_var": 16.30390625, "learning_rate": 0.0001, "loss": 7.5705, "loss/crossentropy": 2.1876573234796526, "loss/hidden": 3.320703125, "loss/jsd": 0.0, "loss/logits": 0.18829105645418168, "step": 5050 }, { "epoch": 0.1265, "grad_norm": 31.875, "grad_norm_var": 20.667708333333334, "learning_rate": 0.0001, "loss": 7.5126, "loss/crossentropy": 2.2233003705739973, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.19192412812262774, "step": 5060 }, { "epoch": 0.12675, "grad_norm": 33.0, "grad_norm_var": 2.3285807291666667, "learning_rate": 0.0001, "loss": 7.5683, "loss/crossentropy": 2.0495440497994424, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.18455710131675004, "step": 5070 }, { "epoch": 0.127, "grad_norm": 31.5, "grad_norm_var": 2.6684895833333333, "learning_rate": 0.0001, "loss": 7.4892, "loss/crossentropy": 2.1829854756593705, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.1906156621873379, "step": 5080 }, { "epoch": 0.12725, "grad_norm": 31.75, "grad_norm_var": 26.134375, "learning_rate": 0.0001, "loss": 7.7141, "loss/crossentropy": 2.150173208117485, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.20331819988787175, "step": 5090 }, { "epoch": 0.1275, "grad_norm": 36.5, "grad_norm_var": 25.376822916666665, "learning_rate": 0.0001, "loss": 7.572, "loss/crossentropy": 2.2022222489118577, "loss/hidden": 3.522265625, "loss/jsd": 0.0, "loss/logits": 0.20444649420678615, "step": 5100 }, { "epoch": 0.12775, "grad_norm": 31.375, "grad_norm_var": 15.917643229166666, "learning_rate": 0.0001, "loss": 7.5598, "loss/crossentropy": 2.0867941707372664, "loss/hidden": 3.56328125, "loss/jsd": 0.0, "loss/logits": 0.19711919017136098, "step": 5110 }, { "epoch": 0.128, "grad_norm": 30.125, "grad_norm_var": 16.92265625, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.1122621968388557, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.1813073743134737, "step": 5120 }, { "epoch": 0.12825, "grad_norm": 30.75, "grad_norm_var": 5.108072916666667, "learning_rate": 0.0001, "loss": 7.5229, "loss/crossentropy": 2.0356661707162855, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.19353711605072021, "step": 5130 }, { "epoch": 0.1285, "grad_norm": 32.5, "grad_norm_var": 34.81920572916667, "learning_rate": 0.0001, "loss": 7.64, "loss/crossentropy": 2.1762280851602553, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.20253173671662808, "step": 5140 }, { "epoch": 0.12875, "grad_norm": 32.25, "grad_norm_var": 51.16223958333333, "learning_rate": 0.0001, "loss": 7.6475, "loss/crossentropy": 2.14297553896904, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.2105777282267809, "step": 5150 }, { "epoch": 0.129, "grad_norm": 34.0, "grad_norm_var": 2.513641882357806e+18, "learning_rate": 0.0001, "loss": 7.6232, "loss/crossentropy": 2.190713110566139, "loss/hidden": 3.689453125, "loss/jsd": 0.0, "loss/logits": 0.2259815253317356, "step": 5160 }, { "epoch": 0.12925, "grad_norm": 50.0, "grad_norm_var": 2.513641882159625e+18, "learning_rate": 0.0001, "loss": 7.5304, "loss/crossentropy": 2.287049275636673, "loss/hidden": 3.22109375, "loss/jsd": 0.0, "loss/logits": 0.1791717953979969, "step": 5170 }, { "epoch": 0.1295, "grad_norm": 32.25, "grad_norm_var": 57.03932291666667, "learning_rate": 0.0001, "loss": 7.5666, "loss/crossentropy": 2.205572660267353, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.1940192885696888, "step": 5180 }, { "epoch": 0.12975, "grad_norm": 33.25, "grad_norm_var": 2.115625, "learning_rate": 0.0001, "loss": 7.5783, "loss/crossentropy": 2.170276886224747, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.19651044271886348, "step": 5190 }, { "epoch": 0.13, "grad_norm": 31.5, "grad_norm_var": 3.549934895833333, "learning_rate": 0.0001, "loss": 7.6073, "loss/crossentropy": 2.2128631293773653, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.19390376433730125, "step": 5200 }, { "epoch": 0.13025, "grad_norm": 35.75, "grad_norm_var": 9.099934895833334, "learning_rate": 0.0001, "loss": 7.575, "loss/crossentropy": 2.171933504939079, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.1874479927122593, "step": 5210 }, { "epoch": 0.1305, "grad_norm": 38.25, "grad_norm_var": 12.2869140625, "learning_rate": 0.0001, "loss": 7.5635, "loss/crossentropy": 2.1169356971979143, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.1857094492763281, "step": 5220 }, { "epoch": 0.13075, "grad_norm": 51.25, "grad_norm_var": 892.175, "learning_rate": 0.0001, "loss": 7.5924, "loss/crossentropy": 2.1072397351264955, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.20098341945558787, "step": 5230 }, { "epoch": 0.131, "grad_norm": 32.5, "grad_norm_var": 895.24140625, "learning_rate": 0.0001, "loss": 7.5454, "loss/crossentropy": 2.260575148463249, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.18017468005418777, "step": 5240 }, { "epoch": 0.13125, "grad_norm": 48.0, "grad_norm_var": 114.225, "learning_rate": 0.0001, "loss": 7.6162, "loss/crossentropy": 2.2057667702436445, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.20519790165126323, "step": 5250 }, { "epoch": 0.1315, "grad_norm": 74.0, "grad_norm_var": 349.00983072916665, "learning_rate": 0.0001, "loss": 7.6702, "loss/crossentropy": 2.154927045106888, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.21574116442352534, "step": 5260 }, { "epoch": 0.13175, "grad_norm": 31.0, "grad_norm_var": 229.61243489583333, "learning_rate": 0.0001, "loss": 7.4891, "loss/crossentropy": 2.107641798257828, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.18632576130330564, "step": 5270 }, { "epoch": 0.132, "grad_norm": 33.75, "grad_norm_var": 7.731705729166666, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.245188394188881, "loss/hidden": 3.26875, "loss/jsd": 0.0, "loss/logits": 0.18824921660125254, "step": 5280 }, { "epoch": 0.13225, "grad_norm": 33.75, "grad_norm_var": 3.63125, "learning_rate": 0.0001, "loss": 7.4886, "loss/crossentropy": 2.2113157629966738, "loss/hidden": 3.294140625, "loss/jsd": 0.0, "loss/logits": 0.1719427563250065, "step": 5290 }, { "epoch": 0.1325, "grad_norm": 31.125, "grad_norm_var": 5.176822916666667, "learning_rate": 0.0001, "loss": 7.5181, "loss/crossentropy": 2.2539247930049897, "loss/hidden": 3.31640625, "loss/jsd": 0.0, "loss/logits": 0.18716043829917908, "step": 5300 }, { "epoch": 0.13275, "grad_norm": 30.5, "grad_norm_var": 5.7791015625, "learning_rate": 0.0001, "loss": 7.4139, "loss/crossentropy": 2.232303848862648, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.18582747615873813, "step": 5310 }, { "epoch": 0.133, "grad_norm": 29.25, "grad_norm_var": 2.792708333333333, "learning_rate": 0.0001, "loss": 7.4902, "loss/crossentropy": 2.2038251549005508, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.19186565633863212, "step": 5320 }, { "epoch": 0.13325, "grad_norm": 31.0, "grad_norm_var": 6.145768229166666, "learning_rate": 0.0001, "loss": 7.568, "loss/crossentropy": 2.258873853087425, "loss/hidden": 3.490625, "loss/jsd": 0.0, "loss/logits": 0.2075445156544447, "step": 5330 }, { "epoch": 0.1335, "grad_norm": 35.25, "grad_norm_var": 2.1952473958333334, "learning_rate": 0.0001, "loss": 7.5231, "loss/crossentropy": 2.1799694120883943, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.18682638984173536, "step": 5340 }, { "epoch": 0.13375, "grad_norm": 30.0, "grad_norm_var": 1.9739583333333333, "learning_rate": 0.0001, "loss": 7.5416, "loss/crossentropy": 2.2539006620645523, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.18906075097620487, "step": 5350 }, { "epoch": 0.134, "grad_norm": 31.125, "grad_norm_var": 2.880989583333333, "learning_rate": 0.0001, "loss": 7.5205, "loss/crossentropy": 2.111976405978203, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.18815945349633695, "step": 5360 }, { "epoch": 0.13425, "grad_norm": 41.75, "grad_norm_var": 8.989322916666667, "learning_rate": 0.0001, "loss": 7.5645, "loss/crossentropy": 2.150078758597374, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.18150232955813408, "step": 5370 }, { "epoch": 0.1345, "grad_norm": 32.25, "grad_norm_var": 9.262239583333333, "learning_rate": 0.0001, "loss": 7.5691, "loss/crossentropy": 2.2475525766611097, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.22073253151029348, "step": 5380 }, { "epoch": 0.13475, "grad_norm": 34.25, "grad_norm_var": 4.981705729166666, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 2.1171315133571627, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20035637486726046, "step": 5390 }, { "epoch": 0.135, "grad_norm": 212.0, "grad_norm_var": 2028.59375, "learning_rate": 0.0001, "loss": 7.6561, "loss/crossentropy": 2.147063474357128, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.20187063701450825, "step": 5400 }, { "epoch": 0.13525, "grad_norm": 28.5, "grad_norm_var": 2002.4431640625, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.248355305194855, "loss/hidden": 3.5234375, "loss/jsd": 0.0, "loss/logits": 0.2067353159189224, "step": 5410 }, { "epoch": 0.1355, "grad_norm": 30.375, "grad_norm_var": 2.59765625, "learning_rate": 0.0001, "loss": 7.5656, "loss/crossentropy": 2.140727072954178, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.20080696120858194, "step": 5420 }, { "epoch": 0.13575, "grad_norm": 30.875, "grad_norm_var": 2.299934895833333, "learning_rate": 0.0001, "loss": 7.6261, "loss/crossentropy": 2.2185936748981474, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1954452872276306, "step": 5430 }, { "epoch": 0.136, "grad_norm": 32.0, "grad_norm_var": 2.5455729166666665, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.0910344183444978, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.1913301758468151, "step": 5440 }, { "epoch": 0.13625, "grad_norm": 34.5, "grad_norm_var": 2.590559895833333, "learning_rate": 0.0001, "loss": 7.5366, "loss/crossentropy": 2.278426119685173, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.1943113673478365, "step": 5450 }, { "epoch": 0.1365, "grad_norm": 31.625, "grad_norm_var": 2.5580729166666667, "learning_rate": 0.0001, "loss": 7.5847, "loss/crossentropy": 2.076927217841148, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.18883342035114764, "step": 5460 }, { "epoch": 0.13675, "grad_norm": 34.75, "grad_norm_var": 28.4197265625, "learning_rate": 0.0001, "loss": 7.6209, "loss/crossentropy": 2.234089860320091, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.21088924966752529, "step": 5470 }, { "epoch": 0.137, "grad_norm": 31.25, "grad_norm_var": 169.49479166666666, "learning_rate": 0.0001, "loss": 7.5375, "loss/crossentropy": 2.117088034749031, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.1874922074377537, "step": 5480 }, { "epoch": 0.13725, "grad_norm": 31.0, "grad_norm_var": 168.70462239583333, "learning_rate": 0.0001, "loss": 7.5082, "loss/crossentropy": 2.1952930808067324, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.19221495129168034, "step": 5490 }, { "epoch": 0.1375, "grad_norm": 30.5, "grad_norm_var": 27.639518229166665, "learning_rate": 0.0001, "loss": 7.4408, "loss/crossentropy": 2.184998545050621, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.17893593553453685, "step": 5500 }, { "epoch": 0.13775, "grad_norm": 32.5, "grad_norm_var": 1.6910807291666667, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.280049467086792, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.21599141787737608, "step": 5510 }, { "epoch": 0.138, "grad_norm": 31.5, "grad_norm_var": 2.730208333333333, "learning_rate": 0.0001, "loss": 7.5417, "loss/crossentropy": 2.159082019329071, "loss/hidden": 3.31640625, "loss/jsd": 0.0, "loss/logits": 0.19252310022711755, "step": 5520 }, { "epoch": 0.13825, "grad_norm": 31.125, "grad_norm_var": 2.520768229166667, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.2485590517520904, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.19249292369931936, "step": 5530 }, { "epoch": 0.1385, "grad_norm": 31.75, "grad_norm_var": 2.939583333333333, "learning_rate": 0.0001, "loss": 7.5843, "loss/crossentropy": 2.0814964517951013, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.19598778411746026, "step": 5540 }, { "epoch": 0.13875, "grad_norm": 30.25, "grad_norm_var": 515.5889973958333, "learning_rate": 0.0001, "loss": 7.6637, "loss/crossentropy": 2.1314420223236086, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20185216665267944, "step": 5550 }, { "epoch": 0.139, "grad_norm": 33.0, "grad_norm_var": 3.4643229166666667, "learning_rate": 0.0001, "loss": 7.5407, "loss/crossentropy": 2.190821570158005, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.1984243031591177, "step": 5560 }, { "epoch": 0.13925, "grad_norm": 30.875, "grad_norm_var": 3.1483723958333334, "learning_rate": 0.0001, "loss": 7.587, "loss/crossentropy": 2.2163518011569976, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19457473792135715, "step": 5570 }, { "epoch": 0.1395, "grad_norm": 28.625, "grad_norm_var": 3.090559895833333, "learning_rate": 0.0001, "loss": 7.5224, "loss/crossentropy": 2.077855309844017, "loss/hidden": 3.551171875, "loss/jsd": 0.0, "loss/logits": 0.19357334338128568, "step": 5580 }, { "epoch": 0.13975, "grad_norm": 31.25, "grad_norm_var": 18.9041015625, "learning_rate": 0.0001, "loss": 7.6508, "loss/crossentropy": 2.0995124436914923, "loss/hidden": 3.54921875, "loss/jsd": 0.0, "loss/logits": 0.20311756529845298, "step": 5590 }, { "epoch": 0.14, "grad_norm": 30.875, "grad_norm_var": 16.024739583333332, "learning_rate": 0.0001, "loss": 7.5486, "loss/crossentropy": 2.1742469370365143, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.2091912193223834, "step": 5600 }, { "epoch": 0.14025, "grad_norm": 31.0, "grad_norm_var": 1.0989583333333333, "learning_rate": 0.0001, "loss": 7.5563, "loss/crossentropy": 2.1435637921094894, "loss/hidden": 3.525390625, "loss/jsd": 0.0, "loss/logits": 0.20397210270166397, "step": 5610 }, { "epoch": 0.1405, "grad_norm": 44.0, "grad_norm_var": 11.0041015625, "learning_rate": 0.0001, "loss": 7.593, "loss/crossentropy": 2.151739400625229, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20091456174850464, "step": 5620 }, { "epoch": 0.14075, "grad_norm": 31.125, "grad_norm_var": 16.405208333333334, "learning_rate": 0.0001, "loss": 7.5337, "loss/crossentropy": 2.2840585201978683, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.21120928078889847, "step": 5630 }, { "epoch": 0.141, "grad_norm": 29.0, "grad_norm_var": 2.7375, "learning_rate": 0.0001, "loss": 7.4997, "loss/crossentropy": 2.041360355913639, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.2038326717913151, "step": 5640 }, { "epoch": 0.14125, "grad_norm": 29.25, "grad_norm_var": 23.368684895833333, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.111186498403549, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.20292142927646636, "step": 5650 }, { "epoch": 0.1415, "grad_norm": 33.5, "grad_norm_var": 4.765559895833333, "learning_rate": 0.0001, "loss": 7.577, "loss/crossentropy": 2.1156825721263885, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.19096483811736106, "step": 5660 }, { "epoch": 0.14175, "grad_norm": 29.625, "grad_norm_var": 6.212434895833334, "learning_rate": 0.0001, "loss": 7.561, "loss/crossentropy": 2.1884111180901527, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.19188922494649888, "step": 5670 }, { "epoch": 0.142, "grad_norm": 31.75, "grad_norm_var": 4.145572916666667, "learning_rate": 0.0001, "loss": 7.5785, "loss/crossentropy": 2.1473594516515733, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.19657318461686374, "step": 5680 }, { "epoch": 0.14225, "grad_norm": 32.75, "grad_norm_var": 9.868489583333334, "learning_rate": 0.0001, "loss": 7.5749, "loss/crossentropy": 2.224351739883423, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19284768104553224, "step": 5690 }, { "epoch": 0.1425, "grad_norm": 29.375, "grad_norm_var": 24.6166015625, "learning_rate": 0.0001, "loss": 7.4841, "loss/crossentropy": 2.0835042744874954, "loss/hidden": 3.505859375, "loss/jsd": 0.0, "loss/logits": 0.1889321893453598, "step": 5700 }, { "epoch": 0.14275, "grad_norm": 32.5, "grad_norm_var": 3.8934895833333334, "learning_rate": 0.0001, "loss": 7.5411, "loss/crossentropy": 2.132970982789993, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.20858928225934506, "step": 5710 }, { "epoch": 0.143, "grad_norm": 33.0, "grad_norm_var": 96.9619140625, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.2334523528814314, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.20192687548696994, "step": 5720 }, { "epoch": 0.14325, "grad_norm": 32.0, "grad_norm_var": 90.8759765625, "learning_rate": 0.0001, "loss": 7.5, "loss/crossentropy": 2.181543472409248, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.20608940124511718, "step": 5730 }, { "epoch": 0.1435, "grad_norm": 29.125, "grad_norm_var": 68.45416666666667, "learning_rate": 0.0001, "loss": 7.6044, "loss/crossentropy": 2.2480223774909973, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.18675435222685338, "step": 5740 }, { "epoch": 0.14375, "grad_norm": 39.25, "grad_norm_var": 94.02057291666667, "learning_rate": 0.0001, "loss": 7.5096, "loss/crossentropy": 2.2112644970417024, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.21228924561291934, "step": 5750 }, { "epoch": 0.144, "grad_norm": 31.5, "grad_norm_var": 2.1797421917742129e+18, "learning_rate": 0.0001, "loss": 7.6201, "loss/crossentropy": 2.2231735616922377, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.19756054263561965, "step": 5760 }, { "epoch": 0.14425, "grad_norm": 38.75, "grad_norm_var": 58.33743489583333, "learning_rate": 0.0001, "loss": 7.5131, "loss/crossentropy": 2.10022853910923, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.1919796233996749, "step": 5770 }, { "epoch": 0.1445, "grad_norm": 34.25, "grad_norm_var": 46.984375, "learning_rate": 0.0001, "loss": 7.6109, "loss/crossentropy": 2.190520279109478, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.20563599281013012, "step": 5780 }, { "epoch": 0.14475, "grad_norm": 32.75, "grad_norm_var": 6.2166015625, "learning_rate": 0.0001, "loss": 7.6612, "loss/crossentropy": 2.0708674401044846, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.20486081317067145, "step": 5790 }, { "epoch": 0.145, "grad_norm": 30.5, "grad_norm_var": 21.637239583333333, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 2.2081795185804367, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.20764970332384108, "step": 5800 }, { "epoch": 0.14525, "grad_norm": 32.25, "grad_norm_var": 2.1119140625, "learning_rate": 0.0001, "loss": 7.5625, "loss/crossentropy": 2.134955820441246, "loss/hidden": 3.32890625, "loss/jsd": 0.0, "loss/logits": 0.18278160132467747, "step": 5810 }, { "epoch": 0.1455, "grad_norm": 32.0, "grad_norm_var": 26.182291666666668, "learning_rate": 0.0001, "loss": 7.5741, "loss/crossentropy": 2.101625883579254, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.21009023115038872, "step": 5820 }, { "epoch": 0.14575, "grad_norm": 32.25, "grad_norm_var": 12.02890625, "learning_rate": 0.0001, "loss": 7.5459, "loss/crossentropy": 2.269702708721161, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1850876223295927, "step": 5830 }, { "epoch": 0.146, "grad_norm": 31.625, "grad_norm_var": 10.504166666666666, "learning_rate": 0.0001, "loss": 7.5756, "loss/crossentropy": 2.1922702878713607, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.1936382047832012, "step": 5840 }, { "epoch": 0.14625, "grad_norm": 34.25, "grad_norm_var": 2.6372395833333333, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 2.0927012979984285, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.200397995300591, "step": 5850 }, { "epoch": 0.1465, "grad_norm": 31.375, "grad_norm_var": 16.130143229166666, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.150560998916626, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.20800711959600449, "step": 5860 }, { "epoch": 0.14675, "grad_norm": 30.875, "grad_norm_var": 15.814583333333333, "learning_rate": 0.0001, "loss": 7.4831, "loss/crossentropy": 2.138367956876755, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.1825962917879224, "step": 5870 }, { "epoch": 0.147, "grad_norm": 31.375, "grad_norm_var": 11.920833333333333, "learning_rate": 0.0001, "loss": 7.5279, "loss/crossentropy": 2.0560067892074585, "loss/hidden": 3.4578125, "loss/jsd": 0.0, "loss/logits": 0.18732867166399955, "step": 5880 }, { "epoch": 0.14725, "grad_norm": 31.625, "grad_norm_var": 11.8087890625, "learning_rate": 0.0001, "loss": 7.4841, "loss/crossentropy": 2.204834724962711, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.18124654777348043, "step": 5890 }, { "epoch": 0.1475, "grad_norm": 29.625, "grad_norm_var": 12.66015625, "learning_rate": 0.0001, "loss": 7.5171, "loss/crossentropy": 2.185581070184708, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19726393837481737, "step": 5900 }, { "epoch": 0.14775, "grad_norm": 29.625, "grad_norm_var": 2.1747395833333334, "learning_rate": 0.0001, "loss": 7.4905, "loss/crossentropy": 2.0858706533908844, "loss/hidden": 3.57734375, "loss/jsd": 0.0, "loss/logits": 0.20108112394809724, "step": 5910 }, { "epoch": 0.148, "grad_norm": 30.0, "grad_norm_var": 3.87265625, "learning_rate": 0.0001, "loss": 7.5801, "loss/crossentropy": 2.11753663122654, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.20907512214034796, "step": 5920 }, { "epoch": 0.14825, "grad_norm": 31.25, "grad_norm_var": 2.6212890625, "learning_rate": 0.0001, "loss": 7.5707, "loss/crossentropy": 2.1879894763231276, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19724611584097146, "step": 5930 }, { "epoch": 0.1485, "grad_norm": 31.625, "grad_norm_var": 5.4134765625, "learning_rate": 0.0001, "loss": 7.5137, "loss/crossentropy": 2.0121699988842012, "loss/hidden": 3.653515625, "loss/jsd": 0.0, "loss/logits": 0.2113606294617057, "step": 5940 }, { "epoch": 0.14875, "grad_norm": 30.375, "grad_norm_var": 7.517643229166667, "learning_rate": 0.0001, "loss": 7.5851, "loss/crossentropy": 2.1624063462018968, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.19148119539022446, "step": 5950 }, { "epoch": 0.149, "grad_norm": 40.5, "grad_norm_var": 18.078580729166667, "learning_rate": 0.0001, "loss": 7.4704, "loss/crossentropy": 2.100359010696411, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.18530675377696754, "step": 5960 }, { "epoch": 0.14925, "grad_norm": 33.0, "grad_norm_var": 106.165625, "learning_rate": 0.0001, "loss": 7.5243, "loss/crossentropy": 2.088457000255585, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.1950376622378826, "step": 5970 }, { "epoch": 0.1495, "grad_norm": 35.5, "grad_norm_var": 110.30618489583334, "learning_rate": 0.0001, "loss": 7.566, "loss/crossentropy": 2.25458045899868, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.192095298320055, "step": 5980 }, { "epoch": 0.14975, "grad_norm": 33.25, "grad_norm_var": 21.141080729166667, "learning_rate": 0.0001, "loss": 7.5733, "loss/crossentropy": 2.2489352226257324, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19246331304311753, "step": 5990 }, { "epoch": 0.15, "grad_norm": 33.25, "grad_norm_var": 1.6747395833333334, "learning_rate": 0.0001, "loss": 7.6863, "loss/crossentropy": 2.0839652568101883, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.20588702652603388, "step": 6000 }, { "epoch": 0.15025, "grad_norm": 29.875, "grad_norm_var": 9.457291666666666, "learning_rate": 0.0001, "loss": 7.5802, "loss/crossentropy": 2.1618823766708375, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.19899031519889832, "step": 6010 }, { "epoch": 0.1505, "grad_norm": 32.5, "grad_norm_var": 10.827018229166667, "learning_rate": 0.0001, "loss": 7.6105, "loss/crossentropy": 2.0799810975790023, "loss/hidden": 3.6, "loss/jsd": 0.0, "loss/logits": 0.20440249070525168, "step": 6020 }, { "epoch": 0.15075, "grad_norm": 30.875, "grad_norm_var": 3.2405598958333335, "learning_rate": 0.0001, "loss": 7.6337, "loss/crossentropy": 2.131713417172432, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.20057708621025086, "step": 6030 }, { "epoch": 0.151, "grad_norm": 31.125, "grad_norm_var": 17.879166666666666, "learning_rate": 0.0001, "loss": 7.7142, "loss/crossentropy": 2.116998878121376, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.2165115473791957, "step": 6040 }, { "epoch": 0.15125, "grad_norm": 31.375, "grad_norm_var": 19.562239583333334, "learning_rate": 0.0001, "loss": 7.5782, "loss/crossentropy": 2.187731945514679, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1970472853630781, "step": 6050 }, { "epoch": 0.1515, "grad_norm": 51.0, "grad_norm_var": 25.872330729166666, "learning_rate": 0.0001, "loss": 7.5339, "loss/crossentropy": 2.0746294870972632, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.18855103328824044, "step": 6060 }, { "epoch": 0.15175, "grad_norm": 31.75, "grad_norm_var": 24.65390625, "learning_rate": 0.0001, "loss": 7.6192, "loss/crossentropy": 2.223821607232094, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.20149823743849993, "step": 6070 }, { "epoch": 0.152, "grad_norm": 32.5, "grad_norm_var": 1.99765625, "learning_rate": 0.0001, "loss": 7.634, "loss/crossentropy": 2.187976914644241, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.19613044820725917, "step": 6080 }, { "epoch": 0.15225, "grad_norm": 32.0, "grad_norm_var": 2.8854166666666665, "learning_rate": 0.0001, "loss": 7.5475, "loss/crossentropy": 2.2121584147214888, "loss/hidden": 3.28828125, "loss/jsd": 0.0, "loss/logits": 0.1814923081547022, "step": 6090 }, { "epoch": 0.1525, "grad_norm": 33.0, "grad_norm_var": 2.9692057291666667, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.189678418636322, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.19301872439682483, "step": 6100 }, { "epoch": 0.15275, "grad_norm": 31.0, "grad_norm_var": 3.457747395833333, "learning_rate": 0.0001, "loss": 7.5315, "loss/crossentropy": 2.1482601583004, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.1997425738722086, "step": 6110 }, { "epoch": 0.153, "grad_norm": 36.0, "grad_norm_var": 4.5072265625, "learning_rate": 0.0001, "loss": 7.571, "loss/crossentropy": 2.1208418533205986, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19502468593418598, "step": 6120 }, { "epoch": 0.15325, "grad_norm": 33.0, "grad_norm_var": 17.070572916666666, "learning_rate": 0.0001, "loss": 7.5239, "loss/crossentropy": 2.160035288333893, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.189887585490942, "step": 6130 }, { "epoch": 0.1535, "grad_norm": 30.625, "grad_norm_var": 22.48515625, "learning_rate": 0.0001, "loss": 7.5566, "loss/crossentropy": 2.173004740476608, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.2055901188403368, "step": 6140 }, { "epoch": 0.15375, "grad_norm": 35.5, "grad_norm_var": 16.00390625, "learning_rate": 0.0001, "loss": 7.6958, "loss/crossentropy": 2.0565848529338835, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.22364541105926036, "step": 6150 }, { "epoch": 0.154, "grad_norm": 32.0, "grad_norm_var": 11.542122395833333, "learning_rate": 0.0001, "loss": 7.6093, "loss/crossentropy": 2.085195133090019, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.19985817223787308, "step": 6160 }, { "epoch": 0.15425, "grad_norm": 29.5, "grad_norm_var": 2.1666666666666665, "learning_rate": 0.0001, "loss": 7.5611, "loss/crossentropy": 2.1293098747730257, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19878034461289645, "step": 6170 }, { "epoch": 0.1545, "grad_norm": 31.25, "grad_norm_var": 20.2822265625, "learning_rate": 0.0001, "loss": 7.6405, "loss/crossentropy": 2.2258552461862564, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.20626316573470832, "step": 6180 }, { "epoch": 0.15475, "grad_norm": 31.25, "grad_norm_var": 20.730208333333334, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.1775721326470374, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.18398564979434012, "step": 6190 }, { "epoch": 0.155, "grad_norm": 32.75, "grad_norm_var": 3.1958333333333333, "learning_rate": 0.0001, "loss": 7.5734, "loss/crossentropy": 2.223562794923782, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.190413929335773, "step": 6200 }, { "epoch": 0.15525, "grad_norm": 30.125, "grad_norm_var": 5.166080729166667, "learning_rate": 0.0001, "loss": 7.6525, "loss/crossentropy": 2.085880035161972, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.1801287617534399, "step": 6210 }, { "epoch": 0.1555, "grad_norm": 30.875, "grad_norm_var": 2.7035807291666667, "learning_rate": 0.0001, "loss": 7.5305, "loss/crossentropy": 2.1930001616477965, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.1861238319426775, "step": 6220 }, { "epoch": 0.15575, "grad_norm": 31.0, "grad_norm_var": 4.468489583333334, "learning_rate": 0.0001, "loss": 7.6889, "loss/crossentropy": 2.22431803047657, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.19135152641683817, "step": 6230 }, { "epoch": 0.156, "grad_norm": 33.75, "grad_norm_var": 4.770572916666667, "learning_rate": 0.0001, "loss": 7.5177, "loss/crossentropy": 2.093687379360199, "loss/hidden": 3.599609375, "loss/jsd": 0.0, "loss/logits": 0.1883873265236616, "step": 6240 }, { "epoch": 0.15625, "grad_norm": 31.5, "grad_norm_var": 1.1872395833333333, "learning_rate": 0.0001, "loss": 7.6073, "loss/crossentropy": 2.2236929804086687, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.213064675219357, "step": 6250 }, { "epoch": 0.1565, "grad_norm": 31.0, "grad_norm_var": 1.5947265625, "learning_rate": 0.0001, "loss": 7.5079, "loss/crossentropy": 2.0539269253611563, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.1872910862788558, "step": 6260 }, { "epoch": 0.15675, "grad_norm": 40.75, "grad_norm_var": 21.492708333333333, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.266616016626358, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.1959751147776842, "step": 6270 }, { "epoch": 0.157, "grad_norm": 31.375, "grad_norm_var": 25.213541666666668, "learning_rate": 0.0001, "loss": 7.6284, "loss/crossentropy": 2.2850263684988024, "loss/hidden": 3.5203125, "loss/jsd": 0.0, "loss/logits": 0.2038384210318327, "step": 6280 }, { "epoch": 0.15725, "grad_norm": 31.75, "grad_norm_var": 2.0010416666666666, "learning_rate": 0.0001, "loss": 7.5067, "loss/crossentropy": 2.122395873069763, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.18144308719784022, "step": 6290 }, { "epoch": 0.1575, "grad_norm": 34.0, "grad_norm_var": 3.7462890625, "learning_rate": 0.0001, "loss": 7.5297, "loss/crossentropy": 1.9984511777758598, "loss/hidden": 3.563671875, "loss/jsd": 0.0, "loss/logits": 0.19643332287669182, "step": 6300 }, { "epoch": 0.15775, "grad_norm": 30.5, "grad_norm_var": 5.603059895833334, "learning_rate": 0.0001, "loss": 7.5455, "loss/crossentropy": 2.029903215169907, "loss/hidden": 3.496484375, "loss/jsd": 0.0, "loss/logits": 0.20556226037442685, "step": 6310 }, { "epoch": 0.158, "grad_norm": 32.0, "grad_norm_var": 35.5853515625, "learning_rate": 0.0001, "loss": 7.593, "loss/crossentropy": 2.19532273709774, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.1972879134118557, "step": 6320 }, { "epoch": 0.15825, "grad_norm": 38.75, "grad_norm_var": 18.0791015625, "learning_rate": 0.0001, "loss": 7.6415, "loss/crossentropy": 2.0865643858909606, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.2021018836647272, "step": 6330 }, { "epoch": 0.1585, "grad_norm": 30.875, "grad_norm_var": 12.580989583333333, "learning_rate": 0.0001, "loss": 7.658, "loss/crossentropy": 2.31261685192585, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.2027669247239828, "step": 6340 }, { "epoch": 0.15875, "grad_norm": 31.875, "grad_norm_var": 4.453580729166666, "learning_rate": 0.0001, "loss": 7.5713, "loss/crossentropy": 2.1463931113481522, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.18860225863754748, "step": 6350 }, { "epoch": 0.159, "grad_norm": 32.25, "grad_norm_var": 2.1, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.134768417477608, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.19954264387488366, "step": 6360 }, { "epoch": 0.15925, "grad_norm": 33.5, "grad_norm_var": 7.157747395833334, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.144196245074272, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.18948693461716176, "step": 6370 }, { "epoch": 0.1595, "grad_norm": 31.375, "grad_norm_var": 1.9546223958333333, "learning_rate": 0.0001, "loss": 7.6119, "loss/crossentropy": 2.147921970486641, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19279659036546945, "step": 6380 }, { "epoch": 0.15975, "grad_norm": 33.25, "grad_norm_var": 1.9129557291666666, "learning_rate": 0.0001, "loss": 7.4797, "loss/crossentropy": 2.1023626953363417, "loss/hidden": 3.486328125, "loss/jsd": 0.0, "loss/logits": 0.1855178650468588, "step": 6390 }, { "epoch": 0.16, "grad_norm": 31.375, "grad_norm_var": 1.7577473958333334, "learning_rate": 0.0001, "loss": 7.5115, "loss/crossentropy": 2.1468859046697615, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.19039052687585353, "step": 6400 }, { "epoch": 0.16025, "grad_norm": 31.875, "grad_norm_var": 2.43515625, "learning_rate": 0.0001, "loss": 7.473, "loss/crossentropy": 2.1335047364234923, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.17829085066914557, "step": 6410 }, { "epoch": 0.1605, "grad_norm": 31.5, "grad_norm_var": 2.027083333333333, "learning_rate": 0.0001, "loss": 7.5477, "loss/crossentropy": 2.0842413723468782, "loss/hidden": 3.537109375, "loss/jsd": 0.0, "loss/logits": 0.20616262052208184, "step": 6420 }, { "epoch": 0.16075, "grad_norm": 33.75, "grad_norm_var": 33.49791666666667, "learning_rate": 0.0001, "loss": 7.5695, "loss/crossentropy": 2.153767225146294, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.20842270255088807, "step": 6430 }, { "epoch": 0.161, "grad_norm": 32.75, "grad_norm_var": 15.109375, "learning_rate": 0.0001, "loss": 7.6661, "loss/crossentropy": 2.0475671708583834, "loss/hidden": 3.6484375, "loss/jsd": 0.0, "loss/logits": 0.2175968911498785, "step": 6440 }, { "epoch": 0.16125, "grad_norm": 32.5, "grad_norm_var": 8402.958268229168, "learning_rate": 0.0001, "loss": 7.6392, "loss/crossentropy": 2.2127752989530562, "loss/hidden": 3.532421875, "loss/jsd": 0.0, "loss/logits": 0.21682111844420432, "step": 6450 }, { "epoch": 0.1615, "grad_norm": 33.25, "grad_norm_var": 24.9375, "learning_rate": 0.0001, "loss": 7.5701, "loss/crossentropy": 2.1261292159557343, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.20704152658581734, "step": 6460 }, { "epoch": 0.16175, "grad_norm": 35.5, "grad_norm_var": 30.0875, "learning_rate": 0.0001, "loss": 7.5408, "loss/crossentropy": 2.124222718179226, "loss/hidden": 3.5984375, "loss/jsd": 0.0, "loss/logits": 0.18728400766849518, "step": 6470 }, { "epoch": 0.162, "grad_norm": 32.25, "grad_norm_var": 3.1145833333333335, "learning_rate": 0.0001, "loss": 7.5831, "loss/crossentropy": 2.2058008939027784, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19860644564032554, "step": 6480 }, { "epoch": 0.16225, "grad_norm": 31.875, "grad_norm_var": 28.412239583333335, "learning_rate": 0.0001, "loss": 7.5668, "loss/crossentropy": 2.1960059702396393, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.1868469040840864, "step": 6490 }, { "epoch": 0.1625, "grad_norm": 32.5, "grad_norm_var": 3.2416015625, "learning_rate": 0.0001, "loss": 7.4873, "loss/crossentropy": 2.180153116583824, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.18321371413767337, "step": 6500 }, { "epoch": 0.16275, "grad_norm": 31.125, "grad_norm_var": 2.113997395833333, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.085334287583828, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.20047369822859765, "step": 6510 }, { "epoch": 0.163, "grad_norm": 33.75, "grad_norm_var": 14.7166015625, "learning_rate": 0.0001, "loss": 7.5356, "loss/crossentropy": 2.081576499342918, "loss/hidden": 3.52890625, "loss/jsd": 0.0, "loss/logits": 0.19775602114386856, "step": 6520 }, { "epoch": 0.16325, "grad_norm": 32.25, "grad_norm_var": 13.976822916666666, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.0600350558757783, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.18539784867316483, "step": 6530 }, { "epoch": 0.1635, "grad_norm": 30.375, "grad_norm_var": 3.846875, "learning_rate": 0.0001, "loss": 7.5135, "loss/crossentropy": 2.117923478782177, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.2071079235523939, "step": 6540 }, { "epoch": 0.16375, "grad_norm": 33.25, "grad_norm_var": 21.288997395833334, "learning_rate": 0.0001, "loss": 7.5807, "loss/crossentropy": 2.087580367922783, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.18587088529020548, "step": 6550 }, { "epoch": 0.164, "grad_norm": 33.0, "grad_norm_var": 5.292708333333334, "learning_rate": 0.0001, "loss": 7.4736, "loss/crossentropy": 2.060797114670277, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.1906617671251297, "step": 6560 }, { "epoch": 0.16425, "grad_norm": 30.375, "grad_norm_var": 1.934375, "learning_rate": 0.0001, "loss": 7.5351, "loss/crossentropy": 2.177751311659813, "loss/hidden": 3.426953125, "loss/jsd": 0.0, "loss/logits": 0.20191191136837006, "step": 6570 }, { "epoch": 0.1645, "grad_norm": 33.0, "grad_norm_var": 1.4184895833333333, "learning_rate": 0.0001, "loss": 7.6058, "loss/crossentropy": 2.1490555882453917, "loss/hidden": 3.52109375, "loss/jsd": 0.0, "loss/logits": 0.19388978108763694, "step": 6580 }, { "epoch": 0.16475, "grad_norm": 29.125, "grad_norm_var": 2.8478515625, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.2254546850919725, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.18252336494624616, "step": 6590 }, { "epoch": 0.165, "grad_norm": 38.5, "grad_norm_var": 207.35201822916667, "learning_rate": 0.0001, "loss": 7.6059, "loss/crossentropy": 2.1390285924077035, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.1844609746709466, "step": 6600 }, { "epoch": 0.16525, "grad_norm": 35.0, "grad_norm_var": 203.78098958333334, "learning_rate": 0.0001, "loss": 7.5921, "loss/crossentropy": 2.092223954200745, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.17789147663861513, "step": 6610 }, { "epoch": 0.1655, "grad_norm": 30.0, "grad_norm_var": 65.76640625, "learning_rate": 0.0001, "loss": 7.558, "loss/crossentropy": 2.138624146580696, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.19836988989263774, "step": 6620 }, { "epoch": 0.16575, "grad_norm": 30.875, "grad_norm_var": 68.97337239583334, "learning_rate": 0.0001, "loss": 7.581, "loss/crossentropy": 2.1231719397008417, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.18597114123404027, "step": 6630 }, { "epoch": 0.166, "grad_norm": 32.25, "grad_norm_var": 125.8041015625, "learning_rate": 0.0001, "loss": 7.5804, "loss/crossentropy": 2.130907243490219, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.17537819929420947, "step": 6640 }, { "epoch": 0.16625, "grad_norm": 40.25, "grad_norm_var": 399.22057291666664, "learning_rate": 0.0001, "loss": 7.6269, "loss/crossentropy": 2.0258643075823786, "loss/hidden": 3.596875, "loss/jsd": 0.0, "loss/logits": 0.23089794162660837, "step": 6650 }, { "epoch": 0.1665, "grad_norm": 31.0, "grad_norm_var": 408.07233072916665, "learning_rate": 0.0001, "loss": 7.4616, "loss/crossentropy": 2.23043432533741, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.19158051013946534, "step": 6660 }, { "epoch": 0.16675, "grad_norm": 32.0, "grad_norm_var": 8.009830729166667, "learning_rate": 0.0001, "loss": 7.5931, "loss/crossentropy": 2.1956572234630585, "loss/hidden": 3.50234375, "loss/jsd": 0.0, "loss/logits": 0.20411487035453318, "step": 6670 }, { "epoch": 0.167, "grad_norm": 34.5, "grad_norm_var": 8.894791666666666, "learning_rate": 0.0001, "loss": 7.6195, "loss/crossentropy": 2.2672122746706007, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.18957087770104408, "step": 6680 }, { "epoch": 0.16725, "grad_norm": 31.0, "grad_norm_var": 66.73333333333333, "learning_rate": 0.0001, "loss": 7.5632, "loss/crossentropy": 2.2279567658901214, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.18730072304606438, "step": 6690 }, { "epoch": 0.1675, "grad_norm": 42.0, "grad_norm_var": 71.18854166666667, "learning_rate": 0.0001, "loss": 7.7089, "loss/crossentropy": 2.282482776045799, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.20141587276011705, "step": 6700 }, { "epoch": 0.16775, "grad_norm": 31.5, "grad_norm_var": 9.892122395833333, "learning_rate": 0.0001, "loss": 7.3758, "loss/crossentropy": 2.2182126119732857, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.1786667076870799, "step": 6710 }, { "epoch": 0.168, "grad_norm": 33.75, "grad_norm_var": 3.6957682291666667, "learning_rate": 0.0001, "loss": 7.5636, "loss/crossentropy": 2.1939273923635483, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18930760622024537, "step": 6720 }, { "epoch": 0.16825, "grad_norm": 32.25, "grad_norm_var": 3.9009765625, "learning_rate": 0.0001, "loss": 7.5783, "loss/crossentropy": 2.0783598124980927, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.18662143610417842, "step": 6730 }, { "epoch": 0.1685, "grad_norm": 35.5, "grad_norm_var": 5.1353515625, "learning_rate": 0.0001, "loss": 7.5009, "loss/crossentropy": 2.217881241440773, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.19149041585624219, "step": 6740 }, { "epoch": 0.16875, "grad_norm": 31.25, "grad_norm_var": 5.264322916666667, "learning_rate": 0.0001, "loss": 7.6346, "loss/crossentropy": 2.1645133450627325, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.20458366964012384, "step": 6750 }, { "epoch": 0.169, "grad_norm": 35.5, "grad_norm_var": 4.1775390625, "learning_rate": 0.0001, "loss": 7.5251, "loss/crossentropy": 2.171855625510216, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.1922192147001624, "step": 6760 }, { "epoch": 0.16925, "grad_norm": 30.25, "grad_norm_var": 50.66223958333333, "learning_rate": 0.0001, "loss": 7.5615, "loss/crossentropy": 2.269451642036438, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19260151647031307, "step": 6770 }, { "epoch": 0.1695, "grad_norm": 32.5, "grad_norm_var": 52.72057291666667, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.2029780715703966, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.18055486269295215, "step": 6780 }, { "epoch": 0.16975, "grad_norm": 32.75, "grad_norm_var": 5.893684895833333, "learning_rate": 0.0001, "loss": 7.6728, "loss/crossentropy": 2.194507023692131, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.21081659942865372, "step": 6790 }, { "epoch": 0.17, "grad_norm": 39.0, "grad_norm_var": 9.028580729166666, "learning_rate": 0.0001, "loss": 7.6095, "loss/crossentropy": 2.228949736058712, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.19297500066459178, "step": 6800 }, { "epoch": 0.17025, "grad_norm": 51.25, "grad_norm_var": 26.128125, "learning_rate": 0.0001, "loss": 7.5173, "loss/crossentropy": 2.0145166903734206, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.19940503742545843, "step": 6810 }, { "epoch": 0.1705, "grad_norm": 30.0, "grad_norm_var": 45.58932291666667, "learning_rate": 0.0001, "loss": 7.5887, "loss/crossentropy": 2.1203667253255842, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.24707065224647523, "step": 6820 }, { "epoch": 0.17075, "grad_norm": 32.5, "grad_norm_var": 13.638997395833334, "learning_rate": 0.0001, "loss": 7.6031, "loss/crossentropy": 2.1143920481204987, "loss/hidden": 3.6578125, "loss/jsd": 0.0, "loss/logits": 0.2371676929295063, "step": 6830 }, { "epoch": 0.171, "grad_norm": 37.75, "grad_norm_var": 11.876822916666667, "learning_rate": 0.0001, "loss": 7.4937, "loss/crossentropy": 2.3085153490304946, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.19867490641772748, "step": 6840 }, { "epoch": 0.17125, "grad_norm": 31.625, "grad_norm_var": 7.063541666666667, "learning_rate": 0.0001, "loss": 7.3836, "loss/crossentropy": 2.2163452029228212, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.19262240454554558, "step": 6850 }, { "epoch": 0.1715, "grad_norm": 29.375, "grad_norm_var": 7.3353515625, "learning_rate": 0.0001, "loss": 7.5696, "loss/crossentropy": 2.1725872844457625, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.18168828263878822, "step": 6860 }, { "epoch": 0.17175, "grad_norm": 33.25, "grad_norm_var": 6.418489583333334, "learning_rate": 0.0001, "loss": 7.5374, "loss/crossentropy": 2.1833366841077804, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.18841250911355018, "step": 6870 }, { "epoch": 0.172, "grad_norm": 36.5, "grad_norm_var": 21.055208333333333, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.166957159340382, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.1870790719985962, "step": 6880 }, { "epoch": 0.17225, "grad_norm": 33.75, "grad_norm_var": 24.2744140625, "learning_rate": 0.0001, "loss": 7.463, "loss/crossentropy": 2.2468337625265122, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.19030643235892059, "step": 6890 }, { "epoch": 0.1725, "grad_norm": 27.875, "grad_norm_var": 5.3275390625, "learning_rate": 0.0001, "loss": 7.453, "loss/crossentropy": 2.1895847231149674, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19813059270381927, "step": 6900 }, { "epoch": 0.17275, "grad_norm": 33.25, "grad_norm_var": 12.654166666666667, "learning_rate": 0.0001, "loss": 7.5002, "loss/crossentropy": 2.2333322286605837, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.20021227821707727, "step": 6910 }, { "epoch": 0.173, "grad_norm": 30.75, "grad_norm_var": 11.69765625, "learning_rate": 0.0001, "loss": 7.4872, "loss/crossentropy": 2.0374642267823218, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.18903901670128107, "step": 6920 }, { "epoch": 0.17325, "grad_norm": 31.875, "grad_norm_var": 2.220572916666667, "learning_rate": 0.0001, "loss": 7.5288, "loss/crossentropy": 2.2860298246145248, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.20242451801896094, "step": 6930 }, { "epoch": 0.1735, "grad_norm": 30.375, "grad_norm_var": 1.9811848958333333, "learning_rate": 0.0001, "loss": 7.6115, "loss/crossentropy": 2.174576237797737, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.20389086604118348, "step": 6940 }, { "epoch": 0.17375, "grad_norm": 29.5, "grad_norm_var": 5.17265625, "learning_rate": 0.0001, "loss": 7.6028, "loss/crossentropy": 2.160076954960823, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.18584340140223504, "step": 6950 }, { "epoch": 0.174, "grad_norm": 32.25, "grad_norm_var": 3.4150390625, "learning_rate": 0.0001, "loss": 7.6249, "loss/crossentropy": 2.2141772389411924, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.2044496938586235, "step": 6960 }, { "epoch": 0.17425, "grad_norm": 30.25, "grad_norm_var": 2.63125, "learning_rate": 0.0001, "loss": 7.5556, "loss/crossentropy": 2.1613443583250045, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.20456040017306804, "step": 6970 }, { "epoch": 0.1745, "grad_norm": 32.0, "grad_norm_var": 1.9184895833333333, "learning_rate": 0.0001, "loss": 7.5504, "loss/crossentropy": 2.179374423623085, "loss/hidden": 3.309375, "loss/jsd": 0.0, "loss/logits": 0.18155127875506877, "step": 6980 }, { "epoch": 0.17475, "grad_norm": 31.125, "grad_norm_var": 1.1302083333333333, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.2060728073120117, "loss/hidden": 3.326171875, "loss/jsd": 0.0, "loss/logits": 0.18002954982221125, "step": 6990 }, { "epoch": 0.175, "grad_norm": 27.875, "grad_norm_var": 2.27265625, "learning_rate": 0.0001, "loss": 7.5016, "loss/crossentropy": 2.1729022413492203, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.20215826146304608, "step": 7000 }, { "epoch": 0.17525, "grad_norm": 32.75, "grad_norm_var": 2.187239583333333, "learning_rate": 0.0001, "loss": 7.5186, "loss/crossentropy": 2.1925390481948854, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.21080133505165577, "step": 7010 }, { "epoch": 0.1755, "grad_norm": 33.0, "grad_norm_var": 3.3681640625, "learning_rate": 0.0001, "loss": 7.6812, "loss/crossentropy": 2.1888799130916596, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.1895390186458826, "step": 7020 }, { "epoch": 0.17575, "grad_norm": 29.0, "grad_norm_var": 3.4625, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.1911296755075456, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.18743323888629676, "step": 7030 }, { "epoch": 0.176, "grad_norm": 36.75, "grad_norm_var": 4.143489583333333, "learning_rate": 0.0001, "loss": 7.5154, "loss/crossentropy": 2.2533592522144317, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.18201965987682342, "step": 7040 }, { "epoch": 0.17625, "grad_norm": 34.0, "grad_norm_var": 3.7556640625, "learning_rate": 0.0001, "loss": 7.4748, "loss/crossentropy": 2.259921830892563, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.1965817864984274, "step": 7050 }, { "epoch": 0.1765, "grad_norm": 32.75, "grad_norm_var": 76.703125, "learning_rate": 0.0001, "loss": 7.6429, "loss/crossentropy": 2.18757144510746, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.18871748261153698, "step": 7060 }, { "epoch": 0.17675, "grad_norm": 29.125, "grad_norm_var": 18.242708333333333, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.1853384137153626, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.18886385671794415, "step": 7070 }, { "epoch": 0.177, "grad_norm": 33.5, "grad_norm_var": 17.101497395833334, "learning_rate": 0.0001, "loss": 7.5441, "loss/crossentropy": 2.1331110268831255, "loss/hidden": 3.295703125, "loss/jsd": 0.0, "loss/logits": 0.1878782594576478, "step": 7080 }, { "epoch": 0.17725, "grad_norm": 35.25, "grad_norm_var": 6.013997395833333, "learning_rate": 0.0001, "loss": 7.463, "loss/crossentropy": 2.252199110388756, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.197531633451581, "step": 7090 }, { "epoch": 0.1775, "grad_norm": 32.75, "grad_norm_var": 6.199739583333334, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.1631226271390913, "loss/hidden": 3.53828125, "loss/jsd": 0.0, "loss/logits": 0.20652975142002106, "step": 7100 }, { "epoch": 0.17775, "grad_norm": 30.875, "grad_norm_var": 2.31015625, "learning_rate": 0.0001, "loss": 7.5643, "loss/crossentropy": 2.153787222504616, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.2014656089246273, "step": 7110 }, { "epoch": 0.178, "grad_norm": 29.875, "grad_norm_var": 4.601822916666666, "learning_rate": 0.0001, "loss": 7.5516, "loss/crossentropy": 2.1199822768568994, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.1905859999358654, "step": 7120 }, { "epoch": 0.17825, "grad_norm": 32.25, "grad_norm_var": 3.4760416666666667, "learning_rate": 0.0001, "loss": 7.5994, "loss/crossentropy": 2.26395897269249, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19645936116576196, "step": 7130 }, { "epoch": 0.1785, "grad_norm": 36.25, "grad_norm_var": 3.8854166666666665, "learning_rate": 0.0001, "loss": 7.5969, "loss/crossentropy": 2.1285897165536882, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.18263876978307964, "step": 7140 }, { "epoch": 0.17875, "grad_norm": 34.0, "grad_norm_var": 4.657291666666667, "learning_rate": 0.0001, "loss": 7.5153, "loss/crossentropy": 2.127173164486885, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.19018456861376762, "step": 7150 }, { "epoch": 0.179, "grad_norm": 31.125, "grad_norm_var": 4.314322916666667, "learning_rate": 0.0001, "loss": 7.5891, "loss/crossentropy": 2.187330016493797, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.19513612650334836, "step": 7160 }, { "epoch": 0.17925, "grad_norm": 32.5, "grad_norm_var": 2.4479166666666665, "learning_rate": 0.0001, "loss": 7.5653, "loss/crossentropy": 2.0964292854070665, "loss/hidden": 3.467578125, "loss/jsd": 0.0, "loss/logits": 0.20541836731135846, "step": 7170 }, { "epoch": 0.1795, "grad_norm": 33.0, "grad_norm_var": 2.4025390625, "learning_rate": 0.0001, "loss": 7.5849, "loss/crossentropy": 2.1456298559904097, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.21056269146502019, "step": 7180 }, { "epoch": 0.17975, "grad_norm": 33.5, "grad_norm_var": 2.78515625, "learning_rate": 0.0001, "loss": 7.6594, "loss/crossentropy": 2.0873524725437163, "loss/hidden": 3.595703125, "loss/jsd": 0.0, "loss/logits": 0.2302736073732376, "step": 7190 }, { "epoch": 0.18, "grad_norm": 32.0, "grad_norm_var": 1.4697916666666666, "learning_rate": 0.0001, "loss": 7.5801, "loss/crossentropy": 2.0368966817855836, "loss/hidden": 3.572265625, "loss/jsd": 0.0, "loss/logits": 0.20361636602319777, "step": 7200 }, { "epoch": 0.18025, "grad_norm": 32.75, "grad_norm_var": 1.9942057291666666, "learning_rate": 0.0001, "loss": 7.6054, "loss/crossentropy": 2.079051211476326, "loss/hidden": 3.55546875, "loss/jsd": 0.0, "loss/logits": 0.21228218004107474, "step": 7210 }, { "epoch": 0.1805, "grad_norm": 33.25, "grad_norm_var": 4.237955729166667, "learning_rate": 0.0001, "loss": 7.6754, "loss/crossentropy": 2.130552776157856, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.1921296551823616, "step": 7220 }, { "epoch": 0.18075, "grad_norm": 30.5, "grad_norm_var": 2.526497395833333, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.0738827764987944, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19692382737994193, "step": 7230 }, { "epoch": 0.181, "grad_norm": 32.75, "grad_norm_var": 2.584375, "learning_rate": 0.0001, "loss": 7.6061, "loss/crossentropy": 2.1579408079385756, "loss/hidden": 3.51953125, "loss/jsd": 0.0, "loss/logits": 0.21399259492754935, "step": 7240 }, { "epoch": 0.18125, "grad_norm": 33.5, "grad_norm_var": 2.8416015625, "learning_rate": 0.0001, "loss": 7.6824, "loss/crossentropy": 2.221544751524925, "loss/hidden": 3.53203125, "loss/jsd": 0.0, "loss/logits": 0.2030305091291666, "step": 7250 }, { "epoch": 0.1815, "grad_norm": 35.5, "grad_norm_var": 1.8848307291666666, "learning_rate": 0.0001, "loss": 7.5215, "loss/crossentropy": 2.180348289012909, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.2144750364124775, "step": 7260 }, { "epoch": 0.18175, "grad_norm": 53.25, "grad_norm_var": 31.4353515625, "learning_rate": 0.0001, "loss": 7.6027, "loss/crossentropy": 2.1652086317539214, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.20979799777269365, "step": 7270 }, { "epoch": 0.182, "grad_norm": 31.375, "grad_norm_var": 31.370768229166668, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.0706035763025286, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.18490511737763882, "step": 7280 }, { "epoch": 0.18225, "grad_norm": 32.25, "grad_norm_var": 8.717643229166667, "learning_rate": 0.0001, "loss": 7.5688, "loss/crossentropy": 2.2949971139431, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.20526532903313638, "step": 7290 }, { "epoch": 0.1825, "grad_norm": 32.0, "grad_norm_var": 4.91640625, "learning_rate": 0.0001, "loss": 7.5752, "loss/crossentropy": 2.2193857818841933, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.1804453806951642, "step": 7300 }, { "epoch": 0.18275, "grad_norm": 31.375, "grad_norm_var": 3.701041666666667, "learning_rate": 0.0001, "loss": 7.6028, "loss/crossentropy": 2.207156080007553, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.2132449522614479, "step": 7310 }, { "epoch": 0.183, "grad_norm": 31.75, "grad_norm_var": 3.289322916666667, "learning_rate": 0.0001, "loss": 7.5818, "loss/crossentropy": 2.271843919157982, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19545750357210637, "step": 7320 }, { "epoch": 0.18325, "grad_norm": 31.25, "grad_norm_var": 1.0124348958333333, "learning_rate": 0.0001, "loss": 7.502, "loss/crossentropy": 2.1379879862070084, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.1861495029181242, "step": 7330 }, { "epoch": 0.1835, "grad_norm": 31.25, "grad_norm_var": 0.7556640625, "learning_rate": 0.0001, "loss": 7.5501, "loss/crossentropy": 2.080298659205437, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.18716856762766837, "step": 7340 }, { "epoch": 0.18375, "grad_norm": 31.25, "grad_norm_var": 2.450455729166667, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 2.0731329679489137, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.17404056414961816, "step": 7350 }, { "epoch": 0.184, "grad_norm": 34.25, "grad_norm_var": 3.0947265625, "learning_rate": 0.0001, "loss": 7.5861, "loss/crossentropy": 2.0820507287979124, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.1926917627453804, "step": 7360 }, { "epoch": 0.18425, "grad_norm": 30.875, "grad_norm_var": 2.1624348958333335, "learning_rate": 0.0001, "loss": 7.5492, "loss/crossentropy": 2.2114842593669892, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.2127979423850775, "step": 7370 }, { "epoch": 0.1845, "grad_norm": 33.5, "grad_norm_var": 4.882291666666666, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.075804352760315, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.18906766772270203, "step": 7380 }, { "epoch": 0.18475, "grad_norm": 35.75, "grad_norm_var": 34.35729166666667, "learning_rate": 0.0001, "loss": 7.6434, "loss/crossentropy": 2.136875703930855, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.18598171565681695, "step": 7390 }, { "epoch": 0.185, "grad_norm": 32.75, "grad_norm_var": 4.624739583333334, "learning_rate": 0.0001, "loss": 7.6191, "loss/crossentropy": 2.170763599872589, "loss/hidden": 3.5109375, "loss/jsd": 0.0, "loss/logits": 0.2151478011161089, "step": 7400 }, { "epoch": 0.18525, "grad_norm": 30.75, "grad_norm_var": 2.051497395833333, "learning_rate": 0.0001, "loss": 7.4116, "loss/crossentropy": 2.143643561005592, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.19506660383194685, "step": 7410 }, { "epoch": 0.1855, "grad_norm": 36.0, "grad_norm_var": 5.089322916666666, "learning_rate": 0.0001, "loss": 7.534, "loss/crossentropy": 2.1735941752791406, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.17996873259544371, "step": 7420 }, { "epoch": 0.18575, "grad_norm": 29.0, "grad_norm_var": 4.81640625, "learning_rate": 0.0001, "loss": 7.5846, "loss/crossentropy": 2.15124252140522, "loss/hidden": 3.45546875, "loss/jsd": 0.0, "loss/logits": 0.1944795411080122, "step": 7430 }, { "epoch": 0.186, "grad_norm": 30.125, "grad_norm_var": 1.7900390625, "learning_rate": 0.0001, "loss": 7.5394, "loss/crossentropy": 2.189228793978691, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.19658807516098023, "step": 7440 }, { "epoch": 0.18625, "grad_norm": 31.25, "grad_norm_var": 1.1572916666666666, "learning_rate": 0.0001, "loss": 7.5156, "loss/crossentropy": 2.1805285453796386, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.20203232783824204, "step": 7450 }, { "epoch": 0.1865, "grad_norm": 32.75, "grad_norm_var": 1.015625, "learning_rate": 0.0001, "loss": 7.5803, "loss/crossentropy": 2.031705692410469, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19695232547819613, "step": 7460 }, { "epoch": 0.18675, "grad_norm": 31.75, "grad_norm_var": 1.6389973958333333, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.0725282967090606, "loss/hidden": 3.53515625, "loss/jsd": 0.0, "loss/logits": 0.19746688194572926, "step": 7470 }, { "epoch": 0.187, "grad_norm": 33.75, "grad_norm_var": 30.643684895833335, "learning_rate": 0.0001, "loss": 7.5746, "loss/crossentropy": 2.2134982645511627, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.2143145205453038, "step": 7480 }, { "epoch": 0.18725, "grad_norm": 28.75, "grad_norm_var": 173.95358072916667, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.2404279142618178, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.18395285904407502, "step": 7490 }, { "epoch": 0.1875, "grad_norm": 33.0, "grad_norm_var": 295.3212890625, "learning_rate": 0.0001, "loss": 7.6232, "loss/crossentropy": 2.1231855720281603, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.19835165105760097, "step": 7500 }, { "epoch": 0.18775, "grad_norm": 33.5, "grad_norm_var": 9.3119140625, "learning_rate": 0.0001, "loss": 7.5797, "loss/crossentropy": 2.1263521701097488, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18789808079600334, "step": 7510 }, { "epoch": 0.188, "grad_norm": 29.75, "grad_norm_var": 30.567708333333332, "learning_rate": 0.0001, "loss": 7.544, "loss/crossentropy": 2.2000538021326066, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.19643234014511107, "step": 7520 }, { "epoch": 0.18825, "grad_norm": 30.75, "grad_norm_var": 26.5056640625, "learning_rate": 0.0001, "loss": 7.4594, "loss/crossentropy": 2.1121160596609116, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.18368990309536457, "step": 7530 }, { "epoch": 0.1885, "grad_norm": 30.75, "grad_norm_var": 2.661393229166667, "learning_rate": 0.0001, "loss": 7.4737, "loss/crossentropy": 2.2245935589075088, "loss/hidden": 3.24140625, "loss/jsd": 0.0, "loss/logits": 0.17950727473944425, "step": 7540 }, { "epoch": 0.18875, "grad_norm": 32.75, "grad_norm_var": 3.11015625, "learning_rate": 0.0001, "loss": 7.5744, "loss/crossentropy": 2.2126697182655333, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.19009165093302727, "step": 7550 }, { "epoch": 0.189, "grad_norm": 34.25, "grad_norm_var": 2.104622395833333, "learning_rate": 0.0001, "loss": 7.6006, "loss/crossentropy": 2.0755111388862133, "loss/hidden": 3.523046875, "loss/jsd": 0.0, "loss/logits": 0.20379672143608332, "step": 7560 }, { "epoch": 0.18925, "grad_norm": 32.5, "grad_norm_var": 16.618684895833333, "learning_rate": 0.0001, "loss": 7.528, "loss/crossentropy": 2.205251544713974, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.18352425061166286, "step": 7570 }, { "epoch": 0.1895, "grad_norm": 29.875, "grad_norm_var": 12.086393229166667, "learning_rate": 0.0001, "loss": 7.4678, "loss/crossentropy": 2.0505823358893394, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.2034569911658764, "step": 7580 }, { "epoch": 0.18975, "grad_norm": 35.0, "grad_norm_var": 15.94140625, "learning_rate": 0.0001, "loss": 7.506, "loss/crossentropy": 2.2195076823234556, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.19098728708922863, "step": 7590 }, { "epoch": 0.19, "grad_norm": 31.375, "grad_norm_var": 16.2853515625, "learning_rate": 0.0001, "loss": 7.4491, "loss/crossentropy": 2.185887323319912, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19378383718430997, "step": 7600 }, { "epoch": 0.19025, "grad_norm": 33.0, "grad_norm_var": 3.0389973958333334, "learning_rate": 0.0001, "loss": 7.5015, "loss/crossentropy": 2.057955393195152, "loss/hidden": 3.536328125, "loss/jsd": 0.0, "loss/logits": 0.20849357955157757, "step": 7610 }, { "epoch": 0.1905, "grad_norm": 31.375, "grad_norm_var": 17.470833333333335, "learning_rate": 0.0001, "loss": 7.6086, "loss/crossentropy": 2.1172866210341454, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.20152031816542149, "step": 7620 }, { "epoch": 0.19075, "grad_norm": 32.5, "grad_norm_var": 20.54765625, "learning_rate": 0.0001, "loss": 7.5686, "loss/crossentropy": 2.2929946899414064, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.18747647628188133, "step": 7630 }, { "epoch": 0.191, "grad_norm": 37.25, "grad_norm_var": 40.75807291666667, "learning_rate": 0.0001, "loss": 7.6427, "loss/crossentropy": 2.180236041545868, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.20883531272411346, "step": 7640 }, { "epoch": 0.19125, "grad_norm": 29.625, "grad_norm_var": 40.85520833333333, "learning_rate": 0.0001, "loss": 7.4917, "loss/crossentropy": 2.1129625350236894, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.18379813842475415, "step": 7650 }, { "epoch": 0.1915, "grad_norm": 39.75, "grad_norm_var": 6.581184895833333, "learning_rate": 0.0001, "loss": 7.5268, "loss/crossentropy": 2.228352552652359, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.2048925407230854, "step": 7660 }, { "epoch": 0.19175, "grad_norm": 34.75, "grad_norm_var": 6.0181640625, "learning_rate": 0.0001, "loss": 7.5951, "loss/crossentropy": 2.2298452496528625, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.21614472325891257, "step": 7670 }, { "epoch": 0.192, "grad_norm": 29.625, "grad_norm_var": 3.6025390625, "learning_rate": 0.0001, "loss": 7.5215, "loss/crossentropy": 2.167386993765831, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.1897386133670807, "step": 7680 }, { "epoch": 0.19225, "grad_norm": 31.25, "grad_norm_var": 1.5754557291666667, "learning_rate": 0.0001, "loss": 7.4768, "loss/crossentropy": 2.1173581033945084, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.19473140574991704, "step": 7690 }, { "epoch": 0.1925, "grad_norm": 38.0, "grad_norm_var": 16.32890625, "learning_rate": 0.0001, "loss": 7.5827, "loss/crossentropy": 2.205728626251221, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.20203282944858075, "step": 7700 }, { "epoch": 0.19275, "grad_norm": 7583301632.0, "grad_norm_var": 3.594153946076571e+18, "learning_rate": 0.0001, "loss": 7.4851, "loss/crossentropy": 2.0537545680999756, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.19121616017073392, "step": 7710 }, { "epoch": 0.193, "grad_norm": 31.5, "grad_norm_var": 3.594153947356253e+18, "learning_rate": 0.0001, "loss": 7.6091, "loss/crossentropy": 2.2010452926158903, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19013969153165816, "step": 7720 }, { "epoch": 0.19325, "grad_norm": 32.25, "grad_norm_var": 127.8494140625, "learning_rate": 0.0001, "loss": 7.5129, "loss/crossentropy": 2.1829321801662447, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18250775039196016, "step": 7730 }, { "epoch": 0.1935, "grad_norm": 32.5, "grad_norm_var": 6.54765625, "learning_rate": 0.0001, "loss": 7.5028, "loss/crossentropy": 2.059948954731226, "loss/hidden": 3.469921875, "loss/jsd": 0.0, "loss/logits": 0.1829567258246243, "step": 7740 }, { "epoch": 0.19375, "grad_norm": 34.25, "grad_norm_var": 65.40305989583334, "learning_rate": 0.0001, "loss": 7.5152, "loss/crossentropy": 2.113444189727306, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19062405563890933, "step": 7750 }, { "epoch": 0.194, "grad_norm": 32.25, "grad_norm_var": 61.19166666666667, "learning_rate": 0.0001, "loss": 7.5399, "loss/crossentropy": 2.0975135535001757, "loss/hidden": 3.5359375, "loss/jsd": 0.0, "loss/logits": 0.21106049697846174, "step": 7760 }, { "epoch": 0.19425, "grad_norm": 55.5, "grad_norm_var": 35.5087890625, "learning_rate": 0.0001, "loss": 7.6208, "loss/crossentropy": 2.168052741885185, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.20290581844747066, "step": 7770 }, { "epoch": 0.1945, "grad_norm": 32.5, "grad_norm_var": 34.890625, "learning_rate": 0.0001, "loss": 7.6052, "loss/crossentropy": 2.203343018889427, "loss/hidden": 3.51953125, "loss/jsd": 0.0, "loss/logits": 0.2262148879468441, "step": 7780 }, { "epoch": 0.19475, "grad_norm": 33.75, "grad_norm_var": 1.90390625, "learning_rate": 0.0001, "loss": 7.5521, "loss/crossentropy": 2.096657195687294, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.19364065770059824, "step": 7790 }, { "epoch": 0.195, "grad_norm": 32.25, "grad_norm_var": 1.4916015625, "learning_rate": 0.0001, "loss": 7.607, "loss/crossentropy": 2.0858742713928224, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.195220067165792, "step": 7800 }, { "epoch": 0.19525, "grad_norm": 30.5, "grad_norm_var": 30.91640625, "learning_rate": 0.0001, "loss": 7.5047, "loss/crossentropy": 2.1515824437141418, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19715734478086233, "step": 7810 }, { "epoch": 0.1955, "grad_norm": 34.75, "grad_norm_var": 29.525455729166666, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.197349172830582, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.1922721391543746, "step": 7820 }, { "epoch": 0.19575, "grad_norm": 29.75, "grad_norm_var": 2.6962890625, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.1433851540088655, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.2196674931794405, "step": 7830 }, { "epoch": 0.196, "grad_norm": 31.625, "grad_norm_var": 4.008072916666666, "learning_rate": 0.0001, "loss": 7.5378, "loss/crossentropy": 2.151882603764534, "loss/hidden": 3.528125, "loss/jsd": 0.0, "loss/logits": 0.2241852417588234, "step": 7840 }, { "epoch": 0.19625, "grad_norm": 31.125, "grad_norm_var": 23.979166666666668, "learning_rate": 0.0001, "loss": 7.4767, "loss/crossentropy": 2.100368928909302, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.18492705803364515, "step": 7850 }, { "epoch": 0.1965, "grad_norm": 35.25, "grad_norm_var": 4.2900390625, "learning_rate": 0.0001, "loss": 7.5525, "loss/crossentropy": 2.1549850702285767, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19525696001946927, "step": 7860 }, { "epoch": 0.19675, "grad_norm": 29.125, "grad_norm_var": 3.1264973958333333, "learning_rate": 0.0001, "loss": 7.4894, "loss/crossentropy": 2.085528630018234, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.19448419120162724, "step": 7870 }, { "epoch": 0.197, "grad_norm": 31.125, "grad_norm_var": 2.0509765625, "learning_rate": 0.0001, "loss": 7.6104, "loss/crossentropy": 2.197956010699272, "loss/hidden": 3.54609375, "loss/jsd": 0.0, "loss/logits": 0.20526408050209283, "step": 7880 }, { "epoch": 0.19725, "grad_norm": 28.5, "grad_norm_var": 4.287955729166667, "learning_rate": 0.0001, "loss": 7.5082, "loss/crossentropy": 2.093092533946037, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18932582587003707, "step": 7890 }, { "epoch": 0.1975, "grad_norm": 31.875, "grad_norm_var": 3.9583333333333335, "learning_rate": 0.0001, "loss": 7.5156, "loss/crossentropy": 2.1209671765565874, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18670345041900874, "step": 7900 }, { "epoch": 0.19775, "grad_norm": 42.75, "grad_norm_var": 14.573893229166666, "learning_rate": 0.0001, "loss": 7.628, "loss/crossentropy": 2.2129906579852103, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.18701824955642224, "step": 7910 }, { "epoch": 0.198, "grad_norm": 29.375, "grad_norm_var": 13.91015625, "learning_rate": 0.0001, "loss": 7.5283, "loss/crossentropy": 2.063839703798294, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.2156384490430355, "step": 7920 }, { "epoch": 0.19825, "grad_norm": 30.0, "grad_norm_var": 13.59140625, "learning_rate": 0.0001, "loss": 7.5872, "loss/crossentropy": 2.150103223323822, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.2093698762357235, "step": 7930 }, { "epoch": 0.1985, "grad_norm": 30.75, "grad_norm_var": 8.720833333333333, "learning_rate": 0.0001, "loss": 7.5207, "loss/crossentropy": 2.1579747438430785, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.20063298791646958, "step": 7940 }, { "epoch": 0.19875, "grad_norm": 29.25, "grad_norm_var": 2.7270182291666667, "learning_rate": 0.0001, "loss": 7.5105, "loss/crossentropy": 2.104949194192886, "loss/hidden": 3.462890625, "loss/jsd": 0.0, "loss/logits": 0.18702716194093227, "step": 7950 }, { "epoch": 0.199, "grad_norm": 31.375, "grad_norm_var": 2.7018229166666665, "learning_rate": 0.0001, "loss": 7.6138, "loss/crossentropy": 2.1542328625917433, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.1854570686817169, "step": 7960 }, { "epoch": 0.19925, "grad_norm": 36.25, "grad_norm_var": 78.99889322916667, "learning_rate": 0.0001, "loss": 7.5929, "loss/crossentropy": 2.196703353524208, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.20412985123693944, "step": 7970 }, { "epoch": 0.1995, "grad_norm": 33.0, "grad_norm_var": 79.22083333333333, "learning_rate": 0.0001, "loss": 7.5479, "loss/crossentropy": 2.1077481478452684, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.20956829860806464, "step": 7980 }, { "epoch": 0.19975, "grad_norm": 30.875, "grad_norm_var": 2.403059895833333, "learning_rate": 0.0001, "loss": 7.5393, "loss/crossentropy": 2.2737906739115714, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.19662595726549625, "step": 7990 }, { "epoch": 0.2, "grad_norm": 31.625, "grad_norm_var": 7.572916666666667, "learning_rate": 0.0001, "loss": 7.576, "loss/crossentropy": 2.187049573659897, "loss/hidden": 3.481640625, "loss/jsd": 0.0, "loss/logits": 0.1929944284260273, "step": 8000 }, { "epoch": 0.20025, "grad_norm": 31.125, "grad_norm_var": 1.81875, "learning_rate": 0.0001, "loss": 7.584, "loss/crossentropy": 2.1443400979042053, "loss/hidden": 3.587890625, "loss/jsd": 0.0, "loss/logits": 0.21116771101951598, "step": 8010 }, { "epoch": 0.2005, "grad_norm": 33.25, "grad_norm_var": 2.2309895833333333, "learning_rate": 0.0001, "loss": 7.619, "loss/crossentropy": 2.1093345403671266, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19506766367703676, "step": 8020 }, { "epoch": 0.20075, "grad_norm": 34.25, "grad_norm_var": 2.2285807291666666, "learning_rate": 0.0001, "loss": 7.6101, "loss/crossentropy": 2.3067246288061143, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.21037033908069133, "step": 8030 }, { "epoch": 0.201, "grad_norm": 31.25, "grad_norm_var": 8.986393229166667, "learning_rate": 0.0001, "loss": 7.5813, "loss/crossentropy": 2.0566743701696395, "loss/hidden": 3.6375, "loss/jsd": 0.0, "loss/logits": 0.21147819980978966, "step": 8040 }, { "epoch": 0.20125, "grad_norm": 30.875, "grad_norm_var": 58.540625, "learning_rate": 0.0001, "loss": 7.6354, "loss/crossentropy": 2.119761574268341, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.21174631416797637, "step": 8050 }, { "epoch": 0.2015, "grad_norm": 31.125, "grad_norm_var": 1.9830729166666667, "learning_rate": 0.0001, "loss": 7.5129, "loss/crossentropy": 2.098930720984936, "loss/hidden": 3.46796875, "loss/jsd": 0.0, "loss/logits": 0.1940738322213292, "step": 8060 }, { "epoch": 0.20175, "grad_norm": 31.125, "grad_norm_var": 1.82265625, "learning_rate": 0.0001, "loss": 7.5747, "loss/crossentropy": 2.1186475455760956, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.19831476360559464, "step": 8070 }, { "epoch": 0.202, "grad_norm": 31.5, "grad_norm_var": 1.5416666666666667, "learning_rate": 0.0001, "loss": 7.5639, "loss/crossentropy": 2.2280248433351515, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.18766701072454453, "step": 8080 }, { "epoch": 0.20225, "grad_norm": 31.375, "grad_norm_var": 29.02265625, "learning_rate": 0.0001, "loss": 7.4765, "loss/crossentropy": 2.135748690366745, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.20170771703124046, "step": 8090 }, { "epoch": 0.2025, "grad_norm": 38.25, "grad_norm_var": 4.775, "learning_rate": 0.0001, "loss": 7.6228, "loss/crossentropy": 2.08721085190773, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.20192455761134626, "step": 8100 }, { "epoch": 0.20275, "grad_norm": 29.5, "grad_norm_var": 15.781184895833333, "learning_rate": 0.0001, "loss": 7.5354, "loss/crossentropy": 2.230104002356529, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.20827409140765668, "step": 8110 }, { "epoch": 0.203, "grad_norm": 30.75, "grad_norm_var": 2.0233723958333334, "learning_rate": 0.0001, "loss": 7.4638, "loss/crossentropy": 2.071944323182106, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.1925355602055788, "step": 8120 }, { "epoch": 0.20325, "grad_norm": 29.375, "grad_norm_var": 2.0625, "learning_rate": 0.0001, "loss": 7.5385, "loss/crossentropy": 2.0866906702518464, "loss/hidden": 3.621484375, "loss/jsd": 0.0, "loss/logits": 0.2152187094092369, "step": 8130 }, { "epoch": 0.2035, "grad_norm": 31.625, "grad_norm_var": 6.661458333333333, "learning_rate": 0.0001, "loss": 7.5217, "loss/crossentropy": 2.1989782720804216, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.1997136753052473, "step": 8140 }, { "epoch": 0.20375, "grad_norm": 30.5, "grad_norm_var": 5.3197265625, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 2.0995738029479982, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.18900278601795434, "step": 8150 }, { "epoch": 0.204, "grad_norm": 30.75, "grad_norm_var": 0.8395833333333333, "learning_rate": 0.0001, "loss": 7.5243, "loss/crossentropy": 2.0287352964282035, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.19610616452991964, "step": 8160 }, { "epoch": 0.20425, "grad_norm": 31.25, "grad_norm_var": 5.362239583333333, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.165475571155548, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.19938987717032433, "step": 8170 }, { "epoch": 0.2045, "grad_norm": 31.375, "grad_norm_var": 1.571875, "learning_rate": 0.0001, "loss": 7.5421, "loss/crossentropy": 2.2981997221708297, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18277304694056512, "step": 8180 }, { "epoch": 0.20475, "grad_norm": 30.375, "grad_norm_var": 4.543489583333334, "learning_rate": 0.0001, "loss": 7.5629, "loss/crossentropy": 2.0597861796617507, "loss/hidden": 3.441015625, "loss/jsd": 0.0, "loss/logits": 0.20162123087793588, "step": 8190 }, { "epoch": 0.205, "grad_norm": 32.0, "grad_norm_var": 2.0572916666666665, "learning_rate": 0.0001, "loss": 7.5376, "loss/crossentropy": 2.1517196536064147, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.19018489569425584, "step": 8200 }, { "epoch": 0.20525, "grad_norm": 29.375, "grad_norm_var": 1.4994140625, "learning_rate": 0.0001, "loss": 7.4639, "loss/crossentropy": 2.2632179886102675, "loss/hidden": 3.25625, "loss/jsd": 0.0, "loss/logits": 0.18489395044744014, "step": 8210 }, { "epoch": 0.2055, "grad_norm": 36.5, "grad_norm_var": 2.61640625, "learning_rate": 0.0001, "loss": 7.7838, "loss/crossentropy": 2.1116667434573175, "loss/hidden": 3.573828125, "loss/jsd": 0.0, "loss/logits": 0.19244479853659868, "step": 8220 }, { "epoch": 0.20575, "grad_norm": 31.25, "grad_norm_var": 1.8582682291666666, "learning_rate": 0.0001, "loss": 7.5409, "loss/crossentropy": 2.1676986277103425, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.19751499071717263, "step": 8230 }, { "epoch": 0.206, "grad_norm": 35.5, "grad_norm_var": 3.9962890625, "learning_rate": 0.0001, "loss": 7.5365, "loss/crossentropy": 2.2388996213674544, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.19586583338677882, "step": 8240 }, { "epoch": 0.20625, "grad_norm": 30.375, "grad_norm_var": 5.229166666666667, "learning_rate": 0.0001, "loss": 7.4937, "loss/crossentropy": 2.173008766770363, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.18703010510653256, "step": 8250 }, { "epoch": 0.2065, "grad_norm": 31.625, "grad_norm_var": 14.615625, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.1825241267681124, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19528221413493158, "step": 8260 }, { "epoch": 0.20675, "grad_norm": 30.375, "grad_norm_var": 8.207747395833334, "learning_rate": 0.0001, "loss": 7.479, "loss/crossentropy": 2.0385641396045684, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.20498643592000007, "step": 8270 }, { "epoch": 0.207, "grad_norm": 36.25, "grad_norm_var": 5.3650390625, "learning_rate": 0.0001, "loss": 7.5835, "loss/crossentropy": 2.2289943635463714, "loss/hidden": 3.53046875, "loss/jsd": 0.0, "loss/logits": 0.20597830023616553, "step": 8280 }, { "epoch": 0.20725, "grad_norm": 32.5, "grad_norm_var": 72.63723958333334, "learning_rate": 0.0001, "loss": 7.4999, "loss/crossentropy": 2.201861135661602, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.1963998381048441, "step": 8290 }, { "epoch": 0.2075, "grad_norm": 33.5, "grad_norm_var": 75.33932291666666, "learning_rate": 0.0001, "loss": 7.4931, "loss/crossentropy": 2.0565819829702376, "loss/hidden": 3.587890625, "loss/jsd": 0.0, "loss/logits": 0.2153871137648821, "step": 8300 }, { "epoch": 0.20775, "grad_norm": 33.75, "grad_norm_var": 2.57265625, "learning_rate": 0.0001, "loss": 7.5969, "loss/crossentropy": 2.2017314821481704, "loss/hidden": 3.509765625, "loss/jsd": 0.0, "loss/logits": 0.20366298444569111, "step": 8310 }, { "epoch": 0.208, "grad_norm": 31.5, "grad_norm_var": 93.0625, "learning_rate": 0.0001, "loss": 7.5622, "loss/crossentropy": 2.139016662538052, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.18943111039698124, "step": 8320 }, { "epoch": 0.20825, "grad_norm": 48.25, "grad_norm_var": 20.628125, "learning_rate": 0.0001, "loss": 7.4914, "loss/crossentropy": 2.1512654572725296, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.20672829449176788, "step": 8330 }, { "epoch": 0.2085, "grad_norm": 32.25, "grad_norm_var": 19.8759765625, "learning_rate": 0.0001, "loss": 7.5845, "loss/crossentropy": 2.0606055706739426, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.20263293255120515, "step": 8340 }, { "epoch": 0.20875, "grad_norm": 31.375, "grad_norm_var": 55.4369140625, "learning_rate": 0.0001, "loss": 7.4669, "loss/crossentropy": 2.074323023855686, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.17513209469616414, "step": 8350 }, { "epoch": 0.209, "grad_norm": 29.5, "grad_norm_var": 54.5603515625, "learning_rate": 0.0001, "loss": 7.5259, "loss/crossentropy": 2.197512632608414, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.20681734681129454, "step": 8360 }, { "epoch": 0.20925, "grad_norm": 29.625, "grad_norm_var": 3.138541666666667, "learning_rate": 0.0001, "loss": 7.58, "loss/crossentropy": 2.2005858927965165, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.2138770181685686, "step": 8370 }, { "epoch": 0.2095, "grad_norm": 33.0, "grad_norm_var": 7.381184895833333, "learning_rate": 0.0001, "loss": 7.604, "loss/crossentropy": 2.2078860282897947, "loss/hidden": 3.570703125, "loss/jsd": 0.0, "loss/logits": 0.21495202817022802, "step": 8380 }, { "epoch": 0.20975, "grad_norm": 31.875, "grad_norm_var": 6.4931640625, "learning_rate": 0.0001, "loss": 7.5081, "loss/crossentropy": 2.1586494892835617, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.19594298098236324, "step": 8390 }, { "epoch": 0.21, "grad_norm": 32.5, "grad_norm_var": 2.05390625, "learning_rate": 0.0001, "loss": 7.5932, "loss/crossentropy": 2.1475181549787523, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19644266795367002, "step": 8400 }, { "epoch": 0.21025, "grad_norm": 32.5, "grad_norm_var": 1.1671223958333334, "learning_rate": 0.0001, "loss": 7.4963, "loss/crossentropy": 2.2497402161359785, "loss/hidden": 3.29296875, "loss/jsd": 0.0, "loss/logits": 0.20083448998630046, "step": 8410 }, { "epoch": 0.2105, "grad_norm": 36.0, "grad_norm_var": 4.269205729166667, "learning_rate": 0.0001, "loss": 7.4909, "loss/crossentropy": 2.1163347721099854, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.19131154641509057, "step": 8420 }, { "epoch": 0.21075, "grad_norm": 32.25, "grad_norm_var": 5.568489583333333, "learning_rate": 0.0001, "loss": 7.5108, "loss/crossentropy": 2.1301738530397416, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.18502578027546407, "step": 8430 }, { "epoch": 0.211, "grad_norm": 30.625, "grad_norm_var": 15.867708333333333, "learning_rate": 0.0001, "loss": 7.5275, "loss/crossentropy": 2.127982833981514, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.2027182461693883, "step": 8440 }, { "epoch": 0.21125, "grad_norm": 29.625, "grad_norm_var": 18.677018229166666, "learning_rate": 0.0001, "loss": 7.5249, "loss/crossentropy": 2.1977096766233446, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20560352243483065, "step": 8450 }, { "epoch": 0.2115, "grad_norm": 29.75, "grad_norm_var": 16.110872395833333, "learning_rate": 0.0001, "loss": 7.4708, "loss/crossentropy": 2.1555270701646805, "loss/hidden": 3.3078125, "loss/jsd": 0.0, "loss/logits": 0.18343626484274864, "step": 8460 }, { "epoch": 0.21175, "grad_norm": 29.5, "grad_norm_var": 25.607291666666665, "learning_rate": 0.0001, "loss": 7.4635, "loss/crossentropy": 2.1936387956142425, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.21095133386552334, "step": 8470 }, { "epoch": 0.212, "grad_norm": 33.0, "grad_norm_var": 16.8197265625, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.231749877333641, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.1943896021693945, "step": 8480 }, { "epoch": 0.21225, "grad_norm": 31.75, "grad_norm_var": 7.643489583333333, "learning_rate": 0.0001, "loss": 7.5582, "loss/crossentropy": 2.171339076757431, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.20933733694255352, "step": 8490 }, { "epoch": 0.2125, "grad_norm": 36.5, "grad_norm_var": 3.863541666666667, "learning_rate": 0.0001, "loss": 7.5613, "loss/crossentropy": 2.170750407129526, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.19806241411715747, "step": 8500 }, { "epoch": 0.21275, "grad_norm": 30.625, "grad_norm_var": 6.814518229166667, "learning_rate": 0.0001, "loss": 7.475, "loss/crossentropy": 2.1906991213560105, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19974222220480442, "step": 8510 }, { "epoch": 0.213, "grad_norm": 31.75, "grad_norm_var": 1.0955729166666666, "learning_rate": 0.0001, "loss": 7.5352, "loss/crossentropy": 2.101140005886555, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.20742072239518167, "step": 8520 }, { "epoch": 0.21325, "grad_norm": 33.0, "grad_norm_var": 2.234375, "learning_rate": 0.0001, "loss": 7.6097, "loss/crossentropy": 2.142820453643799, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.2011982973664999, "step": 8530 }, { "epoch": 0.2135, "grad_norm": 30.75, "grad_norm_var": 4.988541666666666, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.274955728650093, "loss/hidden": 3.254296875, "loss/jsd": 0.0, "loss/logits": 0.18379125297069548, "step": 8540 }, { "epoch": 0.21375, "grad_norm": 29.875, "grad_norm_var": 5.245572916666666, "learning_rate": 0.0001, "loss": 7.4441, "loss/crossentropy": 2.2132862359285355, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.20189897604286672, "step": 8550 }, { "epoch": 0.214, "grad_norm": 29.375, "grad_norm_var": 1.3567057291666667, "learning_rate": 0.0001, "loss": 7.5183, "loss/crossentropy": 2.1324101746082307, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.2091756235808134, "step": 8560 }, { "epoch": 0.21425, "grad_norm": 37.25, "grad_norm_var": 26.8353515625, "learning_rate": 0.0001, "loss": 7.5208, "loss/crossentropy": 2.2107133328914643, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.19691390544176102, "step": 8570 }, { "epoch": 0.2145, "grad_norm": 32.25, "grad_norm_var": 8.3525390625, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.1903372198343276, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.21582610420882703, "step": 8580 }, { "epoch": 0.21475, "grad_norm": 31.5, "grad_norm_var": 1.2587890625, "learning_rate": 0.0001, "loss": 7.5885, "loss/crossentropy": 2.1499848544597624, "loss/hidden": 3.508984375, "loss/jsd": 0.0, "loss/logits": 0.2200299922376871, "step": 8590 }, { "epoch": 0.215, "grad_norm": 44.75, "grad_norm_var": 11.795833333333333, "learning_rate": 0.0001, "loss": 7.5716, "loss/crossentropy": 2.2607037901878355, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19387190453708172, "step": 8600 }, { "epoch": 0.21525, "grad_norm": 31.875, "grad_norm_var": 13.584830729166667, "learning_rate": 0.0001, "loss": 7.4842, "loss/crossentropy": 2.2367573767900466, "loss/hidden": 3.291796875, "loss/jsd": 0.0, "loss/logits": 0.1797945935279131, "step": 8610 }, { "epoch": 0.2155, "grad_norm": 31.0, "grad_norm_var": 6.67890625, "learning_rate": 0.0001, "loss": 7.5594, "loss/crossentropy": 2.132373479008675, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19787863213568926, "step": 8620 }, { "epoch": 0.21575, "grad_norm": 32.75, "grad_norm_var": 1.5483723958333333, "learning_rate": 0.0001, "loss": 7.5224, "loss/crossentropy": 2.1131544440984724, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.18490277007222175, "step": 8630 }, { "epoch": 0.216, "grad_norm": 28.125, "grad_norm_var": 2.9827473958333335, "learning_rate": 0.0001, "loss": 7.5298, "loss/crossentropy": 2.1318382740020754, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.20170295760035514, "step": 8640 }, { "epoch": 0.21625, "grad_norm": 29.5, "grad_norm_var": 2.6635416666666667, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.143068727850914, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.2085008706897497, "step": 8650 }, { "epoch": 0.2165, "grad_norm": 31.75, "grad_norm_var": 4.72265625, "learning_rate": 0.0001, "loss": 7.583, "loss/crossentropy": 2.191588431596756, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19564641676843167, "step": 8660 }, { "epoch": 0.21675, "grad_norm": 36.75, "grad_norm_var": 5.292122395833333, "learning_rate": 0.0001, "loss": 7.5915, "loss/crossentropy": 2.1791785418987275, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.19772212151437998, "step": 8670 }, { "epoch": 0.217, "grad_norm": 29.75, "grad_norm_var": 4.042708333333334, "learning_rate": 0.0001, "loss": 7.496, "loss/crossentropy": 2.1210457414388655, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.20252102315425874, "step": 8680 }, { "epoch": 0.21725, "grad_norm": 30.25, "grad_norm_var": 2.02265625, "learning_rate": 0.0001, "loss": 7.4529, "loss/crossentropy": 2.1803869009017944, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.19335724450647832, "step": 8690 }, { "epoch": 0.2175, "grad_norm": 32.5, "grad_norm_var": 3.9624348958333333, "learning_rate": 0.0001, "loss": 7.5914, "loss/crossentropy": 2.134222483634949, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.19238188080489635, "step": 8700 }, { "epoch": 0.21775, "grad_norm": 32.75, "grad_norm_var": 6.13515625, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.1801154255867004, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.19446788169443607, "step": 8710 }, { "epoch": 0.218, "grad_norm": 30.0, "grad_norm_var": 3.06015625, "learning_rate": 0.0001, "loss": 7.561, "loss/crossentropy": 2.1410862773656847, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.20139614418148993, "step": 8720 }, { "epoch": 0.21825, "grad_norm": 33.5, "grad_norm_var": 2.0707682291666667, "learning_rate": 0.0001, "loss": 7.527, "loss/crossentropy": 2.137886345386505, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.2020585484802723, "step": 8730 }, { "epoch": 0.2185, "grad_norm": 31.125, "grad_norm_var": 2.278125, "learning_rate": 0.0001, "loss": 7.5367, "loss/crossentropy": 2.1495157063007353, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.2042895916849375, "step": 8740 }, { "epoch": 0.21875, "grad_norm": 31.25, "grad_norm_var": 4.108072916666667, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 2.2290263891220095, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.18052256256341934, "step": 8750 }, { "epoch": 0.219, "grad_norm": 30.75, "grad_norm_var": 3.0518229166666666, "learning_rate": 0.0001, "loss": 7.5422, "loss/crossentropy": 2.204638335108757, "loss/hidden": 3.273046875, "loss/jsd": 0.0, "loss/logits": 0.1816064776852727, "step": 8760 }, { "epoch": 0.21925, "grad_norm": 33.75, "grad_norm_var": 1.0259765625, "learning_rate": 0.0001, "loss": 7.5936, "loss/crossentropy": 2.173864471912384, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.2168831005692482, "step": 8770 }, { "epoch": 0.2195, "grad_norm": 33.0, "grad_norm_var": 1.4957682291666667, "learning_rate": 0.0001, "loss": 7.6125, "loss/crossentropy": 2.2631371796131132, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.1991407833993435, "step": 8780 }, { "epoch": 0.21975, "grad_norm": 30.625, "grad_norm_var": 1.62890625, "learning_rate": 0.0001, "loss": 7.486, "loss/crossentropy": 2.156681847572327, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.18595759831368924, "step": 8790 }, { "epoch": 0.22, "grad_norm": 33.5, "grad_norm_var": 1.2202473958333333, "learning_rate": 0.0001, "loss": 7.4714, "loss/crossentropy": 2.069344013929367, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.19667117949575186, "step": 8800 }, { "epoch": 0.22025, "grad_norm": 30.625, "grad_norm_var": 3.7875, "learning_rate": 0.0001, "loss": 7.6153, "loss/crossentropy": 2.159384399652481, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.19717039167881012, "step": 8810 }, { "epoch": 0.2205, "grad_norm": 31.25, "grad_norm_var": 2.7080729166666666, "learning_rate": 0.0001, "loss": 7.5037, "loss/crossentropy": 2.06855808198452, "loss/hidden": 3.596875, "loss/jsd": 0.0, "loss/logits": 0.19925107434391975, "step": 8820 }, { "epoch": 0.22075, "grad_norm": 33.0, "grad_norm_var": 14.142122395833333, "learning_rate": 0.0001, "loss": 7.61, "loss/crossentropy": 2.1856627821922303, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.19198091998696326, "step": 8830 }, { "epoch": 0.221, "grad_norm": 29.75, "grad_norm_var": 23.12265625, "learning_rate": 0.0001, "loss": 7.5265, "loss/crossentropy": 2.215584135055542, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.193273763731122, "step": 8840 }, { "epoch": 0.22125, "grad_norm": 32.5, "grad_norm_var": 23.365625, "learning_rate": 0.0001, "loss": 7.6189, "loss/crossentropy": 2.217027261853218, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.1961042732000351, "step": 8850 }, { "epoch": 0.2215, "grad_norm": 31.25, "grad_norm_var": 2.9160807291666666, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 2.1981815114617347, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.19394037276506423, "step": 8860 }, { "epoch": 0.22175, "grad_norm": 27.75, "grad_norm_var": 9.90390625, "learning_rate": 0.0001, "loss": 7.4553, "loss/crossentropy": 2.112216001749039, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.18079342246055602, "step": 8870 }, { "epoch": 0.222, "grad_norm": 31.375, "grad_norm_var": 7.8384765625, "learning_rate": 0.0001, "loss": 7.4818, "loss/crossentropy": 2.101804518699646, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.1867048390209675, "step": 8880 }, { "epoch": 0.22225, "grad_norm": 32.25, "grad_norm_var": 9.934309895833334, "learning_rate": 0.0001, "loss": 7.5937, "loss/crossentropy": 2.193597176671028, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.20191702879965306, "step": 8890 }, { "epoch": 0.2225, "grad_norm": 30.0, "grad_norm_var": 32.552083333333336, "learning_rate": 0.0001, "loss": 7.4298, "loss/crossentropy": 2.2074760258197785, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19682303816080093, "step": 8900 }, { "epoch": 0.22275, "grad_norm": 36.0, "grad_norm_var": 28.680143229166667, "learning_rate": 0.0001, "loss": 7.5468, "loss/crossentropy": 2.008703652024269, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.2202893177047372, "step": 8910 }, { "epoch": 0.223, "grad_norm": 33.25, "grad_norm_var": 10.238541666666666, "learning_rate": 0.0001, "loss": 7.5334, "loss/crossentropy": 2.0829237014055253, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.1849207304418087, "step": 8920 }, { "epoch": 0.22325, "grad_norm": 33.25, "grad_norm_var": 3.3676432291666667, "learning_rate": 0.0001, "loss": 7.5348, "loss/crossentropy": 2.1565073817968368, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.19267312660813332, "step": 8930 }, { "epoch": 0.2235, "grad_norm": 42.5, "grad_norm_var": 14.748372395833334, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.0750382035970687, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.1899881549179554, "step": 8940 }, { "epoch": 0.22375, "grad_norm": 35.5, "grad_norm_var": 12.167122395833333, "learning_rate": 0.0001, "loss": 7.6047, "loss/crossentropy": 2.1998844176530836, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.1992990154772997, "step": 8950 }, { "epoch": 0.224, "grad_norm": 39.5, "grad_norm_var": 8.703125, "learning_rate": 0.0001, "loss": 7.5236, "loss/crossentropy": 2.1398618072271347, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.2044746194034815, "step": 8960 }, { "epoch": 0.22425, "grad_norm": 29.5, "grad_norm_var": 7.711458333333334, "learning_rate": 0.0001, "loss": 7.5588, "loss/crossentropy": 2.0922764956951143, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.18962055966258048, "step": 8970 }, { "epoch": 0.2245, "grad_norm": 38.25, "grad_norm_var": 8.417122395833333, "learning_rate": 0.0001, "loss": 7.5705, "loss/crossentropy": 2.2088142544031144, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.2040243223309517, "step": 8980 }, { "epoch": 0.22475, "grad_norm": 31.25, "grad_norm_var": 6.708268229166666, "learning_rate": 0.0001, "loss": 7.5883, "loss/crossentropy": 2.0866949379444124, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.21130447909235955, "step": 8990 }, { "epoch": 0.225, "grad_norm": 28.375, "grad_norm_var": 7.254166666666666, "learning_rate": 0.0001, "loss": 7.5008, "loss/crossentropy": 2.117489975690842, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.18843676745891572, "step": 9000 }, { "epoch": 0.22525, "grad_norm": 32.5, "grad_norm_var": 18.155143229166665, "learning_rate": 0.0001, "loss": 7.4718, "loss/crossentropy": 2.1349531918764115, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.18441876713186503, "step": 9010 }, { "epoch": 0.2255, "grad_norm": 30.25, "grad_norm_var": 7.642708333333333, "learning_rate": 0.0001, "loss": 7.4722, "loss/crossentropy": 2.158918860554695, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.1831710446625948, "step": 9020 }, { "epoch": 0.22575, "grad_norm": 30.625, "grad_norm_var": 5.802083333333333, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 2.1577743917703627, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19996861293911933, "step": 9030 }, { "epoch": 0.226, "grad_norm": 35.75, "grad_norm_var": 6.050455729166667, "learning_rate": 0.0001, "loss": 7.54, "loss/crossentropy": 2.1675800561904905, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.19423422291874887, "step": 9040 }, { "epoch": 0.22625, "grad_norm": 30.25, "grad_norm_var": 8.01015625, "learning_rate": 0.0001, "loss": 7.5053, "loss/crossentropy": 2.1748566299676897, "loss/hidden": 3.298046875, "loss/jsd": 0.0, "loss/logits": 0.19081774912774563, "step": 9050 }, { "epoch": 0.2265, "grad_norm": 107.5, "grad_norm_var": 363.3160807291667, "learning_rate": 0.0001, "loss": 7.5204, "loss/crossentropy": 2.1024440199136736, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19131676983088255, "step": 9060 }, { "epoch": 0.22675, "grad_norm": 43.0, "grad_norm_var": 382.7749348958333, "learning_rate": 0.0001, "loss": 7.4636, "loss/crossentropy": 2.0358673214912413, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.20718471184372902, "step": 9070 }, { "epoch": 0.227, "grad_norm": 38.0, "grad_norm_var": 34.826497395833336, "learning_rate": 0.0001, "loss": 7.5132, "loss/crossentropy": 2.1334098488092423, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.1900653975084424, "step": 9080 }, { "epoch": 0.22725, "grad_norm": 47.5, "grad_norm_var": 35.35182291666667, "learning_rate": 0.0001, "loss": 7.5471, "loss/crossentropy": 2.158940353989601, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19873067885637283, "step": 9090 }, { "epoch": 0.2275, "grad_norm": 32.5, "grad_norm_var": 25.093489583333334, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 2.110513925552368, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.18377724830061198, "step": 9100 }, { "epoch": 0.22775, "grad_norm": 47.75, "grad_norm_var": 31.540625, "learning_rate": 0.0001, "loss": 7.5241, "loss/crossentropy": 2.323571813106537, "loss/hidden": 3.30234375, "loss/jsd": 0.0, "loss/logits": 0.18574214577674866, "step": 9110 }, { "epoch": 0.228, "grad_norm": 32.0, "grad_norm_var": 52.509375, "learning_rate": 0.0001, "loss": 7.5911, "loss/crossentropy": 2.2099075824022294, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.1919841269031167, "step": 9120 }, { "epoch": 0.22825, "grad_norm": 28.5, "grad_norm_var": 44.95104166666667, "learning_rate": 0.0001, "loss": 7.5406, "loss/crossentropy": 2.146890181303024, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19247145019471645, "step": 9130 }, { "epoch": 0.2285, "grad_norm": 39.0, "grad_norm_var": 14.4625, "learning_rate": 0.0001, "loss": 7.4844, "loss/crossentropy": 2.127192445099354, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.17968443408608437, "step": 9140 }, { "epoch": 0.22875, "grad_norm": 28.75, "grad_norm_var": 15.970572916666667, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 2.0811379849910736, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.1876799188554287, "step": 9150 }, { "epoch": 0.229, "grad_norm": 36.25, "grad_norm_var": 16.017643229166666, "learning_rate": 0.0001, "loss": 7.4714, "loss/crossentropy": 2.1674265801906585, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19961154460906982, "step": 9160 }, { "epoch": 0.22925, "grad_norm": 30.25, "grad_norm_var": 14.31640625, "learning_rate": 0.0001, "loss": 7.4578, "loss/crossentropy": 2.1701185166835786, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.19284930936992167, "step": 9170 }, { "epoch": 0.2295, "grad_norm": 33.0, "grad_norm_var": 24.164322916666666, "learning_rate": 0.0001, "loss": 7.5706, "loss/crossentropy": 2.2007659181952475, "loss/hidden": 3.483984375, "loss/jsd": 0.0, "loss/logits": 0.20873381607234479, "step": 9180 }, { "epoch": 0.22975, "grad_norm": 33.5, "grad_norm_var": 167.6619140625, "learning_rate": 0.0001, "loss": 7.5246, "loss/crossentropy": 2.013364678621292, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.1943978626281023, "step": 9190 }, { "epoch": 0.23, "grad_norm": 32.5, "grad_norm_var": 161.8853515625, "learning_rate": 0.0001, "loss": 7.5824, "loss/crossentropy": 2.1492466554045677, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.19080942254513503, "step": 9200 }, { "epoch": 0.23025, "grad_norm": 35.0, "grad_norm_var": 25.631184895833332, "learning_rate": 0.0001, "loss": 7.4525, "loss/crossentropy": 2.1856732040643694, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.21295880153775215, "step": 9210 }, { "epoch": 0.2305, "grad_norm": 29.625, "grad_norm_var": 44.44348958333333, "learning_rate": 0.0001, "loss": 7.5668, "loss/crossentropy": 2.1970660746097566, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.1991808257997036, "step": 9220 }, { "epoch": 0.23075, "grad_norm": 33.25, "grad_norm_var": 29.702083333333334, "learning_rate": 0.0001, "loss": 7.5854, "loss/crossentropy": 2.1504436887800695, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.18685424784198404, "step": 9230 }, { "epoch": 0.231, "grad_norm": 29.625, "grad_norm_var": 10.211393229166667, "learning_rate": 0.0001, "loss": 7.3922, "loss/crossentropy": 2.1555099219083784, "loss/hidden": 3.510546875, "loss/jsd": 0.0, "loss/logits": 0.20160883199423552, "step": 9240 }, { "epoch": 0.23125, "grad_norm": 30.75, "grad_norm_var": 6.043489583333334, "learning_rate": 0.0001, "loss": 7.4663, "loss/crossentropy": 2.153862714767456, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.18072083070874215, "step": 9250 }, { "epoch": 0.2315, "grad_norm": 35.25, "grad_norm_var": 5.314322916666667, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.2198209404945373, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.19124398212879895, "step": 9260 }, { "epoch": 0.23175, "grad_norm": 29.25, "grad_norm_var": 8.071875, "learning_rate": 0.0001, "loss": 7.6265, "loss/crossentropy": 2.1362643599510194, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.18900094255805017, "step": 9270 }, { "epoch": 0.232, "grad_norm": 30.375, "grad_norm_var": 9.327018229166667, "learning_rate": 0.0001, "loss": 7.4949, "loss/crossentropy": 2.226493775844574, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.19106594361364843, "step": 9280 }, { "epoch": 0.23225, "grad_norm": 42.75, "grad_norm_var": 11.729622395833333, "learning_rate": 0.0001, "loss": 7.5367, "loss/crossentropy": 2.212230810523033, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.20718637369573117, "step": 9290 }, { "epoch": 0.2325, "grad_norm": 34.5, "grad_norm_var": 13.130989583333333, "learning_rate": 0.0001, "loss": 7.4321, "loss/crossentropy": 2.021953631937504, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19819956757128238, "step": 9300 }, { "epoch": 0.23275, "grad_norm": 31.125, "grad_norm_var": 5.062239583333334, "learning_rate": 0.0001, "loss": 7.5474, "loss/crossentropy": 2.147363981604576, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.1922474455088377, "step": 9310 }, { "epoch": 0.233, "grad_norm": 32.25, "grad_norm_var": 3.471875, "learning_rate": 0.0001, "loss": 7.5815, "loss/crossentropy": 2.1315445095300674, "loss/hidden": 3.546875, "loss/jsd": 0.0, "loss/logits": 0.22608386687934398, "step": 9320 }, { "epoch": 0.23325, "grad_norm": 29.0, "grad_norm_var": 4.233072916666667, "learning_rate": 0.0001, "loss": 7.5392, "loss/crossentropy": 2.0831587575376034, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.20386858209967612, "step": 9330 }, { "epoch": 0.2335, "grad_norm": 31.75, "grad_norm_var": 225.54895833333333, "learning_rate": 0.0001, "loss": 7.5913, "loss/crossentropy": 2.2117609918117522, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.2390334837138653, "step": 9340 }, { "epoch": 0.23375, "grad_norm": 30.75, "grad_norm_var": 10.939518229166667, "learning_rate": 0.0001, "loss": 7.4838, "loss/crossentropy": 2.2391141802072525, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.1901895135641098, "step": 9350 }, { "epoch": 0.234, "grad_norm": 36.0, "grad_norm_var": 8.078059895833333, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 2.179610106348991, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.19948177523910998, "step": 9360 }, { "epoch": 0.23425, "grad_norm": 37.0, "grad_norm_var": 4.773372395833333, "learning_rate": 0.0001, "loss": 7.566, "loss/crossentropy": 2.265758016705513, "loss/hidden": 3.49140625, "loss/jsd": 0.0, "loss/logits": 0.2015686921775341, "step": 9370 }, { "epoch": 0.2345, "grad_norm": 33.25, "grad_norm_var": 5.334309895833333, "learning_rate": 0.0001, "loss": 7.5378, "loss/crossentropy": 2.164343351125717, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.2006633374840021, "step": 9380 }, { "epoch": 0.23475, "grad_norm": 32.5, "grad_norm_var": 29.55625, "learning_rate": 0.0001, "loss": 7.5607, "loss/crossentropy": 2.150678759813309, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.22009918540716172, "step": 9390 }, { "epoch": 0.235, "grad_norm": 31.75, "grad_norm_var": 275.2309895833333, "learning_rate": 0.0001, "loss": 7.6283, "loss/crossentropy": 2.2133118510246277, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.18500468730926514, "step": 9400 }, { "epoch": 0.23525, "grad_norm": 29.125, "grad_norm_var": 3.2697265625, "learning_rate": 0.0001, "loss": 7.5132, "loss/crossentropy": 2.225054568052292, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.20218575857579707, "step": 9410 }, { "epoch": 0.2355, "grad_norm": 30.875, "grad_norm_var": 7.542122395833333, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.1811843127012254, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19548335634171962, "step": 9420 }, { "epoch": 0.23575, "grad_norm": 31.875, "grad_norm_var": 1.2934895833333333, "learning_rate": 0.0001, "loss": 7.5001, "loss/crossentropy": 1.9795912995934486, "loss/hidden": 3.547265625, "loss/jsd": 0.0, "loss/logits": 0.19292352311313152, "step": 9430 }, { "epoch": 0.236, "grad_norm": 31.125, "grad_norm_var": 95.365625, "learning_rate": 0.0001, "loss": 7.5911, "loss/crossentropy": 2.032686772942543, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.1814481422305107, "step": 9440 }, { "epoch": 0.23625, "grad_norm": 31.375, "grad_norm_var": 199.14348958333332, "learning_rate": 0.0001, "loss": 7.4887, "loss/crossentropy": 2.1834616482257845, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.1900908298790455, "step": 9450 }, { "epoch": 0.2365, "grad_norm": 42.5, "grad_norm_var": 9.148893229166667, "learning_rate": 0.0001, "loss": 7.5461, "loss/crossentropy": 2.1409550577402117, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.19009583443403244, "step": 9460 }, { "epoch": 0.23675, "grad_norm": 30.0, "grad_norm_var": 11.903125, "learning_rate": 0.0001, "loss": 7.5157, "loss/crossentropy": 2.0411314353346826, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.18683939017355441, "step": 9470 }, { "epoch": 0.237, "grad_norm": 35.75, "grad_norm_var": 14.804622395833333, "learning_rate": 0.0001, "loss": 7.5184, "loss/crossentropy": 2.0848400443792343, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.2053835943341255, "step": 9480 }, { "epoch": 0.23725, "grad_norm": 31.875, "grad_norm_var": 4.326822916666667, "learning_rate": 0.0001, "loss": 7.5465, "loss/crossentropy": 2.198624536395073, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.20486781597137452, "step": 9490 }, { "epoch": 0.2375, "grad_norm": 39.75, "grad_norm_var": 23.5791015625, "learning_rate": 0.0001, "loss": 7.5646, "loss/crossentropy": 2.136742886900902, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.19704439640045165, "step": 9500 }, { "epoch": 0.23775, "grad_norm": 45.5, "grad_norm_var": 35.976497395833334, "learning_rate": 0.0001, "loss": 7.4662, "loss/crossentropy": 2.1736690044403075, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.19949170239269734, "step": 9510 }, { "epoch": 0.238, "grad_norm": 31.625, "grad_norm_var": 23.130989583333335, "learning_rate": 0.0001, "loss": 7.5822, "loss/crossentropy": 2.1334351167082786, "loss/hidden": 3.53828125, "loss/jsd": 0.0, "loss/logits": 0.20324139203876257, "step": 9520 }, { "epoch": 0.23825, "grad_norm": 29.25, "grad_norm_var": 17.6181640625, "learning_rate": 0.0001, "loss": 7.5358, "loss/crossentropy": 2.142412620782852, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.2030354391783476, "step": 9530 }, { "epoch": 0.2385, "grad_norm": 39.75, "grad_norm_var": 18.976041666666667, "learning_rate": 0.0001, "loss": 7.494, "loss/crossentropy": 2.181875669956207, "loss/hidden": 3.4671875, "loss/jsd": 0.0, "loss/logits": 0.1956815965473652, "step": 9540 }, { "epoch": 0.23875, "grad_norm": 34.75, "grad_norm_var": 8.832747395833334, "learning_rate": 0.0001, "loss": 7.648, "loss/crossentropy": 2.2528909504413606, "loss/hidden": 3.435546875, "loss/jsd": 0.0, "loss/logits": 0.20324745066463948, "step": 9550 }, { "epoch": 0.239, "grad_norm": 35.5, "grad_norm_var": 3.3268229166666665, "learning_rate": 0.0001, "loss": 7.5439, "loss/crossentropy": 2.2434193670749663, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.20051232390105725, "step": 9560 }, { "epoch": 0.23925, "grad_norm": 31.75, "grad_norm_var": 1.7962890625, "learning_rate": 0.0001, "loss": 7.5977, "loss/crossentropy": 2.0810982078313827, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.1919446151703596, "step": 9570 }, { "epoch": 0.2395, "grad_norm": 31.5, "grad_norm_var": 18.6556640625, "learning_rate": 0.0001, "loss": 7.4919, "loss/crossentropy": 2.11810165643692, "loss/hidden": 3.272265625, "loss/jsd": 0.0, "loss/logits": 0.17909912299364805, "step": 9580 }, { "epoch": 0.23975, "grad_norm": 30.25, "grad_norm_var": 19.439322916666665, "learning_rate": 0.0001, "loss": 7.5892, "loss/crossentropy": 2.2458123177289964, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.2052942331880331, "step": 9590 }, { "epoch": 0.24, "grad_norm": 32.5, "grad_norm_var": 5.358268229166667, "learning_rate": 0.0001, "loss": 7.4833, "loss/crossentropy": 2.294092634320259, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18821652382612228, "step": 9600 }, { "epoch": 0.24025, "grad_norm": 34.0, "grad_norm_var": 15.582747395833334, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.152405506372452, "loss/hidden": 3.583203125, "loss/jsd": 0.0, "loss/logits": 0.20593744479119777, "step": 9610 }, { "epoch": 0.2405, "grad_norm": 30.5, "grad_norm_var": 11.377018229166667, "learning_rate": 0.0001, "loss": 7.6211, "loss/crossentropy": 2.0907988399267197, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.19778051003813743, "step": 9620 }, { "epoch": 0.24075, "grad_norm": 31.25, "grad_norm_var": 4.201822916666667, "learning_rate": 0.0001, "loss": 7.5435, "loss/crossentropy": 2.1334788501262665, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.1883639894425869, "step": 9630 }, { "epoch": 0.241, "grad_norm": 32.25, "grad_norm_var": 9.456705729166666, "learning_rate": 0.0001, "loss": 7.4907, "loss/crossentropy": 2.1615520387887956, "loss/hidden": 3.52734375, "loss/jsd": 0.0, "loss/logits": 0.212527497112751, "step": 9640 }, { "epoch": 0.24125, "grad_norm": 30.875, "grad_norm_var": 9.05, "learning_rate": 0.0001, "loss": 7.4189, "loss/crossentropy": 2.182580092549324, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.1890213243663311, "step": 9650 }, { "epoch": 0.2415, "grad_norm": 29.75, "grad_norm_var": 4.44765625, "learning_rate": 0.0001, "loss": 7.5499, "loss/crossentropy": 2.1858610659837723, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1913149781525135, "step": 9660 }, { "epoch": 0.24175, "grad_norm": 29.875, "grad_norm_var": 7.605989583333334, "learning_rate": 0.0001, "loss": 7.4714, "loss/crossentropy": 2.212157425284386, "loss/hidden": 3.312890625, "loss/jsd": 0.0, "loss/logits": 0.1864764802157879, "step": 9670 }, { "epoch": 0.242, "grad_norm": 30.375, "grad_norm_var": 6.686393229166667, "learning_rate": 0.0001, "loss": 7.4193, "loss/crossentropy": 2.226425829529762, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.19140857569873332, "step": 9680 }, { "epoch": 0.24225, "grad_norm": 31.0, "grad_norm_var": 2.3114583333333334, "learning_rate": 0.0001, "loss": 7.5538, "loss/crossentropy": 2.123721697926521, "loss/hidden": 3.612109375, "loss/jsd": 0.0, "loss/logits": 0.21398749127984046, "step": 9690 }, { "epoch": 0.2425, "grad_norm": 31.25, "grad_norm_var": 2.03515625, "learning_rate": 0.0001, "loss": 7.4686, "loss/crossentropy": 2.189563122391701, "loss/hidden": 3.285546875, "loss/jsd": 0.0, "loss/logits": 0.19756122268736362, "step": 9700 }, { "epoch": 0.24275, "grad_norm": 36.75, "grad_norm_var": 4326.120768229167, "learning_rate": 0.0001, "loss": 7.6897, "loss/crossentropy": 2.137082815170288, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.18757453747093678, "step": 9710 }, { "epoch": 0.243, "grad_norm": 32.5, "grad_norm_var": 4357.42265625, "learning_rate": 0.0001, "loss": 7.4598, "loss/crossentropy": 2.1567126482725145, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.19823984801769257, "step": 9720 }, { "epoch": 0.24325, "grad_norm": 30.625, "grad_norm_var": 3.7192057291666667, "learning_rate": 0.0001, "loss": 7.5681, "loss/crossentropy": 2.0657265201210975, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.21410678885877132, "step": 9730 }, { "epoch": 0.2435, "grad_norm": 30.875, "grad_norm_var": 3.54765625, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.231111526489258, "loss/hidden": 3.218359375, "loss/jsd": 0.0, "loss/logits": 0.1727623924612999, "step": 9740 }, { "epoch": 0.24375, "grad_norm": 33.25, "grad_norm_var": 32.34348958333333, "learning_rate": 0.0001, "loss": 7.5244, "loss/crossentropy": 2.184462660551071, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.19121489115059376, "step": 9750 }, { "epoch": 0.244, "grad_norm": 29.5, "grad_norm_var": 13.9212890625, "learning_rate": 0.0001, "loss": 7.5373, "loss/crossentropy": 2.237063002586365, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.19552523456513882, "step": 9760 }, { "epoch": 0.24425, "grad_norm": 31.0, "grad_norm_var": 12.137239583333333, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.1166174903512003, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19640736747533083, "step": 9770 }, { "epoch": 0.2445, "grad_norm": 31.625, "grad_norm_var": 1.8927083333333334, "learning_rate": 0.0001, "loss": 7.4695, "loss/crossentropy": 2.215598449110985, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.1820572379976511, "step": 9780 }, { "epoch": 0.24475, "grad_norm": 28.375, "grad_norm_var": 6.012239583333334, "learning_rate": 0.0001, "loss": 7.4857, "loss/crossentropy": 2.19299538731575, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.20531897619366646, "step": 9790 }, { "epoch": 0.245, "grad_norm": 36.5, "grad_norm_var": 2.2799472864610222e+18, "learning_rate": 0.0001, "loss": 7.6354, "loss/crossentropy": 2.141967089474201, "loss/hidden": 3.656640625, "loss/jsd": 0.0, "loss/logits": 0.19617959037423133, "step": 9800 }, { "epoch": 0.24525, "grad_norm": 31.75, "grad_norm_var": 2.2799472865365197e+18, "learning_rate": 0.0001, "loss": 7.5381, "loss/crossentropy": 2.1641202688217165, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.19467307589948177, "step": 9810 }, { "epoch": 0.2455, "grad_norm": 32.0, "grad_norm_var": 1.9330729166666667, "learning_rate": 0.0001, "loss": 7.4407, "loss/crossentropy": 2.1282627910375593, "loss/hidden": 3.428515625, "loss/jsd": 0.0, "loss/logits": 0.1983230970799923, "step": 9820 }, { "epoch": 0.24575, "grad_norm": 30.375, "grad_norm_var": 1.5020833333333334, "learning_rate": 0.0001, "loss": 7.5029, "loss/crossentropy": 2.1581913977861404, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.20057737156748773, "step": 9830 }, { "epoch": 0.246, "grad_norm": 32.0, "grad_norm_var": 3.36640625, "learning_rate": 0.0001, "loss": 7.5631, "loss/crossentropy": 2.111289617419243, "loss/hidden": 3.56328125, "loss/jsd": 0.0, "loss/logits": 0.19521272610872983, "step": 9840 }, { "epoch": 0.24625, "grad_norm": 34.0, "grad_norm_var": 2.7947265625, "learning_rate": 0.0001, "loss": 7.4617, "loss/crossentropy": 2.1353485763072966, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.18625408709049224, "step": 9850 }, { "epoch": 0.2465, "grad_norm": 29.875, "grad_norm_var": 2.5582682291666665, "learning_rate": 0.0001, "loss": 7.4291, "loss/crossentropy": 2.2995183020830154, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.1992196377366781, "step": 9860 }, { "epoch": 0.24675, "grad_norm": 31.25, "grad_norm_var": 1.32890625, "learning_rate": 0.0001, "loss": 7.6033, "loss/crossentropy": 2.2881169497966765, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.1932061992585659, "step": 9870 }, { "epoch": 0.247, "grad_norm": 30.0, "grad_norm_var": 14.017643229166667, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.1658580511808396, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.19005278386175634, "step": 9880 }, { "epoch": 0.24725, "grad_norm": 29.625, "grad_norm_var": 11.984830729166667, "learning_rate": 0.0001, "loss": 7.4924, "loss/crossentropy": 2.2023943603038787, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.19262171424925328, "step": 9890 }, { "epoch": 0.2475, "grad_norm": 31.625, "grad_norm_var": 6.539518229166666, "learning_rate": 0.0001, "loss": 7.3901, "loss/crossentropy": 2.1616971135139464, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19074857234954834, "step": 9900 }, { "epoch": 0.24775, "grad_norm": 31.625, "grad_norm_var": 2.436393229166667, "learning_rate": 0.0001, "loss": 7.4839, "loss/crossentropy": 2.21129602342844, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.1898047223687172, "step": 9910 }, { "epoch": 0.248, "grad_norm": 31.125, "grad_norm_var": 1.9580729166666666, "learning_rate": 0.0001, "loss": 7.5074, "loss/crossentropy": 2.1486201629042627, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.19970939867198467, "step": 9920 }, { "epoch": 0.24825, "grad_norm": 31.75, "grad_norm_var": 1.9666666666666666, "learning_rate": 0.0001, "loss": 7.4634, "loss/crossentropy": 2.073464626073837, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.21017258744686843, "step": 9930 }, { "epoch": 0.2485, "grad_norm": 31.25, "grad_norm_var": 1.4166666666666667, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 2.156512539088726, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.20021300427615643, "step": 9940 }, { "epoch": 0.24875, "grad_norm": 31.375, "grad_norm_var": 2.6681640625, "learning_rate": 0.0001, "loss": 7.5618, "loss/crossentropy": 2.098437860608101, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.2031120091676712, "step": 9950 }, { "epoch": 0.249, "grad_norm": 29.125, "grad_norm_var": 4.5259765625, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.054112070798874, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18417572602629662, "step": 9960 }, { "epoch": 0.24925, "grad_norm": 32.25, "grad_norm_var": 9.677083333333334, "learning_rate": 0.0001, "loss": 7.4506, "loss/crossentropy": 2.173594242334366, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1788831666111946, "step": 9970 }, { "epoch": 0.2495, "grad_norm": 31.625, "grad_norm_var": 12.448958333333334, "learning_rate": 0.0001, "loss": 7.5566, "loss/crossentropy": 2.1865617662668226, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.219557130523026, "step": 9980 }, { "epoch": 0.24975, "grad_norm": 31.125, "grad_norm_var": 7.601822916666666, "learning_rate": 0.0001, "loss": 7.6451, "loss/crossentropy": 2.161116376519203, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.20050363764166831, "step": 9990 }, { "epoch": 0.25, "grad_norm": 30.375, "grad_norm_var": 1.603125, "learning_rate": 0.0001, "loss": 7.4863, "loss/crossentropy": 2.056240776181221, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.21083669643849134, "step": 10000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8575100320088064e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }