| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.25, |
| "eval_steps": 2000, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00025, |
| "grad_norm": 39.5, |
| "learning_rate": 0.0001, |
| "loss": 7.8298, |
| "loss/crossentropy": 2.313796639442444, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21518087349832057, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 31.5, |
| "grad_norm_var": 5.698893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.8693, |
| "loss/crossentropy": 2.1564369201660156, |
| "loss/hidden": 3.587109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21401480734348297, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 36.0, |
| "grad_norm_var": 6.930143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8779, |
| "loss/crossentropy": 2.179039953649044, |
| "loss/hidden": 3.709375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22207003347575666, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 32.75, |
| "grad_norm_var": 40.942708333333336, |
| "learning_rate": 0.0001, |
| "loss": 7.7653, |
| "loss/crossentropy": 2.074952059984207, |
| "loss/hidden": 3.55625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20403100922703743, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 35.75, |
| "grad_norm_var": 94.25729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.8641, |
| "loss/crossentropy": 2.087546107172966, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412125833332539, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 30.25, |
| "grad_norm_var": 110.89140625, |
| "learning_rate": 0.0001, |
| "loss": 7.8652, |
| "loss/crossentropy": 2.2259810894727705, |
| "loss/hidden": 3.528125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21063638497143983, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 36.0, |
| "grad_norm_var": 62.02805989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.751, |
| "loss/crossentropy": 2.164659637212753, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19977533183991908, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 34.25, |
| "grad_norm_var": 8.6759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7596, |
| "loss/crossentropy": 2.097026476264, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20290330462157727, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 39.75, |
| "grad_norm_var": 71.35598958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.8106, |
| "loss/crossentropy": 2.1291788890957832, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19771635457873343, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 34.25, |
| "grad_norm_var": 9.158072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7473, |
| "loss/crossentropy": 2.147798593342304, |
| "loss/hidden": 3.558203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20517258979380132, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 31.625, |
| "grad_norm_var": 9.737239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7738, |
| "loss/crossentropy": 2.1884776622056963, |
| "loss/hidden": 3.458203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20624704901129007, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 37.75, |
| "grad_norm_var": 335.18170572916665, |
| "learning_rate": 0.0001, |
| "loss": 7.8546, |
| "loss/crossentropy": 2.2224678859114646, |
| "loss/hidden": 3.5296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22259013392031193, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 136.0, |
| "grad_norm_var": 1014.1374348958333, |
| "learning_rate": 0.0001, |
| "loss": 7.7227, |
| "loss/crossentropy": 2.135145714879036, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21895913481712342, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 36.25, |
| "grad_norm_var": 663.2072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6794, |
| "loss/crossentropy": 2.2155070066452027, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18895817659795283, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 38.5, |
| "grad_norm_var": 54.0384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7461, |
| "loss/crossentropy": 2.1793935388326644, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19897108823060988, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 62.0, |
| "grad_norm_var": 99.1447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7956, |
| "loss/crossentropy": 2.194957372546196, |
| "loss/hidden": 3.597265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22534323409199714, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 62.60390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7866, |
| "loss/crossentropy": 2.1939920127391814, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2311840608716011, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 33.5, |
| "grad_norm_var": 10.049934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6691, |
| "loss/crossentropy": 2.1646964073181154, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2066217228770256, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 13.889322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.8529, |
| "loss/crossentropy": 2.135753521323204, |
| "loss/hidden": 3.64140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23793395943939685, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 30.375, |
| "grad_norm_var": 14.12265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7357, |
| "loss/crossentropy": 2.1783783614635466, |
| "loss/hidden": 3.540234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2111268475651741, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 47.75, |
| "grad_norm_var": 167.33014322916668, |
| "learning_rate": 0.0001, |
| "loss": 7.8414, |
| "loss/crossentropy": 2.091558237373829, |
| "loss/hidden": 3.49765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2065280582755804, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 30.625, |
| "grad_norm_var": 185.38899739583334, |
| "learning_rate": 0.0001, |
| "loss": 7.7181, |
| "loss/crossentropy": 2.1866263896226883, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20596572011709213, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 45.847330729166664, |
| "learning_rate": 0.0001, |
| "loss": 7.6276, |
| "loss/crossentropy": 2.1170753836631775, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20233637914061547, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 33.75, |
| "grad_norm_var": 17.477083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7487, |
| "loss/crossentropy": 2.1388430804014207, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20581382531672715, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6568, |
| "loss/crossentropy": 2.2856020241975785, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2351478708907962, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 28.375, |
| "grad_norm_var": 28.54375, |
| "learning_rate": 0.0001, |
| "loss": 7.6993, |
| "loss/crossentropy": 2.0653378486633303, |
| "loss/hidden": 3.493359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19755732025951148, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 33.25, |
| "grad_norm_var": 28.384375, |
| "learning_rate": 0.0001, |
| "loss": 7.7075, |
| "loss/crossentropy": 2.1598333328962327, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19795978404581546, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.862955729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.6852, |
| "loss/crossentropy": 2.138056221604347, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1999417580664158, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 42.0, |
| "grad_norm_var": 20.856705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8621, |
| "loss/crossentropy": 2.1779348880052565, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1947902340441942, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 31.75, |
| "grad_norm_var": 8.949739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6542, |
| "loss/crossentropy": 2.174051034450531, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2129627451300621, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.620247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6721, |
| "loss/crossentropy": 2.0598735958337784, |
| "loss/hidden": 3.54921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20318429488688708, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 35.5, |
| "grad_norm_var": 2.059375, |
| "learning_rate": 0.0001, |
| "loss": 7.6655, |
| "loss/crossentropy": 2.1254130959510804, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19875272288918494, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 35.0, |
| "grad_norm_var": 1.9639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7461, |
| "loss/crossentropy": 2.1635933369398117, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012931451201439, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.255989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6974, |
| "loss/crossentropy": 2.214476653933525, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954287003725767, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.4942057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6918, |
| "loss/crossentropy": 2.216859245300293, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083636604249477, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.7905598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6896, |
| "loss/crossentropy": 2.2161539107561112, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19574192687869071, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 29.75, |
| "grad_norm_var": 7.141080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8109, |
| "loss/crossentropy": 2.153403599560261, |
| "loss/hidden": 3.559765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21339697316288947, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 37.5, |
| "grad_norm_var": 10.9244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7615, |
| "loss/crossentropy": 2.253763607144356, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2074073076248169, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 33.75, |
| "grad_norm_var": 13.8400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7209, |
| "loss/crossentropy": 2.1363648414611816, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20775138661265374, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 31.5, |
| "grad_norm_var": 14.397330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6574, |
| "loss/crossentropy": 2.1789979085326197, |
| "loss/hidden": 3.5109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20811444334685802, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 33.0, |
| "grad_norm_var": 9.6837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7272, |
| "loss/crossentropy": 2.232848098874092, |
| "loss/hidden": 3.5328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21815686002373696, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 30.0, |
| "grad_norm_var": 73.00201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.7767, |
| "loss/crossentropy": 2.064501041173935, |
| "loss/hidden": 3.616796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21888567861169578, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 29.625, |
| "grad_norm_var": 73.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7084, |
| "loss/crossentropy": 2.1248373448848725, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20307110175490378, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 31.875, |
| "grad_norm_var": 8.3259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6488, |
| "loss/crossentropy": 2.1684874832630157, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18734447471797466, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 40.75, |
| "grad_norm_var": 27.13125, |
| "learning_rate": 0.0001, |
| "loss": 7.831, |
| "loss/crossentropy": 2.1310232520103454, |
| "loss/hidden": 3.631640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20724854618310928, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 36.0, |
| "grad_norm_var": 27.680208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7446, |
| "loss/crossentropy": 2.134530597925186, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21913636103272438, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.48125, |
| "learning_rate": 0.0001, |
| "loss": 7.6288, |
| "loss/crossentropy": 2.2641385555267335, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1849798161536455, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 31.75, |
| "grad_norm_var": 11.4837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6493, |
| "loss/crossentropy": 2.2282994374632836, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2014446135610342, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 37.25, |
| "grad_norm_var": 6.677083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7612, |
| "loss/crossentropy": 2.1222758114337923, |
| "loss/hidden": 3.5625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013396628201008, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.7744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6421, |
| "loss/crossentropy": 2.069608175754547, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966784244403243, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 41.5, |
| "grad_norm_var": 10.3697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6818, |
| "loss/crossentropy": 2.1589883297681807, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18451723456382751, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.667122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7136, |
| "loss/crossentropy": 2.149793979898095, |
| "loss/hidden": 3.598828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093389181420207, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 35.25, |
| "grad_norm_var": 20.768489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7677, |
| "loss/crossentropy": 2.195904017984867, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19832278694957495, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 35.5, |
| "grad_norm_var": 17.8619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6679, |
| "loss/crossentropy": 2.1850160747766494, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20512696355581284, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 35.25, |
| "grad_norm_var": 25.736393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6426, |
| "loss/crossentropy": 2.1822438329458236, |
| "loss/hidden": 3.441796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20032773297280074, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 35.25, |
| "grad_norm_var": 30.77890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7347, |
| "loss/crossentropy": 2.1990185409784315, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19729668814688922, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 41.25, |
| "grad_norm_var": 46.00149739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6662, |
| "loss/crossentropy": 2.0567948162555694, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18916799686849117, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 31.875, |
| "grad_norm_var": 18.383072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.662, |
| "loss/crossentropy": 2.1589747786521913, |
| "loss/hidden": 3.6359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21064655482769012, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 34.0, |
| "grad_norm_var": 7.5025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6739, |
| "loss/crossentropy": 2.053135275095701, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20297051095403731, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 42.0, |
| "grad_norm_var": 58.718684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6386, |
| "loss/crossentropy": 2.0670476451516153, |
| "loss/hidden": 3.590234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.208550613373518, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 29.75, |
| "grad_norm_var": 57.89993489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6461, |
| "loss/crossentropy": 2.1219205021858216, |
| "loss/hidden": 3.4953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20205040834844112, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 29.25, |
| "grad_norm_var": 10.6228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5641, |
| "loss/crossentropy": 2.114127852022648, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19217339344322681, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 14.542643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6319, |
| "loss/crossentropy": 2.2160026699304582, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19281109217554332, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 34.0, |
| "grad_norm_var": 8.1869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6546, |
| "loss/crossentropy": 2.1914512276649476, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2007790008559823, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.4587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.573, |
| "loss/crossentropy": 2.0619212985038757, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19594881720840931, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.121809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7448, |
| "loss/crossentropy": 2.1046764492988586, |
| "loss/hidden": 3.5921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083924774080514, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 34.5, |
| "grad_norm_var": 5.715559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.8327, |
| "loss/crossentropy": 2.2835423797369003, |
| "loss/hidden": 3.604296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23483402598649264, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 31.875, |
| "grad_norm_var": 10.7650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.8138, |
| "loss/crossentropy": 2.0907129019498827, |
| "loss/hidden": 3.518359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1924523524940014, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6162, |
| "loss/crossentropy": 2.127697338163853, |
| "loss/hidden": 3.50859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2057236723601818, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 8.9822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.727, |
| "loss/crossentropy": 2.088846719264984, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21837750263512135, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.7385416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.504, |
| "loss/crossentropy": 2.1813240855932237, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18769481666386129, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.569205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6713, |
| "loss/crossentropy": 2.127785587310791, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20503429286181926, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 29.875, |
| "grad_norm_var": 12.484375, |
| "learning_rate": 0.0001, |
| "loss": 7.6481, |
| "loss/crossentropy": 2.171098938584328, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20801318623125553, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 37.25, |
| "grad_norm_var": 14.9978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6067, |
| "loss/crossentropy": 2.1487890854477882, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985433727502823, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.299739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4877, |
| "loss/crossentropy": 2.2428383469581603, |
| "loss/hidden": 3.24375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18733534589409828, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.379622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6509, |
| "loss/crossentropy": 2.1872796684503557, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21135813258588315, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 30.5, |
| "grad_norm_var": 46.3072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7384, |
| "loss/crossentropy": 2.2427982538938522, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1998496226966381, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 41.5, |
| "grad_norm_var": 45.49264322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6688, |
| "loss/crossentropy": 2.207463192939758, |
| "loss/hidden": 3.321875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18665656447410583, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 31.625, |
| "grad_norm_var": 53.53951822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6327, |
| "loss/crossentropy": 2.075051838159561, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19726874344050885, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 35.25, |
| "grad_norm_var": 15.802083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6833, |
| "loss/crossentropy": 2.0811705768108366, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19689124524593354, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.7916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6998, |
| "loss/crossentropy": 2.139931133389473, |
| "loss/hidden": 3.56640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20113225914537908, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.21640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6125, |
| "loss/crossentropy": 2.3070268869400024, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19928287118673324, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.70390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7169, |
| "loss/crossentropy": 2.1359834372997284, |
| "loss/hidden": 3.6421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22523897737264634, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 33.75, |
| "grad_norm_var": 7.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6629, |
| "loss/crossentropy": 2.1498879536986353, |
| "loss/hidden": 3.599609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21073536314070224, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 31.125, |
| "grad_norm_var": 11.855143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.7246, |
| "loss/crossentropy": 2.154731386899948, |
| "loss/hidden": 3.379296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18697260301560165, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 28.0, |
| "grad_norm_var": 3.8988932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4616, |
| "loss/crossentropy": 2.209418597817421, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19536950960755348, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 36.25, |
| "grad_norm_var": 28.367708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6221, |
| "loss/crossentropy": 2.106307029724121, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20511649739928545, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 30.875, |
| "grad_norm_var": 25.5978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.57, |
| "loss/crossentropy": 2.170385852456093, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18977888114750385, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.178580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.659, |
| "loss/crossentropy": 2.022993338108063, |
| "loss/hidden": 3.580859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2017082829028368, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.118684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6471, |
| "loss/crossentropy": 2.1982390731573105, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.186830697581172, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 36.0, |
| "grad_norm_var": 9.886393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5929, |
| "loss/crossentropy": 2.1351534157991408, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19507032372057437, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 30.25, |
| "grad_norm_var": 69.10182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7006, |
| "loss/crossentropy": 2.1805424720048903, |
| "loss/hidden": 3.60546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2387762701138854, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 29.625, |
| "grad_norm_var": 148.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.603, |
| "loss/crossentropy": 2.1993222564458845, |
| "loss/hidden": 3.5421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2252051206305623, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 33.0, |
| "grad_norm_var": 149.05670572916668, |
| "learning_rate": 0.0001, |
| "loss": 7.6101, |
| "loss/crossentropy": 2.132229286432266, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19997731409966946, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.372330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6309, |
| "loss/crossentropy": 2.057620918750763, |
| "loss/hidden": 3.525, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19510896243155001, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.9291666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5599, |
| "loss/crossentropy": 2.1666407614946364, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19047823324799537, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 34.0, |
| "grad_norm_var": 750.4874348958333, |
| "learning_rate": 0.0001, |
| "loss": 7.667, |
| "loss/crossentropy": 2.199223425984383, |
| "loss/hidden": 3.561328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22954254262149335, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 31.0, |
| "grad_norm_var": 736.4518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6648, |
| "loss/crossentropy": 2.052691954374313, |
| "loss/hidden": 3.612109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20743414014577866, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 29.625, |
| "grad_norm_var": 13.204166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.587, |
| "loss/crossentropy": 2.2363356560468675, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19663754627108573, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 8.623958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5999, |
| "loss/crossentropy": 2.096450260281563, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1896633107215166, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 26.456184895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.6469, |
| "loss/crossentropy": 2.219489449262619, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20298976600170135, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 32.75, |
| "grad_norm_var": 22.0525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6634, |
| "loss/crossentropy": 2.157027468085289, |
| "loss/hidden": 3.641015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21785753238946198, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 33.0, |
| "grad_norm_var": 31.347330729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.6449, |
| "loss/crossentropy": 2.1728423804044725, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19077841471880674, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 74.5, |
| "grad_norm_var": 122.20045572916666, |
| "learning_rate": 0.0001, |
| "loss": 7.6132, |
| "loss/crossentropy": 2.1822386175394057, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1930880568921566, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 67.5, |
| "grad_norm_var": 178.43125, |
| "learning_rate": 0.0001, |
| "loss": 7.6429, |
| "loss/crossentropy": 2.2600297421216964, |
| "loss/hidden": 3.458203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20767511576414108, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 31.875, |
| "grad_norm_var": 92.06087239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5718, |
| "loss/crossentropy": 2.118990848958492, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19327420592308045, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 35.283268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.603, |
| "loss/crossentropy": 2.2320737928152083, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18573360554873944, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3795.8796223958334, |
| "learning_rate": 0.0001, |
| "loss": 7.6632, |
| "loss/crossentropy": 2.1329027831554415, |
| "loss/hidden": 3.500390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25382886435836555, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 41.0, |
| "grad_norm_var": 3810.5869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.591, |
| "loss/crossentropy": 2.147709222137928, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19441522471606731, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 34.0, |
| "grad_norm_var": 10.347916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.459, |
| "loss/crossentropy": 2.134738603234291, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18917258959263564, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.362239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4625, |
| "loss/crossentropy": 2.072269695997238, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19563193432986736, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 29.375, |
| "grad_norm_var": 14.4478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.514, |
| "loss/crossentropy": 2.131034165620804, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18194433208554983, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 33.0, |
| "grad_norm_var": 23.925, |
| "learning_rate": 0.0001, |
| "loss": 7.5884, |
| "loss/crossentropy": 2.023801653087139, |
| "loss/hidden": 3.637109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20569879673421382, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.2744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6524, |
| "loss/crossentropy": 2.1517456393688916, |
| "loss/hidden": 3.537890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2079196309670806, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 35.75, |
| "grad_norm_var": 7.99140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6074, |
| "loss/crossentropy": 2.004653300344944, |
| "loss/hidden": 3.583984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19787274841219188, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.120572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.1528601229190825, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1927174234762788, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 36.25, |
| "grad_norm_var": 7.069205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6205, |
| "loss/crossentropy": 2.05783154964447, |
| "loss/hidden": 3.614453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22374887801706791, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 31.0, |
| "grad_norm_var": 41.484309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7356, |
| "loss/crossentropy": 2.126041141152382, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2197611417621374, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 38.5, |
| "grad_norm_var": 43.91223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6157, |
| "loss/crossentropy": 2.2476495057344437, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19447711408138274, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 47.5, |
| "grad_norm_var": 95.72057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7237, |
| "loss/crossentropy": 2.092088536918163, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19047515615820884, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 31.5, |
| "grad_norm_var": 99.6822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5678, |
| "loss/crossentropy": 2.1249007523059844, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19545839354395866, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 30.625, |
| "grad_norm_var": 6.409309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.2506365835666657, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19677093252539635, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.676822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5504, |
| "loss/crossentropy": 2.133887434005737, |
| "loss/hidden": 3.459375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1896925836801529, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 38.0, |
| "grad_norm_var": 5.2337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5956, |
| "loss/crossentropy": 2.0669597774744033, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19292720556259155, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5722, |
| "loss/crossentropy": 2.261713761091232, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19570228308439255, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.916080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5566, |
| "loss/crossentropy": 2.0476513862609864, |
| "loss/hidden": 3.668359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093046260997653, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 34.75, |
| "grad_norm_var": 10.875455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6172, |
| "loss/crossentropy": 2.1128817319869997, |
| "loss/hidden": 3.5703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21670667603611946, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.824934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6787, |
| "loss/crossentropy": 2.2115501552820205, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1827129926532507, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 30.375, |
| "grad_norm_var": 13.828125, |
| "learning_rate": 0.0001, |
| "loss": 7.6339, |
| "loss/crossentropy": 2.176504462957382, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2160520726814866, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.916080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6438, |
| "loss/crossentropy": 2.173138880729675, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2056989949196577, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.78515625, |
| "learning_rate": 0.0001, |
| "loss": 7.7146, |
| "loss/crossentropy": 2.247766065597534, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20310762114822864, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.51640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6452, |
| "loss/crossentropy": 2.0862443327903746, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18912406917661428, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 34.5, |
| "grad_norm_var": 7.220833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7715, |
| "loss/crossentropy": 2.093398702144623, |
| "loss/hidden": 3.570703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2104920681566, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 38.0, |
| "grad_norm_var": 9.108268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7368, |
| "loss/crossentropy": 2.17246213555336, |
| "loss/hidden": 3.576953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21665989980101585, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.794205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5892, |
| "loss/crossentropy": 2.1238946616649628, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2172164160758257, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 32.5, |
| "grad_norm_var": 101.1431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6341, |
| "loss/crossentropy": 2.194270025193691, |
| "loss/hidden": 3.4859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19632596522569656, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.9119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5496, |
| "loss/crossentropy": 2.1282873928546904, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17983752395957708, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 34.5, |
| "grad_norm_var": 4.324934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.140147662162781, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20302014388144016, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.818489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7271, |
| "loss/crossentropy": 2.128489089012146, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19602114744484425, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.1458333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6417, |
| "loss/crossentropy": 2.1306474581360817, |
| "loss/hidden": 3.582421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19735200479626655, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 31.25, |
| "grad_norm_var": 9.2009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7002, |
| "loss/crossentropy": 2.173697289824486, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20366120263934134, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 31.75, |
| "grad_norm_var": 9.913997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6577, |
| "loss/crossentropy": 2.26003720164299, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20924863480031491, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 30.25, |
| "grad_norm_var": 26.66015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5718, |
| "loss/crossentropy": 2.2352112770080566, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19357634484767913, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 34.0, |
| "grad_norm_var": 30.602083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5861, |
| "loss/crossentropy": 2.0770506739616392, |
| "loss/hidden": 3.5453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21514309681951999, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 37.5, |
| "grad_norm_var": 12.3875, |
| "learning_rate": 0.0001, |
| "loss": 7.4986, |
| "loss/crossentropy": 2.0542123883962633, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19684152901172638, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.548372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.175110411643982, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19031002502888442, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.8429676028135214e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.7702, |
| "loss/crossentropy": 2.1691948026418686, |
| "loss/hidden": 3.5859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22843880020081997, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.842967603101565e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6046, |
| "loss/crossentropy": 2.0826233722269536, |
| "loss/hidden": 3.520703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1938928204588592, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.939518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6265, |
| "loss/crossentropy": 2.2077848985791206, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943045362830162, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 34.25, |
| "grad_norm_var": 7.7125, |
| "learning_rate": 0.0001, |
| "loss": 7.6217, |
| "loss/crossentropy": 2.1079602181911468, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19446163363754748, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 34.25, |
| "grad_norm_var": 5.7931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5893, |
| "loss/crossentropy": 2.078600898385048, |
| "loss/hidden": 3.5734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21464722994714974, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.186458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6365, |
| "loss/crossentropy": 2.1014960765838624, |
| "loss/hidden": 3.53125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19028044641017913, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.455989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5414, |
| "loss/crossentropy": 2.1112293377518654, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18414278626441954, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.7249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5681, |
| "loss/crossentropy": 2.166412356495857, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19420933350920677, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6663, |
| "loss/crossentropy": 2.0857026129961014, |
| "loss/hidden": 3.574609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2175652377307415, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 30.75, |
| "grad_norm_var": 7.401822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.1806214213371278, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21507157981395722, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 37.5, |
| "grad_norm_var": 9.043684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5649, |
| "loss/crossentropy": 2.073585295677185, |
| "loss/hidden": 3.520703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737527389079332, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.607747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6893, |
| "loss/crossentropy": 2.262183803319931, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22450251020491124, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.3983723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7101, |
| "loss/crossentropy": 2.1200410187244416, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20633359774947166, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 32.25, |
| "grad_norm_var": 28.868684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6821, |
| "loss/crossentropy": 2.1520946115255355, |
| "loss/hidden": 3.616796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20241751577705144, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 34.5, |
| "grad_norm_var": 24.308072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6482, |
| "loss/crossentropy": 2.109182408452034, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19619097150862216, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.83125, |
| "learning_rate": 0.0001, |
| "loss": 7.614, |
| "loss/crossentropy": 2.220561644434929, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20333079397678375, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.183072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7748, |
| "loss/crossentropy": 2.2026446878910066, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20862093791365624, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 36.25, |
| "grad_norm_var": 8.080989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6592, |
| "loss/crossentropy": 2.2313437908887863, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19155636206269264, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.8863932291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6964, |
| "loss/crossentropy": 2.0529640942811964, |
| "loss/hidden": 3.658984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23474433943629264, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.0374348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6483, |
| "loss/crossentropy": 2.175355441868305, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19280093312263488, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.4268229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6277, |
| "loss/crossentropy": 2.121490114927292, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20306031554937362, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 36.75, |
| "grad_norm_var": 186.4869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7912, |
| "loss/crossentropy": 2.123810574412346, |
| "loss/hidden": 3.596875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20700039602816106, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 33.0, |
| "grad_norm_var": 194.72708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5507, |
| "loss/crossentropy": 2.198003688454628, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19452486634254457, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.842643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6641, |
| "loss/crossentropy": 2.1328449815511705, |
| "loss/hidden": 3.508984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20609250776469706, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 36.5, |
| "grad_norm_var": 23.064322916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5838, |
| "loss/crossentropy": 2.1690568923950195, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19301791079342365, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 30.0, |
| "grad_norm_var": 5.9712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5441, |
| "loss/crossentropy": 2.1149508744478225, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19253603778779507, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.82890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6644, |
| "loss/crossentropy": 2.070442554354668, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19407737776637077, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.5947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6468, |
| "loss/crossentropy": 2.2249585568904875, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20079109650105237, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.1384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6522, |
| "loss/crossentropy": 2.1712467283010484, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21240621842443944, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.7018229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6326, |
| "loss/crossentropy": 2.1433209091424943, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19783576354384422, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 32.25, |
| "grad_norm_var": 10.2634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7252, |
| "loss/crossentropy": 2.1377856612205504, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19519764352589847, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.817122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6165, |
| "loss/crossentropy": 2.2387378960847855, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2087532427161932, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.9650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6701, |
| "loss/crossentropy": 2.280033028125763, |
| "loss/hidden": 3.31875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1907376278191805, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.4197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6553, |
| "loss/crossentropy": 2.205285739898682, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1980523556470871, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.2462890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6001, |
| "loss/crossentropy": 2.047496220469475, |
| "loss/hidden": 3.548046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19389633461833, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.562239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.615, |
| "loss/crossentropy": 2.174453580379486, |
| "loss/hidden": 3.516796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20545508041977883, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.4296223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.638, |
| "loss/crossentropy": 2.0722746759653092, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19747158586978913, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.0666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7087, |
| "loss/crossentropy": 2.1196924835443496, |
| "loss/hidden": 3.622265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20298538953065873, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.6119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6036, |
| "loss/crossentropy": 2.1688392132520677, |
| "loss/hidden": 3.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17962730433791876, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.3374348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7051, |
| "loss/crossentropy": 2.1148360162973403, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2195219134911895, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.45625, |
| "learning_rate": 0.0001, |
| "loss": 7.6397, |
| "loss/crossentropy": 2.016439202427864, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2061541959643364, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.173893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7054, |
| "loss/crossentropy": 2.1470705419778824, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21371309272944927, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.054622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5959, |
| "loss/crossentropy": 2.265937978029251, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929181769490242, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 30.125, |
| "grad_norm_var": 6.6197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6012, |
| "loss/crossentropy": 2.0853475779294968, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20215214397758247, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 34.0, |
| "grad_norm_var": 23.695572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7544, |
| "loss/crossentropy": 2.162308484315872, |
| "loss/hidden": 3.4796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013952497392893, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.120768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6092, |
| "loss/crossentropy": 2.088267083466053, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2007219024002552, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 35.75, |
| "grad_norm_var": 2.97890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7141, |
| "loss/crossentropy": 2.0617689430713653, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1928685350343585, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 35.0, |
| "grad_norm_var": 6.073893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7068, |
| "loss/crossentropy": 2.1201131522655485, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20757155679166317, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 31.5, |
| "grad_norm_var": 21.345572916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6198, |
| "loss/crossentropy": 2.235423868894577, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1959926813840866, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 30.625, |
| "grad_norm_var": 28.99765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6389, |
| "loss/crossentropy": 2.205905148386955, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2116202499717474, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.9369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7121, |
| "loss/crossentropy": 2.163422483205795, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1981559544801712, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.118489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6772, |
| "loss/crossentropy": 2.1636913806200027, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1968079771846533, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 35.25, |
| "grad_norm_var": 4.377018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5948, |
| "loss/crossentropy": 2.174500140547752, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1923616673797369, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.15625, |
| "learning_rate": 0.0001, |
| "loss": 7.5639, |
| "loss/crossentropy": 2.1197956264019013, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20423812307417394, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.725455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6238, |
| "loss/crossentropy": 2.1442053347826002, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20131820477545262, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.2212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6024, |
| "loss/crossentropy": 2.1970301985740663, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248049296438693, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.2853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6279, |
| "loss/crossentropy": 2.0732986360788344, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19802382439374924, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.2129557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.643, |
| "loss/crossentropy": 2.196815450489521, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20899684820324183, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 34.5, |
| "grad_norm_var": 4.093489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6321, |
| "loss/crossentropy": 2.083095496892929, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18292178437113762, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 31.875, |
| "grad_norm_var": 19.478059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5882, |
| "loss/crossentropy": 2.2153579622507094, |
| "loss/hidden": 3.360546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19245057981461286, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 34.5, |
| "grad_norm_var": 16.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.2516845196485518, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1975632380694151, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.71015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6393, |
| "loss/crossentropy": 2.1561204314231874, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2124529665336013, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 35.75, |
| "grad_norm_var": 36.984375, |
| "learning_rate": 0.0001, |
| "loss": 7.7289, |
| "loss/crossentropy": 2.2129232093691824, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984808323904872, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 28.875, |
| "grad_norm_var": 38.076822916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.5446, |
| "loss/crossentropy": 2.281945154070854, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1915616899728775, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.10625, |
| "learning_rate": 0.0001, |
| "loss": 7.6215, |
| "loss/crossentropy": 2.0773366719484327, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18790210355073214, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.24765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4968, |
| "loss/crossentropy": 2.186223568022251, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900737203657627, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.638541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6574, |
| "loss/crossentropy": 2.2741902500391005, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1898935280740261, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 33.75, |
| "grad_norm_var": 18.001041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6903, |
| "loss/crossentropy": 2.1332941919565203, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954928996041417, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 34.5, |
| "grad_norm_var": 17.939583333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5786, |
| "loss/crossentropy": 2.2076333969831468, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20088096596300603, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.947916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5995, |
| "loss/crossentropy": 2.201739010214806, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20021349862217902, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 30.0, |
| "grad_norm_var": 185.5900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6214, |
| "loss/crossentropy": 2.1913442850112914, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996122680604458, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 30.625, |
| "grad_norm_var": 186.84166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7375, |
| "loss/crossentropy": 2.173484447598457, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18794310167431832, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 31.375, |
| "grad_norm_var": 8.699739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6023, |
| "loss/crossentropy": 2.207549235224724, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21360519118607044, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.228059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5821, |
| "loss/crossentropy": 2.168141430988908, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850608481094241, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 29.625, |
| "grad_norm_var": 10.708268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6191, |
| "loss/crossentropy": 2.26080215126276, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19979589320719243, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 31.25, |
| "grad_norm_var": 10.148958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.613, |
| "loss/crossentropy": 2.2105998665094377, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20264392383396626, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 29.5, |
| "grad_norm_var": 9.128059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5296, |
| "loss/crossentropy": 2.076467031240463, |
| "loss/hidden": 3.59296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22621012963354586, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 34.5, |
| "grad_norm_var": 3.99375, |
| "learning_rate": 0.0001, |
| "loss": 7.6686, |
| "loss/crossentropy": 2.0320577889680864, |
| "loss/hidden": 3.575, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19981470778584481, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 30.75, |
| "grad_norm_var": 14.473893229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5723, |
| "loss/crossentropy": 2.084241083264351, |
| "loss/hidden": 3.545703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2100257944315672, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 31.5, |
| "grad_norm_var": 44.7041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6107, |
| "loss/crossentropy": 2.2695932418107985, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19781356416642665, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 9.897916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6187, |
| "loss/crossentropy": 2.188571906089783, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015857521444559, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.490625, |
| "learning_rate": 0.0001, |
| "loss": 7.6139, |
| "loss/crossentropy": 2.1134210243821143, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22417646870017052, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.479622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6591, |
| "loss/crossentropy": 2.1894455403089523, |
| "loss/hidden": 3.465625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19847002141177655, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 31.25, |
| "grad_norm_var": 8.809309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.657, |
| "loss/crossentropy": 2.1524556159973143, |
| "loss/hidden": 3.319921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857963975518942, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.121809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6475, |
| "loss/crossentropy": 2.2037901908159254, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22191528491675855, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.6348307291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5846, |
| "loss/crossentropy": 2.214809921383858, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19421134144067764, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.939322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6893, |
| "loss/crossentropy": 2.2204942047595977, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21439925488084555, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.8744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6038, |
| "loss/crossentropy": 2.1540059238672256, |
| "loss/hidden": 3.56953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24175845962017775, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.1426432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6379, |
| "loss/crossentropy": 2.1948168754577635, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2041913490742445, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.7869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7135, |
| "loss/crossentropy": 2.1938526153564455, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992840923368931, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.1343098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6931, |
| "loss/crossentropy": 2.12769907861948, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18500677905976773, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.1884765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6297, |
| "loss/crossentropy": 2.1268584340810777, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20107861533761023, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 31.0, |
| "grad_norm_var": 5.368489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.198070913553238, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19437791910022498, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.218489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.633, |
| "loss/crossentropy": 2.1521017968654634, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19696612432599067, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.4098307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6634, |
| "loss/crossentropy": 2.0935733556747436, |
| "loss/hidden": 3.46171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19025789983570576, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.812434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6523, |
| "loss/crossentropy": 2.206766763329506, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19223052635788918, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.545247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6703, |
| "loss/crossentropy": 2.2091148614883425, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2191623793914914, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.06640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6194, |
| "loss/crossentropy": 2.2076220482587816, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21331611163914205, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.753125, |
| "learning_rate": 0.0001, |
| "loss": 7.6143, |
| "loss/crossentropy": 2.1473243802785875, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20034591071307659, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6166, |
| "loss/crossentropy": 2.2176205784082414, |
| "loss/hidden": 3.4015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1890367180109024, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.56015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5864, |
| "loss/crossentropy": 2.139193335175514, |
| "loss/hidden": 3.55234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19700367711484432, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.5400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6061, |
| "loss/crossentropy": 2.101886364817619, |
| "loss/hidden": 3.543359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20392275378108024, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.3353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5912, |
| "loss/crossentropy": 2.1105535492300986, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20442402064800264, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.183333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6553, |
| "loss/crossentropy": 2.1315447479486465, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20164060425013303, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.06275, |
| "grad_norm": 33.25, |
| "grad_norm_var": 147.15149739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6542, |
| "loss/crossentropy": 2.0641630738973618, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20385651774704455, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 30.5, |
| "grad_norm_var": 150.99166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5568, |
| "loss/crossentropy": 2.191535955667496, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19768227599561214, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.06325, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.13515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6544, |
| "loss/crossentropy": 2.199158227443695, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2097537014633417, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.731705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5063, |
| "loss/crossentropy": 2.1456793427467344, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19175102189183235, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 32.5, |
| "grad_norm_var": 6.859830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5828, |
| "loss/crossentropy": 2.255453732609749, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19693338237702845, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.178125, |
| "learning_rate": 0.0001, |
| "loss": 7.5702, |
| "loss/crossentropy": 2.2270909011363984, |
| "loss/hidden": 3.43359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20889390334486962, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.06425, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.6372395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5904, |
| "loss/crossentropy": 2.190132850408554, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20386817157268525, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 34.25, |
| "grad_norm_var": 10.79765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.07715407460928, |
| "loss/hidden": 3.581640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20797281824052333, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.06475, |
| "grad_norm": 33.5, |
| "grad_norm_var": 12.35625, |
| "learning_rate": 0.0001, |
| "loss": 7.6279, |
| "loss/crossentropy": 2.1247923612594604, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21620508767664431, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.094791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5996, |
| "loss/crossentropy": 2.087959203124046, |
| "loss/hidden": 3.521875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19923710729926825, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.06525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 7.6150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.682, |
| "loss/crossentropy": 2.1805250465869905, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021130472421646, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 34.75, |
| "grad_norm_var": 7.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7244, |
| "loss/crossentropy": 2.1178730964660644, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19405451826751233, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.06575, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.065559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6483, |
| "loss/crossentropy": 2.1593512505292893, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2096536297351122, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.623372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5982, |
| "loss/crossentropy": 2.159625916182995, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18939675595611333, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 53.0, |
| "grad_norm_var": 49.99524739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6918, |
| "loss/crossentropy": 2.114516945183277, |
| "loss/hidden": 3.51484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2009023107588291, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 30.125, |
| "grad_norm_var": 38.81399739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5543, |
| "loss/crossentropy": 2.132766366004944, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965734062716365, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.06675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.9559895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6338, |
| "loss/crossentropy": 2.1092610150575637, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17978871315717698, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.47890625, |
| "learning_rate": 0.0001, |
| "loss": 7.617, |
| "loss/crossentropy": 2.2271903961896897, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2073811784386635, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.06725, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.2249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.641, |
| "loss/crossentropy": 2.0155764549970625, |
| "loss/hidden": 3.603515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2049756994470954, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.0061848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6253, |
| "loss/crossentropy": 2.221065053343773, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2112014289945364, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.06775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 18.753125, |
| "learning_rate": 0.0001, |
| "loss": 7.6427, |
| "loss/crossentropy": 2.180001160502434, |
| "loss/hidden": 3.353125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.193130424618721, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.773893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6163, |
| "loss/crossentropy": 2.283226564526558, |
| "loss/hidden": 3.48515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19388929307460784, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.06825, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.61640625, |
| "learning_rate": 0.0001, |
| "loss": 7.587, |
| "loss/crossentropy": 2.1378406554460527, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2115953892469406, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.8684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6011, |
| "loss/crossentropy": 2.0985760882496836, |
| "loss/hidden": 3.48125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19036055766046048, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.9535807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6583, |
| "loss/crossentropy": 2.1665745437145234, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18649150040000678, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 34.5, |
| "grad_norm_var": 6.343489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.176983141899109, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20285341441631316, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.06925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.972916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6597, |
| "loss/crossentropy": 2.1060123026371, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19327255934476853, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 31.625, |
| "grad_norm_var": 32.33639322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.1663936868309976, |
| "loss/hidden": 3.512109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22126073129475116, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.06975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 5.694791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5124, |
| "loss/crossentropy": 2.225750984251499, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19473073966801166, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 31.5, |
| "grad_norm_var": 4.237434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6174, |
| "loss/crossentropy": 2.0647315263748167, |
| "loss/hidden": 3.4734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2136565549299121, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.07025, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.792708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6789, |
| "loss/crossentropy": 2.1971701353788378, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21544951274991037, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 31.125, |
| "grad_norm_var": 11.145247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7043, |
| "loss/crossentropy": 2.2537077218294144, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19961411394178868, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.07075, |
| "grad_norm": 30.5, |
| "grad_norm_var": 85.65182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6427, |
| "loss/crossentropy": 2.0513558954000475, |
| "loss/hidden": 3.615234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23545071221888064, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 31.375, |
| "grad_norm_var": 64.77180989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6378, |
| "loss/crossentropy": 2.186201846599579, |
| "loss/hidden": 3.55078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20769538041204214, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.0268229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5525, |
| "loss/crossentropy": 2.161085495352745, |
| "loss/hidden": 3.27421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18488222286105155, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 33.0, |
| "grad_norm_var": 10.437434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6376, |
| "loss/crossentropy": 2.09626332372427, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.179809108376503, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.07175, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.233072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.62, |
| "loss/crossentropy": 2.2382488936185836, |
| "loss/hidden": 3.353515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20872681811451912, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.784375, |
| "learning_rate": 0.0001, |
| "loss": 7.6516, |
| "loss/crossentropy": 2.1699771240353583, |
| "loss/hidden": 3.470703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2100867312401533, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.07225, |
| "grad_norm": 33.75, |
| "grad_norm_var": 9.269791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5839, |
| "loss/crossentropy": 2.1368533104658125, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21750828213989734, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 36.0, |
| "grad_norm_var": 5.518489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6849, |
| "loss/crossentropy": 2.12222815155983, |
| "loss/hidden": 3.580859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21114687696099282, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.07275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.622916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5109, |
| "loss/crossentropy": 2.171084225177765, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1979156408458948, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.5619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6895, |
| "loss/crossentropy": 2.164732736349106, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20426477529108525, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.07325, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.7854166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5573, |
| "loss/crossentropy": 2.1073058575391768, |
| "loss/hidden": 3.50234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2097570365294814, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.4955729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5697, |
| "loss/crossentropy": 2.153279659152031, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900124330073595, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 49.5, |
| "grad_norm_var": 22.75390625, |
| "learning_rate": 0.0001, |
| "loss": 7.671, |
| "loss/crossentropy": 2.2612457245588304, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18990697022527456, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 32.0, |
| "grad_norm_var": 24.510416666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5814, |
| "loss/crossentropy": 2.123460465669632, |
| "loss/hidden": 3.496484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20658994875848294, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.07425, |
| "grad_norm": 30.125, |
| "grad_norm_var": 118.29837239583334, |
| "learning_rate": 0.0001, |
| "loss": 7.5481, |
| "loss/crossentropy": 2.2275219768285752, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18647960387170315, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 29.375, |
| "grad_norm_var": 21.989322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5209, |
| "loss/crossentropy": 2.1731285482645033, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889862149953842, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.07475, |
| "grad_norm": 31.125, |
| "grad_norm_var": 4.253059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5815, |
| "loss/crossentropy": 2.2546483501791954, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18856723569333553, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.730143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5835, |
| "loss/crossentropy": 2.092367857694626, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20502115599811077, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.07525, |
| "grad_norm": 29.5, |
| "grad_norm_var": 15.926822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5849, |
| "loss/crossentropy": 2.109961675107479, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20341113824397325, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5896, |
| "loss/crossentropy": 2.1648701071739196, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20436643473803998, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.07575, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.4754557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5959, |
| "loss/crossentropy": 2.2054502993822096, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20024821683764457, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 30.875, |
| "grad_norm_var": 6.4931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6957, |
| "loss/crossentropy": 2.166448511183262, |
| "loss/hidden": 3.484765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20373598877340554, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 36.5, |
| "grad_norm_var": 9.442122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5957, |
| "loss/crossentropy": 2.218970799446106, |
| "loss/hidden": 3.503515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20884830448776484, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.062955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5458, |
| "loss/crossentropy": 2.080473840236664, |
| "loss/hidden": 3.57109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20190774220973254, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.07675, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.6093098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5646, |
| "loss/crossentropy": 2.1775156021118165, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19825822599232196, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.690625, |
| "learning_rate": 0.0001, |
| "loss": 7.7174, |
| "loss/crossentropy": 2.246141794323921, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20639744736254215, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.07725, |
| "grad_norm": 30.875, |
| "grad_norm_var": 4.271809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6196, |
| "loss/crossentropy": 2.060313332080841, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21316638588905334, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.873893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5637, |
| "loss/crossentropy": 2.153154730796814, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20764457508921624, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.07775, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.2708333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5924, |
| "loss/crossentropy": 2.2558963537216186, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19444480016827584, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.2301432291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.7202, |
| "loss/crossentropy": 2.190881980955601, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2061827789992094, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.07825, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.991080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5479, |
| "loss/crossentropy": 2.1357465982437134, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19807269163429736, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 29.125, |
| "grad_norm_var": 5.457291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.641, |
| "loss/crossentropy": 2.166859371960163, |
| "loss/hidden": 3.398046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900594387203455, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 32.75, |
| "grad_norm_var": 27.958072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5223, |
| "loss/crossentropy": 2.1595762044191362, |
| "loss/hidden": 3.551171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22254167906939984, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.4056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6761, |
| "loss/crossentropy": 2.156154304742813, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2026256375014782, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.07925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.246875, |
| "learning_rate": 0.0001, |
| "loss": 7.4683, |
| "loss/crossentropy": 2.1108324408531187, |
| "loss/hidden": 3.5015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18734413515776396, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.9212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5591, |
| "loss/crossentropy": 2.1108986347913743, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19466390162706376, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.07975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 15.6962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6375, |
| "loss/crossentropy": 2.1180114537477492, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21690767258405685, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 31.875, |
| "grad_norm_var": 12.757747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5977, |
| "loss/crossentropy": 2.1203838691115378, |
| "loss/hidden": 3.5421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20782926268875598, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.08025, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.3218098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6303, |
| "loss/crossentropy": 2.1929849207401277, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19219291880726813, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.1958333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5282, |
| "loss/crossentropy": 2.2013367488980293, |
| "loss/hidden": 3.6, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21040805242955685, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.08075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.8684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5331, |
| "loss/crossentropy": 2.184007254242897, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19979026056826116, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.130989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5964, |
| "loss/crossentropy": 2.2199858695268633, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21446770764887332, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.6483723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.602, |
| "loss/crossentropy": 2.1263694643974302, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737922623753548, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.207291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5927, |
| "loss/crossentropy": 2.184669151902199, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18790993094444275, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.08175, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.857291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.615, |
| "loss/crossentropy": 2.0831361666321753, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19330178536474704, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 30.625, |
| "grad_norm_var": 15.702018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6216, |
| "loss/crossentropy": 2.158697286248207, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18888361509889365, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.08225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 18.211393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.564, |
| "loss/crossentropy": 2.2913430631160736, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2058469709008932, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 35.5, |
| "grad_norm_var": 4.24140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5168, |
| "loss/crossentropy": 2.2065580666065214, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18786473274230958, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.08275, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.692643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.1358665406703947, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18491616416722537, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.50390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6092, |
| "loss/crossentropy": 2.206757593154907, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20058272033929825, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.08325, |
| "grad_norm": 34.5, |
| "grad_norm_var": 1.7497395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5493, |
| "loss/crossentropy": 2.0709328591823577, |
| "loss/hidden": 3.450390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1953151250258088, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 34.5, |
| "grad_norm_var": 2.9395182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7584, |
| "loss/crossentropy": 2.1559954971075057, |
| "loss/hidden": 3.5484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20604321975260972, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 33.75, |
| "grad_norm_var": 17.864322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6969, |
| "loss/crossentropy": 2.1975975424051284, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027706265449524, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.7997395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.61, |
| "loss/crossentropy": 2.018556122481823, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18029189426451922, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.08425, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.5994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5397, |
| "loss/crossentropy": 2.1838466703891752, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2002351511269808, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.912955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5982, |
| "loss/crossentropy": 2.184953287243843, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19311312437057496, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.08475, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.309375, |
| "learning_rate": 0.0001, |
| "loss": 7.5841, |
| "loss/crossentropy": 2.2160476714372637, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966065490618348, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.1936848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.588, |
| "loss/crossentropy": 2.2071674168109894, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19414376243948936, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.08525, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.6301432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5966, |
| "loss/crossentropy": 2.117925961315632, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21105701606720687, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.82265625, |
| "learning_rate": 0.0001, |
| "loss": 7.62, |
| "loss/crossentropy": 2.0512605965137483, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20283049941062928, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.08575, |
| "grad_norm": 31.75, |
| "grad_norm_var": 6.539583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5429, |
| "loss/crossentropy": 2.074887050688267, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18658901005983353, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.426497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5759, |
| "loss/crossentropy": 2.1776267290115356, |
| "loss/hidden": 3.466796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20649599879980088, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 43.5, |
| "grad_norm_var": 14.962434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6429, |
| "loss/crossentropy": 2.12281953394413, |
| "loss/hidden": 3.5265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20702828094363213, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 31.0, |
| "grad_norm_var": 196.96555989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.7919, |
| "loss/crossentropy": 2.1028707295656206, |
| "loss/hidden": 3.536328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22825684808194638, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.08675, |
| "grad_norm": 32.25, |
| "grad_norm_var": 206.46979166666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5698, |
| "loss/crossentropy": 2.1032180160284044, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1899772472679615, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 37.0, |
| "grad_norm_var": 15.6322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5884, |
| "loss/crossentropy": 2.0837722390890123, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20534363873302935, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.08725, |
| "grad_norm": 30.0, |
| "grad_norm_var": 16.8806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5719, |
| "loss/crossentropy": 2.1673771381378173, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21180946305394172, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 33.0, |
| "grad_norm_var": 16.212955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5171, |
| "loss/crossentropy": 2.2269717276096346, |
| "loss/hidden": 3.294140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18251859862357378, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.08775, |
| "grad_norm": 33.5, |
| "grad_norm_var": 395.25390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6825, |
| "loss/crossentropy": 2.2768601924180984, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17959882766008378, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 31.875, |
| "grad_norm_var": 400.7280598958333, |
| "learning_rate": 0.0001, |
| "loss": 7.5387, |
| "loss/crossentropy": 2.174117147922516, |
| "loss/hidden": 3.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1909211568534374, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.08825, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.2527951689747005e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.102574473619461, |
| "loss/hidden": 3.365625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18131749220192434, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.252795168997245e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.121107617020607, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18711038120090961, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 35.25, |
| "grad_norm_var": 26.5384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5577, |
| "loss/crossentropy": 2.1354643225669863, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18915031235665083, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 29.25, |
| "grad_norm_var": 39.177083333333336, |
| "learning_rate": 0.0001, |
| "loss": 7.5603, |
| "loss/crossentropy": 2.111011874675751, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20049102939665317, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.08925, |
| "grad_norm": 30.5, |
| "grad_norm_var": 24.4916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5407, |
| "loss/crossentropy": 2.091498665511608, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19228591658174993, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 30.0, |
| "grad_norm_var": 21.582291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5146, |
| "loss/crossentropy": 2.1603414684534075, |
| "loss/hidden": 3.586328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22721791528165342, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.08975, |
| "grad_norm": 29.25, |
| "grad_norm_var": 18.798893229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5322, |
| "loss/crossentropy": 2.1110543325543403, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19287437647581102, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 40.75, |
| "grad_norm_var": 15.987955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.55, |
| "loss/crossentropy": 2.211816768348217, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19003268536180257, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.09025, |
| "grad_norm": 29.75, |
| "grad_norm_var": 14.333268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6044, |
| "loss/crossentropy": 2.2199724197387694, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19937946014106273, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 29.5, |
| "grad_norm_var": 7.994205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5672, |
| "loss/crossentropy": 2.1754990458488463, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1872939633205533, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.09075, |
| "grad_norm": 29.625, |
| "grad_norm_var": 8.087955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.443, |
| "loss/crossentropy": 2.2714238941669462, |
| "loss/hidden": 3.36484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929878756403923, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 30.25, |
| "grad_norm_var": 6.550455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5694, |
| "loss/crossentropy": 2.1840985506772994, |
| "loss/hidden": 3.504296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19485698137432336, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.7494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5571, |
| "loss/crossentropy": 2.1566817820072175, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21431526727974415, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 34.75, |
| "grad_norm_var": 5.333268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5377, |
| "loss/crossentropy": 2.0471107825636863, |
| "loss/hidden": 3.497265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20124074276536702, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.09175, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.4353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6125, |
| "loss/crossentropy": 2.1804106384515762, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22469761371612548, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.3572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6491, |
| "loss/crossentropy": 2.2595307737588883, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1894306108355522, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.09225, |
| "grad_norm": 36.25, |
| "grad_norm_var": 8.666666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5857, |
| "loss/crossentropy": 2.0454846382141114, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1902542944997549, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.204166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6065, |
| "loss/crossentropy": 2.1835698932409286, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20310410112142563, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.09275, |
| "grad_norm": 35.25, |
| "grad_norm_var": 7.305989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6004, |
| "loss/crossentropy": 2.0759357810020447, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21512960288673638, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 38.25, |
| "grad_norm_var": 19.737239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6564, |
| "loss/crossentropy": 2.2961436778306963, |
| "loss/hidden": 3.318359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.198493617400527, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.09325, |
| "grad_norm": 30.5, |
| "grad_norm_var": 17.01015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6998, |
| "loss/crossentropy": 2.1192551463842393, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19986802861094474, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 36.25, |
| "grad_norm_var": 10.20625, |
| "learning_rate": 0.0001, |
| "loss": 7.4855, |
| "loss/crossentropy": 2.0999212980270388, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19102167561650277, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 33.75, |
| "grad_norm_var": 7.556705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6165, |
| "loss/crossentropy": 2.1783443093299866, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1862858783453703, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.3603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5372, |
| "loss/crossentropy": 2.0993641003966332, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17717746701091527, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.09425, |
| "grad_norm": 30.625, |
| "grad_norm_var": 5.297330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.614, |
| "loss/crossentropy": 2.1238688945770265, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20928110517561435, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.573893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5672, |
| "loss/crossentropy": 2.203542584180832, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19581303521990776, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.09475, |
| "grad_norm": 38.0, |
| "grad_norm_var": 5.322330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.159272998571396, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1973846558481455, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 30.75, |
| "grad_norm_var": 5.291666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.188914805650711, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011850569397211, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.09525, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.112434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5776, |
| "loss/crossentropy": 2.1599004954099654, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21134469993412494, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 35.25, |
| "grad_norm_var": 29.9431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6779, |
| "loss/crossentropy": 2.1196573287248612, |
| "loss/hidden": 3.576171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2261866919696331, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.09575, |
| "grad_norm": 37.5, |
| "grad_norm_var": 11.21640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4703, |
| "loss/crossentropy": 2.149521693587303, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1926643056795001, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 32.0, |
| "grad_norm_var": 4.3494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5513, |
| "loss/crossentropy": 2.1707202911376955, |
| "loss/hidden": 3.425, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19597616009414195, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 630.7197916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6191, |
| "loss/crossentropy": 2.0859180808067324, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20257378201931714, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 59.5, |
| "grad_norm_var": 100.66223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5774, |
| "loss/crossentropy": 2.1815837740898134, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18438388928771018, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.09675, |
| "grad_norm": 36.75, |
| "grad_norm_var": 66.29212239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5808, |
| "loss/crossentropy": 2.0505243610590695, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18692483827471734, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 30.375, |
| "grad_norm_var": 4.266080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.2033773183822634, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19963842574507, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.09725, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.0025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.59, |
| "loss/crossentropy": 2.1328989803791045, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19634215533733368, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.1322916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6407, |
| "loss/crossentropy": 2.170455330610275, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19931643791496753, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.09775, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.558333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5899, |
| "loss/crossentropy": 2.1301774442195893, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248898830264807, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.3478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6526, |
| "loss/crossentropy": 2.1600559651851654, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22827934101223946, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.09825, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.117643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5635, |
| "loss/crossentropy": 2.0728287249803543, |
| "loss/hidden": 3.534765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20216128267347813, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 33.25, |
| "grad_norm_var": 7.5087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7253, |
| "loss/crossentropy": 2.1514860481023788, |
| "loss/hidden": 3.530078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21096254773437978, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.6207682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.538, |
| "loss/crossentropy": 2.169696259498596, |
| "loss/hidden": 3.48671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21556729041039943, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.3150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.54, |
| "loss/crossentropy": 2.1874313950538635, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19146509394049643, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.09925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 23.437239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6292, |
| "loss/crossentropy": 2.165771406888962, |
| "loss/hidden": 3.592578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20717886611819267, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 31.375, |
| "grad_norm_var": 407.4603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.7177, |
| "loss/crossentropy": 2.1150890797376634, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19676875434815883, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.09975, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.463997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6397, |
| "loss/crossentropy": 2.13585125207901, |
| "loss/hidden": 3.597265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018281053751707, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 36.0, |
| "grad_norm_var": 8.787434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6957, |
| "loss/crossentropy": 2.062576304376125, |
| "loss/hidden": 3.556640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20351322293281554, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.10025, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.6931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5171, |
| "loss/crossentropy": 2.1093045681715012, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19258121848106385, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 38.5, |
| "grad_norm_var": 6.976497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7046, |
| "loss/crossentropy": 2.1054726734757425, |
| "loss/hidden": 3.515234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18498583231121302, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.10075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 16.50390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6242, |
| "loss/crossentropy": 2.0566830962896345, |
| "loss/hidden": 3.537890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21257028207182885, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 30.125, |
| "grad_norm_var": 21.61640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5611, |
| "loss/crossentropy": 2.0847130313515665, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19317954257130623, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 31.0, |
| "grad_norm_var": 16.408268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5456, |
| "loss/crossentropy": 2.116552269458771, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943613938987255, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 31.25, |
| "grad_norm_var": 17.984375, |
| "learning_rate": 0.0001, |
| "loss": 7.6259, |
| "loss/crossentropy": 2.2868128657341003, |
| "loss/hidden": 3.494921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2375142715871334, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.10175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.2025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5637, |
| "loss/crossentropy": 2.092506285011768, |
| "loss/hidden": 3.466796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1903899708762765, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 30.25, |
| "grad_norm_var": 55.06764322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6365, |
| "loss/crossentropy": 2.2538520216941835, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2204372201114893, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.10225, |
| "grad_norm": 48.0, |
| "grad_norm_var": 66.06555989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6539, |
| "loss/crossentropy": 2.198161965608597, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18744452036917209, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 31.625, |
| "grad_norm_var": 25.937239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5872, |
| "loss/crossentropy": 2.161240801215172, |
| "loss/hidden": 3.548046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19324529767036439, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.10275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.9613932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.185439817607403, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19476189762353896, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 29.0, |
| "grad_norm_var": 5.4556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6728, |
| "loss/crossentropy": 2.1513148337602614, |
| "loss/hidden": 3.4875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20135847330093384, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.10325, |
| "grad_norm": 36.25, |
| "grad_norm_var": 4.3994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.0776968479156492, |
| "loss/hidden": 3.49140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19831380508840085, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.205143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6157, |
| "loss/crossentropy": 2.0971890702843665, |
| "loss/hidden": 3.64140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015662420541048, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 35.5, |
| "grad_norm_var": 23.512239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6638, |
| "loss/crossentropy": 2.128816670179367, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19698726907372474, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 30.75, |
| "grad_norm_var": 22.026822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6175, |
| "loss/crossentropy": 2.0965539067983627, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22271894477307796, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.10425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.426041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.215752348303795, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1962002281099558, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 39.5, |
| "grad_norm_var": 23.042708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6246, |
| "loss/crossentropy": 2.0542988061904905, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19908196646720172, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.10475, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.342643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.487, |
| "loss/crossentropy": 2.2133218079805372, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18823296912014484, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 31.125, |
| "grad_norm_var": 173.98430989583332, |
| "learning_rate": 0.0001, |
| "loss": 7.643, |
| "loss/crossentropy": 2.1089532509446145, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20591201409697532, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.10525, |
| "grad_norm": 36.75, |
| "grad_norm_var": 7.351041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5643, |
| "loss/crossentropy": 2.289369744062424, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19611021652817726, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.539322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6756, |
| "loss/crossentropy": 2.1963788866996765, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19495316371321678, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.10575, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.276041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5552, |
| "loss/crossentropy": 2.078681927919388, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1941295877099037, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 44.5, |
| "grad_norm_var": 34.80520833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6072, |
| "loss/crossentropy": 2.222626182436943, |
| "loss/hidden": 3.2828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1848284311592579, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 31.375, |
| "grad_norm_var": 35.084375, |
| "learning_rate": 0.0001, |
| "loss": 7.6435, |
| "loss/crossentropy": 2.152433153986931, |
| "loss/hidden": 3.3390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1829435657709837, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 31.625, |
| "grad_norm_var": 4.431705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5977, |
| "loss/crossentropy": 2.1493207842111586, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19450047723948954, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.10675, |
| "grad_norm": 30.0, |
| "grad_norm_var": 8.024739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5423, |
| "loss/crossentropy": 2.0623584628105163, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19236240349709988, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 51.75, |
| "grad_norm_var": 105.76145833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.577, |
| "loss/crossentropy": 2.103591626882553, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18201838787645103, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.10725, |
| "grad_norm": 33.25, |
| "grad_norm_var": 144.54973958333332, |
| "learning_rate": 0.0001, |
| "loss": 7.7078, |
| "loss/crossentropy": 2.1076686546206473, |
| "loss/hidden": 3.56328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19488887619227171, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 190.62057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5811, |
| "loss/crossentropy": 2.168550156056881, |
| "loss/hidden": 3.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1908732896670699, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.10775, |
| "grad_norm": 28.875, |
| "grad_norm_var": 149.1822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5999, |
| "loss/crossentropy": 2.100285217165947, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19184609185904264, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 44.25, |
| "grad_norm_var": 12.502018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7075, |
| "loss/crossentropy": 2.0968768775463102, |
| "loss/hidden": 3.51796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20168912429362534, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.10825, |
| "grad_norm": 31.25, |
| "grad_norm_var": 12.81875, |
| "learning_rate": 0.0001, |
| "loss": 7.5576, |
| "loss/crossentropy": 2.1037441343069077, |
| "loss/hidden": 3.370703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17891897186636924, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.700455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5097, |
| "loss/crossentropy": 2.210876139998436, |
| "loss/hidden": 3.36171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19316814988851547, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 31.125, |
| "grad_norm_var": 17.4666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5755, |
| "loss/crossentropy": 2.1331328481435774, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20047767795622348, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.198372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5385, |
| "loss/crossentropy": 2.153067779541016, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20173794813454152, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.10925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 4.010416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5426, |
| "loss/crossentropy": 2.165090653300285, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18712956104427575, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.77265625, |
| "learning_rate": 0.0001, |
| "loss": 7.605, |
| "loss/crossentropy": 2.217823189496994, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20003505125641824, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.10975, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.7603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.507, |
| "loss/crossentropy": 2.138795481622219, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1892871480435133, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 30.375, |
| "grad_norm_var": 22.449739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6965, |
| "loss/crossentropy": 2.22740375995636, |
| "loss/hidden": 3.5578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2264870759099722, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.11025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 37.71666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.604, |
| "loss/crossentropy": 2.1781785815954207, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012148156762123, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 33.0, |
| "grad_norm_var": 25.305989583333332, |
| "learning_rate": 0.0001, |
| "loss": 7.5767, |
| "loss/crossentropy": 2.0333445832133292, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852614250034094, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.11075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 0.9760416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6218, |
| "loss/crossentropy": 2.2101993292570112, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2069159124046564, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 30.5, |
| "grad_norm_var": 7.8837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6543, |
| "loss/crossentropy": 2.0182371377944945, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19055260960012674, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 29.0, |
| "grad_norm_var": 18.167643229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5559, |
| "loss/crossentropy": 2.209046494960785, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1926161792129278, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 30.375, |
| "grad_norm_var": 19.9634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5527, |
| "loss/crossentropy": 2.2265418380498887, |
| "loss/hidden": 3.28671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17907681576907636, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.11175, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.0434895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6046, |
| "loss/crossentropy": 2.1534146428108216, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19181067440658808, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.161393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5151, |
| "loss/crossentropy": 2.2303753718733788, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19312014058232307, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.11225, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.071875, |
| "learning_rate": 0.0001, |
| "loss": 7.5733, |
| "loss/crossentropy": 2.2565354451537134, |
| "loss/hidden": 3.300390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19555974584072827, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 30.125, |
| "grad_norm_var": 6.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5721, |
| "loss/crossentropy": 2.1691703468561174, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19526711832731963, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.11275, |
| "grad_norm": 30.75, |
| "grad_norm_var": 34.985416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.191486781835556, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2022854283452034, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 32.75, |
| "grad_norm_var": 34.91041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.566, |
| "loss/crossentropy": 2.07875557243824, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20517632961273194, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.11325, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.1869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6204, |
| "loss/crossentropy": 2.1490323692560196, |
| "loss/hidden": 3.55390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20650937724858523, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.6639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5605, |
| "loss/crossentropy": 2.19907369017601, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18378095962107183, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.34140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5943, |
| "loss/crossentropy": 2.0509427756071092, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012764386832714, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 32.25, |
| "grad_norm_var": 55.52916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5349, |
| "loss/crossentropy": 2.247987303137779, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2166461084038019, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.11425, |
| "grad_norm": 31.0, |
| "grad_norm_var": 59.064518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5699, |
| "loss/crossentropy": 2.256947749853134, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19454225115478038, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 30.5, |
| "grad_norm_var": 12.089583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.230518189072609, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19083393104374408, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.11475, |
| "grad_norm": 31.5, |
| "grad_norm_var": 18.3056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.664, |
| "loss/crossentropy": 2.113222661614418, |
| "loss/hidden": 3.51640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20051947552710772, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 29.25, |
| "grad_norm_var": 26.7837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6508, |
| "loss/crossentropy": 2.2963487923145296, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21824662014842033, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.11525, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.249739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5491, |
| "loss/crossentropy": 2.2380147099494936, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021485272794962, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.2072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6035, |
| "loss/crossentropy": 2.1933206588029863, |
| "loss/hidden": 3.519921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2073045803233981, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.11575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 25.005989583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.577, |
| "loss/crossentropy": 2.3104471057653426, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20087463557720184, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 33.5, |
| "grad_norm_var": 539.2056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5764, |
| "loss/crossentropy": 2.1786745607852938, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19096632562577726, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 30.5, |
| "grad_norm_var": 132.26139322916666, |
| "learning_rate": 0.0001, |
| "loss": 7.7424, |
| "loss/crossentropy": 2.1628643572330475, |
| "loss/hidden": 3.526171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1994694285094738, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 40.25, |
| "grad_norm_var": 12.672330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6456, |
| "loss/crossentropy": 2.0927803248167036, |
| "loss/hidden": 3.507421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20613454841077328, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.11675, |
| "grad_norm": 37.25, |
| "grad_norm_var": 7.458072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.622, |
| "loss/crossentropy": 2.311227411031723, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21675110273063183, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 34.5, |
| "grad_norm_var": 138.99524739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6633, |
| "loss/crossentropy": 2.1860357582569123, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20624178424477577, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.11725, |
| "grad_norm": 38.25, |
| "grad_norm_var": 11.161393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.654, |
| "loss/crossentropy": 2.246461641788483, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19816880766302347, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 98.5, |
| "grad_norm_var": 275.63645833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6871, |
| "loss/crossentropy": 2.1662321478128432, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23473294898867608, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.11775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 273.496875, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.1411470264196395, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20816716887056827, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.4697916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5106, |
| "loss/crossentropy": 2.184460151195526, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19597234334796668, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.11825, |
| "grad_norm": 32.75, |
| "grad_norm_var": 151.39368489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.7143, |
| "loss/crossentropy": 2.18603872358799, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21333505641669034, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 31.125, |
| "grad_norm_var": 41.80774739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5158, |
| "loss/crossentropy": 2.038896057009697, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18354782909154893, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 31.5, |
| "grad_norm_var": 20.727083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6044, |
| "loss/crossentropy": 2.07361024916172, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2144785810261965, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 32.0, |
| "grad_norm_var": 14.9994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5936, |
| "loss/crossentropy": 2.172766661643982, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904382836073637, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.11925, |
| "grad_norm": 33.0, |
| "grad_norm_var": 11.213541666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6024, |
| "loss/crossentropy": 2.2863214761018753, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20077989026904106, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 32.0, |
| "grad_norm_var": 23.308268229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5743, |
| "loss/crossentropy": 2.172411371767521, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1879224268719554, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.11975, |
| "grad_norm": 29.625, |
| "grad_norm_var": 27.662239583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6003, |
| "loss/crossentropy": 2.061740070581436, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.196690865047276, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 34.5, |
| "grad_norm_var": 9.820768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5893, |
| "loss/crossentropy": 2.1725818127393723, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18248203694820403, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.12025, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.5478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5542, |
| "loss/crossentropy": 2.0823758363723757, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19437449853867292, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.8754557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7018, |
| "loss/crossentropy": 2.220066267251968, |
| "loss/hidden": 3.52734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20239269211888314, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.12075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.868489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5765, |
| "loss/crossentropy": 2.0765403911471365, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19827509336173535, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 34.0, |
| "grad_norm_var": 27.168489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7248, |
| "loss/crossentropy": 2.1397932201623915, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204028557613492, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 38.0, |
| "grad_norm_var": 22.633072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6658, |
| "loss/crossentropy": 2.3317618519067764, |
| "loss/hidden": 3.33046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19565313905477524, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.6041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5415, |
| "loss/crossentropy": 2.060061091184616, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19083615019917488, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.12175, |
| "grad_norm": 33.75, |
| "grad_norm_var": 5.06875, |
| "learning_rate": 0.0001, |
| "loss": 7.651, |
| "loss/crossentropy": 2.158045071363449, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2148456061258912, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 30.625, |
| "grad_norm_var": 16.091666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5793, |
| "loss/crossentropy": 2.0583921030163763, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19111349806189537, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.12225, |
| "grad_norm": 33.25, |
| "grad_norm_var": 17.422330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6588, |
| "loss/crossentropy": 2.1186117827892303, |
| "loss/hidden": 3.500390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19436944983899593, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 36.75, |
| "grad_norm_var": 3.2676432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4748, |
| "loss/crossentropy": 2.2382855489850044, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19216692261397839, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.12275, |
| "grad_norm": 29.375, |
| "grad_norm_var": 31.048958333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5018, |
| "loss/crossentropy": 2.1136436641216276, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19329534620046615, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 40.5, |
| "grad_norm_var": 8.3572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.043731611967087, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18812808189541103, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.12325, |
| "grad_norm": 31.25, |
| "grad_norm_var": 16.280989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5565, |
| "loss/crossentropy": 2.129016649723053, |
| "loss/hidden": 3.345703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1941742904484272, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.4504557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5356, |
| "loss/crossentropy": 2.1981059461832047, |
| "loss/hidden": 3.533203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19572316966950892, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 54.5, |
| "grad_norm_var": 36.024739583333336, |
| "learning_rate": 0.0001, |
| "loss": 7.5463, |
| "loss/crossentropy": 2.107844803482294, |
| "loss/hidden": 3.551953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18902508020401002, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 33.0, |
| "grad_norm_var": 64.778125, |
| "learning_rate": 0.0001, |
| "loss": 7.6053, |
| "loss/crossentropy": 2.1164773657917975, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19264463931322098, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.12425, |
| "grad_norm": 31.5, |
| "grad_norm_var": 42.0666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5774, |
| "loss/crossentropy": 2.176864555478096, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20114411041140556, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 32.25, |
| "grad_norm_var": 33.07180989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5698, |
| "loss/crossentropy": 2.1456878036260605, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19218399338424205, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.12475, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.938997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5467, |
| "loss/crossentropy": 2.3402266025543215, |
| "loss/hidden": 3.290625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18540082685649395, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.6580729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5883, |
| "loss/crossentropy": 2.186999189853668, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21516974158585073, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.12525, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.194205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5936, |
| "loss/crossentropy": 2.171919286251068, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737605061382055, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.1255, |
| "grad_norm": 32.0, |
| "grad_norm_var": 0.67265625, |
| "learning_rate": 0.0001, |
| "loss": 7.509, |
| "loss/crossentropy": 2.2188323110342028, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18663883544504642, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.12575, |
| "grad_norm": 29.75, |
| "grad_norm_var": 10.642643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6301, |
| "loss/crossentropy": 2.191204625368118, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1946074590086937, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.6541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.502, |
| "loss/crossentropy": 2.1200972706079484, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19346977435052395, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.12625, |
| "grad_norm": 44.75, |
| "grad_norm_var": 16.30390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5705, |
| "loss/crossentropy": 2.1876573234796526, |
| "loss/hidden": 3.320703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18829105645418168, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.1265, |
| "grad_norm": 31.875, |
| "grad_norm_var": 20.667708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5126, |
| "loss/crossentropy": 2.2233003705739973, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19192412812262774, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.12675, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.3285807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5683, |
| "loss/crossentropy": 2.0495440497994424, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18455710131675004, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.6684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4892, |
| "loss/crossentropy": 2.1829854756593705, |
| "loss/hidden": 3.415234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1906156621873379, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.12725, |
| "grad_norm": 31.75, |
| "grad_norm_var": 26.134375, |
| "learning_rate": 0.0001, |
| "loss": 7.7141, |
| "loss/crossentropy": 2.150173208117485, |
| "loss/hidden": 3.546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20331819988787175, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 36.5, |
| "grad_norm_var": 25.376822916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.572, |
| "loss/crossentropy": 2.2022222489118577, |
| "loss/hidden": 3.522265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20444649420678615, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.12775, |
| "grad_norm": 31.375, |
| "grad_norm_var": 15.917643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5598, |
| "loss/crossentropy": 2.0867941707372664, |
| "loss/hidden": 3.56328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19711919017136098, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 30.125, |
| "grad_norm_var": 16.92265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4871, |
| "loss/crossentropy": 2.1122621968388557, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1813073743134737, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.12825, |
| "grad_norm": 30.75, |
| "grad_norm_var": 5.108072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5229, |
| "loss/crossentropy": 2.0356661707162855, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19353711605072021, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.1285, |
| "grad_norm": 32.5, |
| "grad_norm_var": 34.81920572916667, |
| "learning_rate": 0.0001, |
| "loss": 7.64, |
| "loss/crossentropy": 2.1762280851602553, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20253173671662808, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.12875, |
| "grad_norm": 32.25, |
| "grad_norm_var": 51.16223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6475, |
| "loss/crossentropy": 2.14297553896904, |
| "loss/hidden": 3.48671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2105777282267809, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.513641882357806e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6232, |
| "loss/crossentropy": 2.190713110566139, |
| "loss/hidden": 3.689453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2259815253317356, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.12925, |
| "grad_norm": 50.0, |
| "grad_norm_var": 2.513641882159625e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5304, |
| "loss/crossentropy": 2.287049275636673, |
| "loss/hidden": 3.22109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1791717953979969, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.1295, |
| "grad_norm": 32.25, |
| "grad_norm_var": 57.03932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5666, |
| "loss/crossentropy": 2.205572660267353, |
| "loss/hidden": 3.40234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1940192885696888, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.12975, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.115625, |
| "learning_rate": 0.0001, |
| "loss": 7.5783, |
| "loss/crossentropy": 2.170276886224747, |
| "loss/hidden": 3.498828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19651044271886348, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.549934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6073, |
| "loss/crossentropy": 2.2128631293773653, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19390376433730125, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.13025, |
| "grad_norm": 35.75, |
| "grad_norm_var": 9.099934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.575, |
| "loss/crossentropy": 2.171933504939079, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1874479927122593, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.1305, |
| "grad_norm": 38.25, |
| "grad_norm_var": 12.2869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5635, |
| "loss/crossentropy": 2.1169356971979143, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857094492763281, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.13075, |
| "grad_norm": 51.25, |
| "grad_norm_var": 892.175, |
| "learning_rate": 0.0001, |
| "loss": 7.5924, |
| "loss/crossentropy": 2.1072397351264955, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20098341945558787, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 32.5, |
| "grad_norm_var": 895.24140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5454, |
| "loss/crossentropy": 2.260575148463249, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18017468005418777, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.13125, |
| "grad_norm": 48.0, |
| "grad_norm_var": 114.225, |
| "learning_rate": 0.0001, |
| "loss": 7.6162, |
| "loss/crossentropy": 2.2057667702436445, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20519790165126323, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.1315, |
| "grad_norm": 74.0, |
| "grad_norm_var": 349.00983072916665, |
| "learning_rate": 0.0001, |
| "loss": 7.6702, |
| "loss/crossentropy": 2.154927045106888, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21574116442352534, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.13175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 229.61243489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.4891, |
| "loss/crossentropy": 2.107641798257828, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18632576130330564, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 33.75, |
| "grad_norm_var": 7.731705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5382, |
| "loss/crossentropy": 2.245188394188881, |
| "loss/hidden": 3.26875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18824921660125254, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.13225, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.63125, |
| "learning_rate": 0.0001, |
| "loss": 7.4886, |
| "loss/crossentropy": 2.2113157629966738, |
| "loss/hidden": 3.294140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1719427563250065, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.176822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5181, |
| "loss/crossentropy": 2.2539247930049897, |
| "loss/hidden": 3.31640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18716043829917908, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.13275, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.7791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4139, |
| "loss/crossentropy": 2.232303848862648, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18582747615873813, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.792708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4902, |
| "loss/crossentropy": 2.2038251549005508, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19186565633863212, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.13325, |
| "grad_norm": 31.0, |
| "grad_norm_var": 6.145768229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.568, |
| "loss/crossentropy": 2.258873853087425, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2075445156544447, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.1335, |
| "grad_norm": 35.25, |
| "grad_norm_var": 2.1952473958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5231, |
| "loss/crossentropy": 2.1799694120883943, |
| "loss/hidden": 3.3015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18682638984173536, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.13375, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.9739583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5416, |
| "loss/crossentropy": 2.2539006620645523, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18906075097620487, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.880989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5205, |
| "loss/crossentropy": 2.111976405978203, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18815945349633695, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.13425, |
| "grad_norm": 41.75, |
| "grad_norm_var": 8.989322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5645, |
| "loss/crossentropy": 2.150078758597374, |
| "loss/hidden": 3.305859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18150232955813408, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.1345, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.262239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5691, |
| "loss/crossentropy": 2.2475525766611097, |
| "loss/hidden": 3.469921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22073253151029348, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.13475, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.981705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5575, |
| "loss/crossentropy": 2.1171315133571627, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20035637486726046, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 212.0, |
| "grad_norm_var": 2028.59375, |
| "learning_rate": 0.0001, |
| "loss": 7.6561, |
| "loss/crossentropy": 2.147063474357128, |
| "loss/hidden": 3.507421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20187063701450825, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.13525, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2002.4431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5921, |
| "loss/crossentropy": 2.248355305194855, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2067353159189224, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.1355, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.59765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5656, |
| "loss/crossentropy": 2.140727072954178, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20080696120858194, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.13575, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.299934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6261, |
| "loss/crossentropy": 2.2185936748981474, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954452872276306, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.5455729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5373, |
| "loss/crossentropy": 2.0910344183444978, |
| "loss/hidden": 3.333984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1913301758468151, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.13625, |
| "grad_norm": 34.5, |
| "grad_norm_var": 2.590559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5366, |
| "loss/crossentropy": 2.278426119685173, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943113673478365, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.1365, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.5580729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5847, |
| "loss/crossentropy": 2.076927217841148, |
| "loss/hidden": 3.4703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18883342035114764, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.13675, |
| "grad_norm": 34.75, |
| "grad_norm_var": 28.4197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6209, |
| "loss/crossentropy": 2.234089860320091, |
| "loss/hidden": 3.47734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21088924966752529, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 31.25, |
| "grad_norm_var": 169.49479166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5375, |
| "loss/crossentropy": 2.117088034749031, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1874922074377537, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.13725, |
| "grad_norm": 31.0, |
| "grad_norm_var": 168.70462239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5082, |
| "loss/crossentropy": 2.1952930808067324, |
| "loss/hidden": 3.319921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19221495129168034, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 30.5, |
| "grad_norm_var": 27.639518229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.4408, |
| "loss/crossentropy": 2.184998545050621, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17893593553453685, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.13775, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.6910807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.280049467086792, |
| "loss/hidden": 3.506640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21599141787737608, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.730208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5417, |
| "loss/crossentropy": 2.159082019329071, |
| "loss/hidden": 3.31640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19252310022711755, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.13825, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.520768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.2485590517520904, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19249292369931936, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.1385, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.939583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5843, |
| "loss/crossentropy": 2.0814964517951013, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19598778411746026, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.13875, |
| "grad_norm": 30.25, |
| "grad_norm_var": 515.5889973958333, |
| "learning_rate": 0.0001, |
| "loss": 7.6637, |
| "loss/crossentropy": 2.1314420223236086, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20185216665267944, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.4643229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5407, |
| "loss/crossentropy": 2.190821570158005, |
| "loss/hidden": 3.362890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984243031591177, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.13925, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.1483723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.587, |
| "loss/crossentropy": 2.2163518011569976, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19457473792135715, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.1395, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.090559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5224, |
| "loss/crossentropy": 2.077855309844017, |
| "loss/hidden": 3.551171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19357334338128568, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.13975, |
| "grad_norm": 31.25, |
| "grad_norm_var": 18.9041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6508, |
| "loss/crossentropy": 2.0995124436914923, |
| "loss/hidden": 3.54921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20311756529845298, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 30.875, |
| "grad_norm_var": 16.024739583333332, |
| "learning_rate": 0.0001, |
| "loss": 7.5486, |
| "loss/crossentropy": 2.1742469370365143, |
| "loss/hidden": 3.501171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2091912193223834, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.14025, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.0989583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5563, |
| "loss/crossentropy": 2.1435637921094894, |
| "loss/hidden": 3.525390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20397210270166397, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.1405, |
| "grad_norm": 44.0, |
| "grad_norm_var": 11.0041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.593, |
| "loss/crossentropy": 2.151739400625229, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20091456174850464, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.14075, |
| "grad_norm": 31.125, |
| "grad_norm_var": 16.405208333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5337, |
| "loss/crossentropy": 2.2840585201978683, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21120928078889847, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.7375, |
| "learning_rate": 0.0001, |
| "loss": 7.4997, |
| "loss/crossentropy": 2.041360355913639, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2038326717913151, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.14125, |
| "grad_norm": 29.25, |
| "grad_norm_var": 23.368684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5633, |
| "loss/crossentropy": 2.111186498403549, |
| "loss/hidden": 3.463671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20292142927646636, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.1415, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.765559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.577, |
| "loss/crossentropy": 2.1156825721263885, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19096483811736106, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.14175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 6.212434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.561, |
| "loss/crossentropy": 2.1884111180901527, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19188922494649888, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.145572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5785, |
| "loss/crossentropy": 2.1473594516515733, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19657318461686374, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.14225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 9.868489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5749, |
| "loss/crossentropy": 2.224351739883423, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19284768104553224, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 29.375, |
| "grad_norm_var": 24.6166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4841, |
| "loss/crossentropy": 2.0835042744874954, |
| "loss/hidden": 3.505859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889321893453598, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.14275, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.8934895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5411, |
| "loss/crossentropy": 2.132970982789993, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20858928225934506, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 33.0, |
| "grad_norm_var": 96.9619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5633, |
| "loss/crossentropy": 2.2334523528814314, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20192687548696994, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.14325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 90.8759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5, |
| "loss/crossentropy": 2.181543472409248, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20608940124511718, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.1435, |
| "grad_norm": 29.125, |
| "grad_norm_var": 68.45416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6044, |
| "loss/crossentropy": 2.2480223774909973, |
| "loss/hidden": 3.317578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18675435222685338, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.14375, |
| "grad_norm": 39.25, |
| "grad_norm_var": 94.02057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5096, |
| "loss/crossentropy": 2.2112644970417024, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21228924561291934, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.1797421917742129e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6201, |
| "loss/crossentropy": 2.2231735616922377, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19756054263561965, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.14425, |
| "grad_norm": 38.75, |
| "grad_norm_var": 58.33743489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5131, |
| "loss/crossentropy": 2.10022853910923, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1919796233996749, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.1445, |
| "grad_norm": 34.25, |
| "grad_norm_var": 46.984375, |
| "learning_rate": 0.0001, |
| "loss": 7.6109, |
| "loss/crossentropy": 2.190520279109478, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20563599281013012, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.14475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.2166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6612, |
| "loss/crossentropy": 2.0708674401044846, |
| "loss/hidden": 3.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20486081317067145, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 30.5, |
| "grad_norm_var": 21.637239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5095, |
| "loss/crossentropy": 2.2081795185804367, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20764970332384108, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.14525, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.1119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5625, |
| "loss/crossentropy": 2.134955820441246, |
| "loss/hidden": 3.32890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18278160132467747, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.1455, |
| "grad_norm": 32.0, |
| "grad_norm_var": 26.182291666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5741, |
| "loss/crossentropy": 2.101625883579254, |
| "loss/hidden": 3.490234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21009023115038872, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.14575, |
| "grad_norm": 32.25, |
| "grad_norm_var": 12.02890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5459, |
| "loss/crossentropy": 2.269702708721161, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850876223295927, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 31.625, |
| "grad_norm_var": 10.504166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5756, |
| "loss/crossentropy": 2.1922702878713607, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1936382047832012, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.14625, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.6372395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5806, |
| "loss/crossentropy": 2.0927012979984285, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.200397995300591, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.1465, |
| "grad_norm": 31.375, |
| "grad_norm_var": 16.130143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.150560998916626, |
| "loss/hidden": 3.379296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20800711959600449, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.14675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 15.814583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4831, |
| "loss/crossentropy": 2.138367956876755, |
| "loss/hidden": 3.299609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1825962917879224, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 31.375, |
| "grad_norm_var": 11.920833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5279, |
| "loss/crossentropy": 2.0560067892074585, |
| "loss/hidden": 3.4578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18732867166399955, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.14725, |
| "grad_norm": 31.625, |
| "grad_norm_var": 11.8087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4841, |
| "loss/crossentropy": 2.204834724962711, |
| "loss/hidden": 3.3015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18124654777348043, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 29.625, |
| "grad_norm_var": 12.66015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5171, |
| "loss/crossentropy": 2.185581070184708, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19726393837481737, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.14775, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.1747395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4905, |
| "loss/crossentropy": 2.0858706533908844, |
| "loss/hidden": 3.57734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20108112394809724, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.87265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5801, |
| "loss/crossentropy": 2.11753663122654, |
| "loss/hidden": 3.512890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20907512214034796, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.14825, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.6212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5707, |
| "loss/crossentropy": 2.1879894763231276, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19724611584097146, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.1485, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.4134765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5137, |
| "loss/crossentropy": 2.0121699988842012, |
| "loss/hidden": 3.653515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2113606294617057, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.14875, |
| "grad_norm": 30.375, |
| "grad_norm_var": 7.517643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5851, |
| "loss/crossentropy": 2.1624063462018968, |
| "loss/hidden": 3.3625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19148119539022446, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 40.5, |
| "grad_norm_var": 18.078580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4704, |
| "loss/crossentropy": 2.100359010696411, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18530675377696754, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.14925, |
| "grad_norm": 33.0, |
| "grad_norm_var": 106.165625, |
| "learning_rate": 0.0001, |
| "loss": 7.5243, |
| "loss/crossentropy": 2.088457000255585, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1950376622378826, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.1495, |
| "grad_norm": 35.5, |
| "grad_norm_var": 110.30618489583334, |
| "learning_rate": 0.0001, |
| "loss": 7.566, |
| "loss/crossentropy": 2.25458045899868, |
| "loss/hidden": 3.35, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.192095298320055, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.14975, |
| "grad_norm": 33.25, |
| "grad_norm_var": 21.141080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5733, |
| "loss/crossentropy": 2.2489352226257324, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19246331304311753, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.6747395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6863, |
| "loss/crossentropy": 2.0839652568101883, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20588702652603388, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.15025, |
| "grad_norm": 29.875, |
| "grad_norm_var": 9.457291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5802, |
| "loss/crossentropy": 2.1618823766708375, |
| "loss/hidden": 3.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19899031519889832, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.1505, |
| "grad_norm": 32.5, |
| "grad_norm_var": 10.827018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6105, |
| "loss/crossentropy": 2.0799810975790023, |
| "loss/hidden": 3.6, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20440249070525168, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.15075, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.2405598958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6337, |
| "loss/crossentropy": 2.131713417172432, |
| "loss/hidden": 3.492578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20057708621025086, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 31.125, |
| "grad_norm_var": 17.879166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7142, |
| "loss/crossentropy": 2.116998878121376, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2165115473791957, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.15125, |
| "grad_norm": 31.375, |
| "grad_norm_var": 19.562239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5782, |
| "loss/crossentropy": 2.187731945514679, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1970472853630781, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.1515, |
| "grad_norm": 51.0, |
| "grad_norm_var": 25.872330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5339, |
| "loss/crossentropy": 2.0746294870972632, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18855103328824044, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.15175, |
| "grad_norm": 31.75, |
| "grad_norm_var": 24.65390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6192, |
| "loss/crossentropy": 2.223821607232094, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20149823743849993, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.99765625, |
| "learning_rate": 0.0001, |
| "loss": 7.634, |
| "loss/crossentropy": 2.187976914644241, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19613044820725917, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.15225, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.8854166666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5475, |
| "loss/crossentropy": 2.2121584147214888, |
| "loss/hidden": 3.28828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1814923081547022, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.9692057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5725, |
| "loss/crossentropy": 2.189678418636322, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19301872439682483, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.15275, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.457747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5315, |
| "loss/crossentropy": 2.1482601583004, |
| "loss/hidden": 3.35703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1997425738722086, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.5072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.571, |
| "loss/crossentropy": 2.1208418533205986, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19502468593418598, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.15325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 17.070572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5239, |
| "loss/crossentropy": 2.160035288333893, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.189887585490942, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.1535, |
| "grad_norm": 30.625, |
| "grad_norm_var": 22.48515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5566, |
| "loss/crossentropy": 2.173004740476608, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2055901188403368, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.15375, |
| "grad_norm": 35.5, |
| "grad_norm_var": 16.00390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6958, |
| "loss/crossentropy": 2.0565848529338835, |
| "loss/hidden": 3.53203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22364541105926036, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 32.0, |
| "grad_norm_var": 11.542122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6093, |
| "loss/crossentropy": 2.085195133090019, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19985817223787308, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.15425, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.1666666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5611, |
| "loss/crossentropy": 2.1293098747730257, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19878034461289645, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.1545, |
| "grad_norm": 31.25, |
| "grad_norm_var": 20.2822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6405, |
| "loss/crossentropy": 2.2258552461862564, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20626316573470832, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.15475, |
| "grad_norm": 31.25, |
| "grad_norm_var": 20.730208333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4726, |
| "loss/crossentropy": 2.1775721326470374, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18398564979434012, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.1958333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5734, |
| "loss/crossentropy": 2.223562794923782, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.190413929335773, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.15525, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.166080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6525, |
| "loss/crossentropy": 2.085880035161972, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1801287617534399, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.1555, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.7035807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5305, |
| "loss/crossentropy": 2.1930001616477965, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1861238319426775, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.15575, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.468489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6889, |
| "loss/crossentropy": 2.22431803047657, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19135152641683817, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.770572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5177, |
| "loss/crossentropy": 2.093687379360199, |
| "loss/hidden": 3.599609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1883873265236616, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.1872395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6073, |
| "loss/crossentropy": 2.2236929804086687, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.213064675219357, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.1565, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.5947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5079, |
| "loss/crossentropy": 2.0539269253611563, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1872910862788558, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.15675, |
| "grad_norm": 40.75, |
| "grad_norm_var": 21.492708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.266616016626358, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1959751147776842, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 31.375, |
| "grad_norm_var": 25.213541666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.6284, |
| "loss/crossentropy": 2.2850263684988024, |
| "loss/hidden": 3.5203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2038384210318327, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.15725, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.0010416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5067, |
| "loss/crossentropy": 2.122395873069763, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18144308719784022, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.7462890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5297, |
| "loss/crossentropy": 1.9984511777758598, |
| "loss/hidden": 3.563671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19643332287669182, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.15775, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.603059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5455, |
| "loss/crossentropy": 2.029903215169907, |
| "loss/hidden": 3.496484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20556226037442685, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 32.0, |
| "grad_norm_var": 35.5853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.593, |
| "loss/crossentropy": 2.19532273709774, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1972879134118557, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.15825, |
| "grad_norm": 38.75, |
| "grad_norm_var": 18.0791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6415, |
| "loss/crossentropy": 2.0865643858909606, |
| "loss/hidden": 3.5046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021018836647272, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.1585, |
| "grad_norm": 30.875, |
| "grad_norm_var": 12.580989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.658, |
| "loss/crossentropy": 2.31261685192585, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027669247239828, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.15875, |
| "grad_norm": 31.875, |
| "grad_norm_var": 4.453580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5713, |
| "loss/crossentropy": 2.1463931113481522, |
| "loss/hidden": 3.4390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18860225863754748, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.1, |
| "learning_rate": 0.0001, |
| "loss": 7.5179, |
| "loss/crossentropy": 2.134768417477608, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19954264387488366, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.15925, |
| "grad_norm": 33.5, |
| "grad_norm_var": 7.157747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5755, |
| "loss/crossentropy": 2.144196245074272, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18948693461716176, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.1595, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.9546223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6119, |
| "loss/crossentropy": 2.147921970486641, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19279659036546945, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.15975, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.9129557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4797, |
| "loss/crossentropy": 2.1023626953363417, |
| "loss/hidden": 3.486328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1855178650468588, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.7577473958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5115, |
| "loss/crossentropy": 2.1468859046697615, |
| "loss/hidden": 3.398828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19039052687585353, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.16025, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.43515625, |
| "learning_rate": 0.0001, |
| "loss": 7.473, |
| "loss/crossentropy": 2.1335047364234923, |
| "loss/hidden": 3.349609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17829085066914557, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.1605, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.027083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5477, |
| "loss/crossentropy": 2.0842413723468782, |
| "loss/hidden": 3.537109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20616262052208184, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.16075, |
| "grad_norm": 33.75, |
| "grad_norm_var": 33.49791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5695, |
| "loss/crossentropy": 2.153767225146294, |
| "loss/hidden": 3.493359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20842270255088807, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 32.75, |
| "grad_norm_var": 15.109375, |
| "learning_rate": 0.0001, |
| "loss": 7.6661, |
| "loss/crossentropy": 2.0475671708583834, |
| "loss/hidden": 3.6484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2175968911498785, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.16125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 8402.958268229168, |
| "learning_rate": 0.0001, |
| "loss": 7.6392, |
| "loss/crossentropy": 2.2127752989530562, |
| "loss/hidden": 3.532421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21682111844420432, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.1615, |
| "grad_norm": 33.25, |
| "grad_norm_var": 24.9375, |
| "learning_rate": 0.0001, |
| "loss": 7.5701, |
| "loss/crossentropy": 2.1261292159557343, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20704152658581734, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.16175, |
| "grad_norm": 35.5, |
| "grad_norm_var": 30.0875, |
| "learning_rate": 0.0001, |
| "loss": 7.5408, |
| "loss/crossentropy": 2.124222718179226, |
| "loss/hidden": 3.5984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18728400766849518, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.1145833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5831, |
| "loss/crossentropy": 2.2058008939027784, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19860644564032554, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.16225, |
| "grad_norm": 31.875, |
| "grad_norm_var": 28.412239583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5668, |
| "loss/crossentropy": 2.1960059702396393, |
| "loss/hidden": 3.370703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1868469040840864, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.2416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4873, |
| "loss/crossentropy": 2.180153116583824, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18321371413767337, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.16275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.113997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5498, |
| "loss/crossentropy": 2.085334287583828, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20047369822859765, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 33.75, |
| "grad_norm_var": 14.7166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5356, |
| "loss/crossentropy": 2.081576499342918, |
| "loss/hidden": 3.52890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19775602114386856, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.16325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 13.976822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4858, |
| "loss/crossentropy": 2.0600350558757783, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18539784867316483, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.1635, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.846875, |
| "learning_rate": 0.0001, |
| "loss": 7.5135, |
| "loss/crossentropy": 2.117923478782177, |
| "loss/hidden": 3.5109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2071079235523939, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.16375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 21.288997395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5807, |
| "loss/crossentropy": 2.087580367922783, |
| "loss/hidden": 3.45390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18587088529020548, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 33.0, |
| "grad_norm_var": 5.292708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4736, |
| "loss/crossentropy": 2.060797114670277, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1906617671251297, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.16425, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.934375, |
| "learning_rate": 0.0001, |
| "loss": 7.5351, |
| "loss/crossentropy": 2.177751311659813, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20191191136837006, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.1645, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.4184895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6058, |
| "loss/crossentropy": 2.1490555882453917, |
| "loss/hidden": 3.52109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19388978108763694, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.16475, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.8478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5149, |
| "loss/crossentropy": 2.2254546850919725, |
| "loss/hidden": 3.306640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18252336494624616, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 38.5, |
| "grad_norm_var": 207.35201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6059, |
| "loss/crossentropy": 2.1390285924077035, |
| "loss/hidden": 3.303515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1844609746709466, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.16525, |
| "grad_norm": 35.0, |
| "grad_norm_var": 203.78098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5921, |
| "loss/crossentropy": 2.092223954200745, |
| "loss/hidden": 3.58828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17789147663861513, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.1655, |
| "grad_norm": 30.0, |
| "grad_norm_var": 65.76640625, |
| "learning_rate": 0.0001, |
| "loss": 7.558, |
| "loss/crossentropy": 2.138624146580696, |
| "loss/hidden": 3.508984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19836988989263774, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.16575, |
| "grad_norm": 30.875, |
| "grad_norm_var": 68.97337239583334, |
| "learning_rate": 0.0001, |
| "loss": 7.581, |
| "loss/crossentropy": 2.1231719397008417, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18597114123404027, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 32.25, |
| "grad_norm_var": 125.8041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5804, |
| "loss/crossentropy": 2.130907243490219, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17537819929420947, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.16625, |
| "grad_norm": 40.25, |
| "grad_norm_var": 399.22057291666664, |
| "learning_rate": 0.0001, |
| "loss": 7.6269, |
| "loss/crossentropy": 2.0258643075823786, |
| "loss/hidden": 3.596875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23089794162660837, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.1665, |
| "grad_norm": 31.0, |
| "grad_norm_var": 408.07233072916665, |
| "learning_rate": 0.0001, |
| "loss": 7.4616, |
| "loss/crossentropy": 2.23043432533741, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19158051013946534, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.16675, |
| "grad_norm": 32.0, |
| "grad_norm_var": 8.009830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5931, |
| "loss/crossentropy": 2.1956572234630585, |
| "loss/hidden": 3.50234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20411487035453318, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 34.5, |
| "grad_norm_var": 8.894791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6195, |
| "loss/crossentropy": 2.2672122746706007, |
| "loss/hidden": 3.309765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18957087770104408, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.16725, |
| "grad_norm": 31.0, |
| "grad_norm_var": 66.73333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5632, |
| "loss/crossentropy": 2.2279567658901214, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18730072304606438, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 42.0, |
| "grad_norm_var": 71.18854166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7089, |
| "loss/crossentropy": 2.282482776045799, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20141587276011705, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.16775, |
| "grad_norm": 31.5, |
| "grad_norm_var": 9.892122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3758, |
| "loss/crossentropy": 2.2182126119732857, |
| "loss/hidden": 3.28125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1786667076870799, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.6957682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5636, |
| "loss/crossentropy": 2.1939273923635483, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18930760622024537, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.16825, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.9009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5783, |
| "loss/crossentropy": 2.0783598124980927, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18662143610417842, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.1685, |
| "grad_norm": 35.5, |
| "grad_norm_var": 5.1353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5009, |
| "loss/crossentropy": 2.217881241440773, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19149041585624219, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.16875, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.264322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6346, |
| "loss/crossentropy": 2.1645133450627325, |
| "loss/hidden": 3.47890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20458366964012384, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 35.5, |
| "grad_norm_var": 4.1775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5251, |
| "loss/crossentropy": 2.171855625510216, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922192147001624, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.16925, |
| "grad_norm": 30.25, |
| "grad_norm_var": 50.66223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5615, |
| "loss/crossentropy": 2.269451642036438, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19260151647031307, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.1695, |
| "grad_norm": 32.5, |
| "grad_norm_var": 52.72057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5146, |
| "loss/crossentropy": 2.2029780715703966, |
| "loss/hidden": 3.32734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18055486269295215, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.16975, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.893684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6728, |
| "loss/crossentropy": 2.194507023692131, |
| "loss/hidden": 3.4390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21081659942865372, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 39.0, |
| "grad_norm_var": 9.028580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6095, |
| "loss/crossentropy": 2.228949736058712, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19297500066459178, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.17025, |
| "grad_norm": 51.25, |
| "grad_norm_var": 26.128125, |
| "learning_rate": 0.0001, |
| "loss": 7.5173, |
| "loss/crossentropy": 2.0145166903734206, |
| "loss/hidden": 3.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19940503742545843, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.1705, |
| "grad_norm": 30.0, |
| "grad_norm_var": 45.58932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5887, |
| "loss/crossentropy": 2.1203667253255842, |
| "loss/hidden": 3.576953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24707065224647523, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.17075, |
| "grad_norm": 32.5, |
| "grad_norm_var": 13.638997395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.1143920481204987, |
| "loss/hidden": 3.6578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2371676929295063, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 37.75, |
| "grad_norm_var": 11.876822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4937, |
| "loss/crossentropy": 2.3085153490304946, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19867490641772748, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.17125, |
| "grad_norm": 31.625, |
| "grad_norm_var": 7.063541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3836, |
| "loss/crossentropy": 2.2163452029228212, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19262240454554558, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.1715, |
| "grad_norm": 29.375, |
| "grad_norm_var": 7.3353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5696, |
| "loss/crossentropy": 2.1725872844457625, |
| "loss/hidden": 3.282421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18168828263878822, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.17175, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.418489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5374, |
| "loss/crossentropy": 2.1833366841077804, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18841250911355018, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 36.5, |
| "grad_norm_var": 21.055208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5021, |
| "loss/crossentropy": 2.166957159340382, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1870790719985962, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.17225, |
| "grad_norm": 33.75, |
| "grad_norm_var": 24.2744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.463, |
| "loss/crossentropy": 2.2468337625265122, |
| "loss/hidden": 3.28359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19030643235892059, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 27.875, |
| "grad_norm_var": 5.3275390625, |
| "learning_rate": 0.0001, |
| "loss": 7.453, |
| "loss/crossentropy": 2.1895847231149674, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19813059270381927, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.17275, |
| "grad_norm": 33.25, |
| "grad_norm_var": 12.654166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5002, |
| "loss/crossentropy": 2.2333322286605837, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20021227821707727, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 30.75, |
| "grad_norm_var": 11.69765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4872, |
| "loss/crossentropy": 2.0374642267823218, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18903901670128107, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.17325, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.220572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5288, |
| "loss/crossentropy": 2.2860298246145248, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20242451801896094, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.1735, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.9811848958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6115, |
| "loss/crossentropy": 2.174576237797737, |
| "loss/hidden": 3.502734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20389086604118348, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.17375, |
| "grad_norm": 29.5, |
| "grad_norm_var": 5.17265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6028, |
| "loss/crossentropy": 2.160076954960823, |
| "loss/hidden": 3.35703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18584340140223504, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.4150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6249, |
| "loss/crossentropy": 2.2141772389411924, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2044496938586235, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.17425, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.63125, |
| "learning_rate": 0.0001, |
| "loss": 7.5556, |
| "loss/crossentropy": 2.1613443583250045, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20456040017306804, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.1745, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.9184895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5504, |
| "loss/crossentropy": 2.179374423623085, |
| "loss/hidden": 3.309375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18155127875506877, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.17475, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.1302083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4596, |
| "loss/crossentropy": 2.2060728073120117, |
| "loss/hidden": 3.326171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18002954982221125, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 27.875, |
| "grad_norm_var": 2.27265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5016, |
| "loss/crossentropy": 2.1729022413492203, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20215826146304608, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.17525, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.187239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5186, |
| "loss/crossentropy": 2.1925390481948854, |
| "loss/hidden": 3.564453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21080133505165577, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.1755, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.3681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6812, |
| "loss/crossentropy": 2.1888799130916596, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1895390186458826, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.17575, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.4625, |
| "learning_rate": 0.0001, |
| "loss": 7.4658, |
| "loss/crossentropy": 2.1911296755075456, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18743323888629676, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 36.75, |
| "grad_norm_var": 4.143489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5154, |
| "loss/crossentropy": 2.2533592522144317, |
| "loss/hidden": 3.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18201965987682342, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.17625, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.7556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4748, |
| "loss/crossentropy": 2.259921830892563, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965817864984274, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.1765, |
| "grad_norm": 32.75, |
| "grad_norm_var": 76.703125, |
| "learning_rate": 0.0001, |
| "loss": 7.6429, |
| "loss/crossentropy": 2.18757144510746, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18871748261153698, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.17675, |
| "grad_norm": 29.125, |
| "grad_norm_var": 18.242708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4899, |
| "loss/crossentropy": 2.1853384137153626, |
| "loss/hidden": 3.356640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18886385671794415, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 33.5, |
| "grad_norm_var": 17.101497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5441, |
| "loss/crossentropy": 2.1331110268831255, |
| "loss/hidden": 3.295703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1878782594576478, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.17725, |
| "grad_norm": 35.25, |
| "grad_norm_var": 6.013997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.463, |
| "loss/crossentropy": 2.252199110388756, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.197531633451581, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.199739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5179, |
| "loss/crossentropy": 2.1631226271390913, |
| "loss/hidden": 3.53828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20652975142002106, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.17775, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.31015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5643, |
| "loss/crossentropy": 2.153787222504616, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2014656089246273, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.601822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5516, |
| "loss/crossentropy": 2.1199822768568994, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1905859999358654, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.17825, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.4760416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5994, |
| "loss/crossentropy": 2.26395897269249, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19645936116576196, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.1785, |
| "grad_norm": 36.25, |
| "grad_norm_var": 3.8854166666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5969, |
| "loss/crossentropy": 2.1285897165536882, |
| "loss/hidden": 3.4140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18263876978307964, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.17875, |
| "grad_norm": 34.0, |
| "grad_norm_var": 4.657291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5153, |
| "loss/crossentropy": 2.127173164486885, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19018456861376762, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 31.125, |
| "grad_norm_var": 4.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5891, |
| "loss/crossentropy": 2.187330016493797, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19513612650334836, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.17925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.4479166666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5653, |
| "loss/crossentropy": 2.0964292854070665, |
| "loss/hidden": 3.467578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20541836731135846, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.1795, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.4025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5849, |
| "loss/crossentropy": 2.1456298559904097, |
| "loss/hidden": 3.515234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21056269146502019, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.17975, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.78515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6594, |
| "loss/crossentropy": 2.0873524725437163, |
| "loss/hidden": 3.595703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2302736073732376, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.4697916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5801, |
| "loss/crossentropy": 2.0368966817855836, |
| "loss/hidden": 3.572265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20361636602319777, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.18025, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.9942057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6054, |
| "loss/crossentropy": 2.079051211476326, |
| "loss/hidden": 3.55546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21228218004107474, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.1805, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.237955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6754, |
| "loss/crossentropy": 2.130552776157856, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1921296551823616, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.18075, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.526497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5296, |
| "loss/crossentropy": 2.0738827764987944, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19692382737994193, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.584375, |
| "learning_rate": 0.0001, |
| "loss": 7.6061, |
| "loss/crossentropy": 2.1579408079385756, |
| "loss/hidden": 3.51953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21399259492754935, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.18125, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.8416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6824, |
| "loss/crossentropy": 2.221544751524925, |
| "loss/hidden": 3.53203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2030305091291666, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.1815, |
| "grad_norm": 35.5, |
| "grad_norm_var": 1.8848307291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5215, |
| "loss/crossentropy": 2.180348289012909, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2144750364124775, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.18175, |
| "grad_norm": 53.25, |
| "grad_norm_var": 31.4353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6027, |
| "loss/crossentropy": 2.1652086317539214, |
| "loss/hidden": 3.4296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20979799777269365, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 31.375, |
| "grad_norm_var": 31.370768229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5252, |
| "loss/crossentropy": 2.0706035763025286, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18490511737763882, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.18225, |
| "grad_norm": 32.25, |
| "grad_norm_var": 8.717643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5688, |
| "loss/crossentropy": 2.2949971139431, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20526532903313638, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 32.0, |
| "grad_norm_var": 4.91640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5752, |
| "loss/crossentropy": 2.2193857818841933, |
| "loss/hidden": 3.330078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1804453806951642, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.18275, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.701041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6028, |
| "loss/crossentropy": 2.207156080007553, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2132449522614479, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.289322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5818, |
| "loss/crossentropy": 2.271843919157982, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19545750357210637, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.18325, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.0124348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.502, |
| "loss/crossentropy": 2.1379879862070084, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1861495029181242, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.1835, |
| "grad_norm": 31.25, |
| "grad_norm_var": 0.7556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5501, |
| "loss/crossentropy": 2.080298659205437, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18716856762766837, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.18375, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.450455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3934, |
| "loss/crossentropy": 2.0731329679489137, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17404056414961816, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.0947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5861, |
| "loss/crossentropy": 2.0820507287979124, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1926917627453804, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.18425, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.1624348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5492, |
| "loss/crossentropy": 2.2114842593669892, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2127979423850775, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.1845, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.882291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5255, |
| "loss/crossentropy": 2.075804352760315, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18906766772270203, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.18475, |
| "grad_norm": 35.75, |
| "grad_norm_var": 34.35729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6434, |
| "loss/crossentropy": 2.136875703930855, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18598171565681695, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.624739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6191, |
| "loss/crossentropy": 2.170763599872589, |
| "loss/hidden": 3.5109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2151478011161089, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.18525, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.051497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4116, |
| "loss/crossentropy": 2.143643561005592, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19506660383194685, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.1855, |
| "grad_norm": 36.0, |
| "grad_norm_var": 5.089322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.534, |
| "loss/crossentropy": 2.1735941752791406, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17996873259544371, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.18575, |
| "grad_norm": 29.0, |
| "grad_norm_var": 4.81640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5846, |
| "loss/crossentropy": 2.15124252140522, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1944795411080122, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.7900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5394, |
| "loss/crossentropy": 2.189228793978691, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19658807516098023, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.18625, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.1572916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5156, |
| "loss/crossentropy": 2.1805285453796386, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20203232783824204, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.1865, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5803, |
| "loss/crossentropy": 2.031705692410469, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19695232547819613, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.18675, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.6389973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5008, |
| "loss/crossentropy": 2.0725282967090606, |
| "loss/hidden": 3.53515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19746688194572926, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 33.75, |
| "grad_norm_var": 30.643684895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.5746, |
| "loss/crossentropy": 2.2134982645511627, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2143145205453038, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.18725, |
| "grad_norm": 28.75, |
| "grad_norm_var": 173.95358072916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5472, |
| "loss/crossentropy": 2.2404279142618178, |
| "loss/hidden": 3.349609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18395285904407502, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 33.0, |
| "grad_norm_var": 295.3212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6232, |
| "loss/crossentropy": 2.1231855720281603, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19835165105760097, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.18775, |
| "grad_norm": 33.5, |
| "grad_norm_var": 9.3119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5797, |
| "loss/crossentropy": 2.1263521701097488, |
| "loss/hidden": 3.39140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18789808079600334, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 29.75, |
| "grad_norm_var": 30.567708333333332, |
| "learning_rate": 0.0001, |
| "loss": 7.544, |
| "loss/crossentropy": 2.2000538021326066, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19643234014511107, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.18825, |
| "grad_norm": 30.75, |
| "grad_norm_var": 26.5056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4594, |
| "loss/crossentropy": 2.1121160596609116, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18368990309536457, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.1885, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.661393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4737, |
| "loss/crossentropy": 2.2245935589075088, |
| "loss/hidden": 3.24140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17950727473944425, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.18875, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.11015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5744, |
| "loss/crossentropy": 2.2126697182655333, |
| "loss/hidden": 3.308203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19009165093302727, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.104622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.0755111388862133, |
| "loss/hidden": 3.523046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20379672143608332, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.18925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 16.618684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.528, |
| "loss/crossentropy": 2.205251544713974, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18352425061166286, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.1895, |
| "grad_norm": 29.875, |
| "grad_norm_var": 12.086393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4678, |
| "loss/crossentropy": 2.0505823358893394, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2034569911658764, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.18975, |
| "grad_norm": 35.0, |
| "grad_norm_var": 15.94140625, |
| "learning_rate": 0.0001, |
| "loss": 7.506, |
| "loss/crossentropy": 2.2195076823234556, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19098728708922863, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 31.375, |
| "grad_norm_var": 16.2853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4491, |
| "loss/crossentropy": 2.185887323319912, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19378383718430997, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.19025, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.0389973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5015, |
| "loss/crossentropy": 2.057955393195152, |
| "loss/hidden": 3.536328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20849357955157757, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.1905, |
| "grad_norm": 31.375, |
| "grad_norm_var": 17.470833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6086, |
| "loss/crossentropy": 2.1172866210341454, |
| "loss/hidden": 3.476171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20152031816542149, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.19075, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5686, |
| "loss/crossentropy": 2.2929946899414064, |
| "loss/hidden": 3.29765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18747647628188133, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 37.25, |
| "grad_norm_var": 40.75807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6427, |
| "loss/crossentropy": 2.180236041545868, |
| "loss/hidden": 3.54140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20883531272411346, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.19125, |
| "grad_norm": 29.625, |
| "grad_norm_var": 40.85520833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4917, |
| "loss/crossentropy": 2.1129625350236894, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18379813842475415, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.1915, |
| "grad_norm": 39.75, |
| "grad_norm_var": 6.581184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5268, |
| "loss/crossentropy": 2.228352552652359, |
| "loss/hidden": 3.460546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2048925407230854, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.19175, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.0181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5951, |
| "loss/crossentropy": 2.2298452496528625, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21614472325891257, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.6025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5215, |
| "loss/crossentropy": 2.167386993765831, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897386133670807, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.19225, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.5754557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4768, |
| "loss/crossentropy": 2.1173581033945084, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19473140574991704, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 38.0, |
| "grad_norm_var": 16.32890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5827, |
| "loss/crossentropy": 2.205728626251221, |
| "loss/hidden": 3.40859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20203282944858075, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.19275, |
| "grad_norm": 7583301632.0, |
| "grad_norm_var": 3.594153946076571e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4851, |
| "loss/crossentropy": 2.0537545680999756, |
| "loss/hidden": 3.4953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19121616017073392, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.594153947356253e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6091, |
| "loss/crossentropy": 2.2010452926158903, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19013969153165816, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.19325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 127.8494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5129, |
| "loss/crossentropy": 2.1829321801662447, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18250775039196016, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.1935, |
| "grad_norm": 32.5, |
| "grad_norm_var": 6.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5028, |
| "loss/crossentropy": 2.059948954731226, |
| "loss/hidden": 3.469921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1829567258246243, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.19375, |
| "grad_norm": 34.25, |
| "grad_norm_var": 65.40305989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.5152, |
| "loss/crossentropy": 2.113444189727306, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19062405563890933, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 32.25, |
| "grad_norm_var": 61.19166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5399, |
| "loss/crossentropy": 2.0975135535001757, |
| "loss/hidden": 3.5359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21106049697846174, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.19425, |
| "grad_norm": 55.5, |
| "grad_norm_var": 35.5087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6208, |
| "loss/crossentropy": 2.168052741885185, |
| "loss/hidden": 3.458984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20290581844747066, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.1945, |
| "grad_norm": 32.5, |
| "grad_norm_var": 34.890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6052, |
| "loss/crossentropy": 2.203343018889427, |
| "loss/hidden": 3.51953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2262148879468441, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.19475, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.90390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5521, |
| "loss/crossentropy": 2.096657195687294, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19364065770059824, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.4916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.607, |
| "loss/crossentropy": 2.0858742713928224, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.195220067165792, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.19525, |
| "grad_norm": 30.5, |
| "grad_norm_var": 30.91640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5047, |
| "loss/crossentropy": 2.1515824437141418, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19715734478086233, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.1955, |
| "grad_norm": 34.75, |
| "grad_norm_var": 29.525455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5755, |
| "loss/crossentropy": 2.197349172830582, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922721391543746, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.19575, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.6962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5252, |
| "loss/crossentropy": 2.1433851540088655, |
| "loss/hidden": 3.53125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2196674931794405, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 31.625, |
| "grad_norm_var": 4.008072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5378, |
| "loss/crossentropy": 2.151882603764534, |
| "loss/hidden": 3.528125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2241852417588234, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.19625, |
| "grad_norm": 31.125, |
| "grad_norm_var": 23.979166666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.4767, |
| "loss/crossentropy": 2.100368928909302, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18492705803364515, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.1965, |
| "grad_norm": 35.25, |
| "grad_norm_var": 4.2900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5525, |
| "loss/crossentropy": 2.1549850702285767, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19525696001946927, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.19675, |
| "grad_norm": 29.125, |
| "grad_norm_var": 3.1264973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4894, |
| "loss/crossentropy": 2.085528630018234, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19448419120162724, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.0509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6104, |
| "loss/crossentropy": 2.197956010699272, |
| "loss/hidden": 3.54609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20526408050209283, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.19725, |
| "grad_norm": 28.5, |
| "grad_norm_var": 4.287955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5082, |
| "loss/crossentropy": 2.093092533946037, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18932582587003707, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.9583333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5156, |
| "loss/crossentropy": 2.1209671765565874, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18670345041900874, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.19775, |
| "grad_norm": 42.75, |
| "grad_norm_var": 14.573893229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.628, |
| "loss/crossentropy": 2.2129906579852103, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18701824955642224, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 29.375, |
| "grad_norm_var": 13.91015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5283, |
| "loss/crossentropy": 2.063839703798294, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2156384490430355, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.19825, |
| "grad_norm": 30.0, |
| "grad_norm_var": 13.59140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5872, |
| "loss/crossentropy": 2.150103223323822, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093698762357235, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.1985, |
| "grad_norm": 30.75, |
| "grad_norm_var": 8.720833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5207, |
| "loss/crossentropy": 2.1579747438430785, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20063298791646958, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.19875, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.7270182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5105, |
| "loss/crossentropy": 2.104949194192886, |
| "loss/hidden": 3.462890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18702716194093227, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.7018229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6138, |
| "loss/crossentropy": 2.1542328625917433, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1854570686817169, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.19925, |
| "grad_norm": 36.25, |
| "grad_norm_var": 78.99889322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5929, |
| "loss/crossentropy": 2.196703353524208, |
| "loss/hidden": 3.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20412985123693944, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.1995, |
| "grad_norm": 33.0, |
| "grad_norm_var": 79.22083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5479, |
| "loss/crossentropy": 2.1077481478452684, |
| "loss/hidden": 3.441796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20956829860806464, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.19975, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.403059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5393, |
| "loss/crossentropy": 2.2737906739115714, |
| "loss/hidden": 3.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19662595726549625, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 31.625, |
| "grad_norm_var": 7.572916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.576, |
| "loss/crossentropy": 2.187049573659897, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929944284260273, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.20025, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.81875, |
| "learning_rate": 0.0001, |
| "loss": 7.584, |
| "loss/crossentropy": 2.1443400979042053, |
| "loss/hidden": 3.587890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21116771101951598, |
| "step": 8010 |
| }, |
| { |
| "epoch": 0.2005, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.2309895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.619, |
| "loss/crossentropy": 2.1093345403671266, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19506766367703676, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.20075, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.2285807291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6101, |
| "loss/crossentropy": 2.3067246288061143, |
| "loss/hidden": 3.368359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21037033908069133, |
| "step": 8030 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 31.25, |
| "grad_norm_var": 8.986393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5813, |
| "loss/crossentropy": 2.0566743701696395, |
| "loss/hidden": 3.6375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21147819980978966, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.20125, |
| "grad_norm": 30.875, |
| "grad_norm_var": 58.540625, |
| "learning_rate": 0.0001, |
| "loss": 7.6354, |
| "loss/crossentropy": 2.119761574268341, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21174631416797637, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.2015, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.9830729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5129, |
| "loss/crossentropy": 2.098930720984936, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1940738322213292, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.20175, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.82265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5747, |
| "loss/crossentropy": 2.1186475455760956, |
| "loss/hidden": 3.55859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19831476360559464, |
| "step": 8070 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.5416666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5639, |
| "loss/crossentropy": 2.2280248433351515, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18766701072454453, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.20225, |
| "grad_norm": 31.375, |
| "grad_norm_var": 29.02265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4765, |
| "loss/crossentropy": 2.135748690366745, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20170771703124046, |
| "step": 8090 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 38.25, |
| "grad_norm_var": 4.775, |
| "learning_rate": 0.0001, |
| "loss": 7.6228, |
| "loss/crossentropy": 2.08721085190773, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20192455761134626, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.20275, |
| "grad_norm": 29.5, |
| "grad_norm_var": 15.781184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5354, |
| "loss/crossentropy": 2.230104002356529, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20827409140765668, |
| "step": 8110 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.0233723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4638, |
| "loss/crossentropy": 2.071944323182106, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1925355602055788, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.20325, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.0625, |
| "learning_rate": 0.0001, |
| "loss": 7.5385, |
| "loss/crossentropy": 2.0866906702518464, |
| "loss/hidden": 3.621484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2152187094092369, |
| "step": 8130 |
| }, |
| { |
| "epoch": 0.2035, |
| "grad_norm": 31.625, |
| "grad_norm_var": 6.661458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5217, |
| "loss/crossentropy": 2.1989782720804216, |
| "loss/hidden": 3.502734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1997136753052473, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.20375, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.3197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4881, |
| "loss/crossentropy": 2.0995738029479982, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18900278601795434, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 30.75, |
| "grad_norm_var": 0.8395833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5243, |
| "loss/crossentropy": 2.0287352964282035, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19610616452991964, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.20425, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.362239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5562, |
| "loss/crossentropy": 2.165475571155548, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19938987717032433, |
| "step": 8170 |
| }, |
| { |
| "epoch": 0.2045, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.571875, |
| "learning_rate": 0.0001, |
| "loss": 7.5421, |
| "loss/crossentropy": 2.2981997221708297, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18277304694056512, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.20475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 4.543489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5629, |
| "loss/crossentropy": 2.0597861796617507, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20162123087793588, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.0572916666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5376, |
| "loss/crossentropy": 2.1517196536064147, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19018489569425584, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.20525, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.4994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4639, |
| "loss/crossentropy": 2.2632179886102675, |
| "loss/hidden": 3.25625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18489395044744014, |
| "step": 8210 |
| }, |
| { |
| "epoch": 0.2055, |
| "grad_norm": 36.5, |
| "grad_norm_var": 2.61640625, |
| "learning_rate": 0.0001, |
| "loss": 7.7838, |
| "loss/crossentropy": 2.1116667434573175, |
| "loss/hidden": 3.573828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19244479853659868, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.20575, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.8582682291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5409, |
| "loss/crossentropy": 2.1676986277103425, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19751499071717263, |
| "step": 8230 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 35.5, |
| "grad_norm_var": 3.9962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5365, |
| "loss/crossentropy": 2.2388996213674544, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19586583338677882, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.20625, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.229166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4937, |
| "loss/crossentropy": 2.173008766770363, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18703010510653256, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.2065, |
| "grad_norm": 31.625, |
| "grad_norm_var": 14.615625, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.1825241267681124, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19528221413493158, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.20675, |
| "grad_norm": 30.375, |
| "grad_norm_var": 8.207747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.479, |
| "loss/crossentropy": 2.0385641396045684, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20498643592000007, |
| "step": 8270 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 36.25, |
| "grad_norm_var": 5.3650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5835, |
| "loss/crossentropy": 2.2289943635463714, |
| "loss/hidden": 3.53046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20597830023616553, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.20725, |
| "grad_norm": 32.5, |
| "grad_norm_var": 72.63723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4999, |
| "loss/crossentropy": 2.201861135661602, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1963998381048441, |
| "step": 8290 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 33.5, |
| "grad_norm_var": 75.33932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4931, |
| "loss/crossentropy": 2.0565819829702376, |
| "loss/hidden": 3.587890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2153871137648821, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.20775, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.57265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5969, |
| "loss/crossentropy": 2.2017314821481704, |
| "loss/hidden": 3.509765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20366298444569111, |
| "step": 8310 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 31.5, |
| "grad_norm_var": 93.0625, |
| "learning_rate": 0.0001, |
| "loss": 7.5622, |
| "loss/crossentropy": 2.139016662538052, |
| "loss/hidden": 3.40859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18943111039698124, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.20825, |
| "grad_norm": 48.25, |
| "grad_norm_var": 20.628125, |
| "learning_rate": 0.0001, |
| "loss": 7.4914, |
| "loss/crossentropy": 2.1512654572725296, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20672829449176788, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.2085, |
| "grad_norm": 32.25, |
| "grad_norm_var": 19.8759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5845, |
| "loss/crossentropy": 2.0606055706739426, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20263293255120515, |
| "step": 8340 |
| }, |
| { |
| "epoch": 0.20875, |
| "grad_norm": 31.375, |
| "grad_norm_var": 55.4369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4669, |
| "loss/crossentropy": 2.074323023855686, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17513209469616414, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 29.5, |
| "grad_norm_var": 54.5603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5259, |
| "loss/crossentropy": 2.197512632608414, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20681734681129454, |
| "step": 8360 |
| }, |
| { |
| "epoch": 0.20925, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.138541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.58, |
| "loss/crossentropy": 2.2005858927965165, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2138770181685686, |
| "step": 8370 |
| }, |
| { |
| "epoch": 0.2095, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.381184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.604, |
| "loss/crossentropy": 2.2078860282897947, |
| "loss/hidden": 3.570703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21495202817022802, |
| "step": 8380 |
| }, |
| { |
| "epoch": 0.20975, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.4931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5081, |
| "loss/crossentropy": 2.1586494892835617, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19594298098236324, |
| "step": 8390 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.05390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5932, |
| "loss/crossentropy": 2.1475181549787523, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19644266795367002, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.21025, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.1671223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4963, |
| "loss/crossentropy": 2.2497402161359785, |
| "loss/hidden": 3.29296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083448998630046, |
| "step": 8410 |
| }, |
| { |
| "epoch": 0.2105, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.269205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4909, |
| "loss/crossentropy": 2.1163347721099854, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19131154641509057, |
| "step": 8420 |
| }, |
| { |
| "epoch": 0.21075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.568489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5108, |
| "loss/crossentropy": 2.1301738530397416, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18502578027546407, |
| "step": 8430 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 30.625, |
| "grad_norm_var": 15.867708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5275, |
| "loss/crossentropy": 2.127982833981514, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027182461693883, |
| "step": 8440 |
| }, |
| { |
| "epoch": 0.21125, |
| "grad_norm": 29.625, |
| "grad_norm_var": 18.677018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5249, |
| "loss/crossentropy": 2.1977096766233446, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20560352243483065, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.2115, |
| "grad_norm": 29.75, |
| "grad_norm_var": 16.110872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4708, |
| "loss/crossentropy": 2.1555270701646805, |
| "loss/hidden": 3.3078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18343626484274864, |
| "step": 8460 |
| }, |
| { |
| "epoch": 0.21175, |
| "grad_norm": 29.5, |
| "grad_norm_var": 25.607291666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4635, |
| "loss/crossentropy": 2.1936387956142425, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21095133386552334, |
| "step": 8470 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 33.0, |
| "grad_norm_var": 16.8197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5373, |
| "loss/crossentropy": 2.231749877333641, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943896021693945, |
| "step": 8480 |
| }, |
| { |
| "epoch": 0.21225, |
| "grad_norm": 31.75, |
| "grad_norm_var": 7.643489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5582, |
| "loss/crossentropy": 2.171339076757431, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20933733694255352, |
| "step": 8490 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 36.5, |
| "grad_norm_var": 3.863541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5613, |
| "loss/crossentropy": 2.170750407129526, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19806241411715747, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.21275, |
| "grad_norm": 30.625, |
| "grad_norm_var": 6.814518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.475, |
| "loss/crossentropy": 2.1906991213560105, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19974222220480442, |
| "step": 8510 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.0955729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5352, |
| "loss/crossentropy": 2.101140005886555, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20742072239518167, |
| "step": 8520 |
| }, |
| { |
| "epoch": 0.21325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.234375, |
| "learning_rate": 0.0001, |
| "loss": 7.6097, |
| "loss/crossentropy": 2.142820453643799, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011982973664999, |
| "step": 8530 |
| }, |
| { |
| "epoch": 0.2135, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.988541666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5074, |
| "loss/crossentropy": 2.274955728650093, |
| "loss/hidden": 3.254296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18379125297069548, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.21375, |
| "grad_norm": 29.875, |
| "grad_norm_var": 5.245572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4441, |
| "loss/crossentropy": 2.2132862359285355, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20189897604286672, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.3567057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5183, |
| "loss/crossentropy": 2.1324101746082307, |
| "loss/hidden": 3.542578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2091756235808134, |
| "step": 8560 |
| }, |
| { |
| "epoch": 0.21425, |
| "grad_norm": 37.25, |
| "grad_norm_var": 26.8353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5208, |
| "loss/crossentropy": 2.2107133328914643, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19691390544176102, |
| "step": 8570 |
| }, |
| { |
| "epoch": 0.2145, |
| "grad_norm": 32.25, |
| "grad_norm_var": 8.3525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4517, |
| "loss/crossentropy": 2.1903372198343276, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21582610420882703, |
| "step": 8580 |
| }, |
| { |
| "epoch": 0.21475, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.2587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5885, |
| "loss/crossentropy": 2.1499848544597624, |
| "loss/hidden": 3.508984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2200299922376871, |
| "step": 8590 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 44.75, |
| "grad_norm_var": 11.795833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5716, |
| "loss/crossentropy": 2.2607037901878355, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19387190453708172, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.21525, |
| "grad_norm": 31.875, |
| "grad_norm_var": 13.584830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4842, |
| "loss/crossentropy": 2.2367573767900466, |
| "loss/hidden": 3.291796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1797945935279131, |
| "step": 8610 |
| }, |
| { |
| "epoch": 0.2155, |
| "grad_norm": 31.0, |
| "grad_norm_var": 6.67890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5594, |
| "loss/crossentropy": 2.132373479008675, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19787863213568926, |
| "step": 8620 |
| }, |
| { |
| "epoch": 0.21575, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.5483723958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5224, |
| "loss/crossentropy": 2.1131544440984724, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18490277007222175, |
| "step": 8630 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 28.125, |
| "grad_norm_var": 2.9827473958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5298, |
| "loss/crossentropy": 2.1318382740020754, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20170295760035514, |
| "step": 8640 |
| }, |
| { |
| "epoch": 0.21625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.6635416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5196, |
| "loss/crossentropy": 2.143068727850914, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2085008706897497, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.2165, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.72265625, |
| "learning_rate": 0.0001, |
| "loss": 7.583, |
| "loss/crossentropy": 2.191588431596756, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19564641676843167, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.21675, |
| "grad_norm": 36.75, |
| "grad_norm_var": 5.292122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5915, |
| "loss/crossentropy": 2.1791785418987275, |
| "loss/hidden": 3.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19772212151437998, |
| "step": 8670 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.042708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.496, |
| "loss/crossentropy": 2.1210457414388655, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20252102315425874, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.21725, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.02265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4529, |
| "loss/crossentropy": 2.1803869009017944, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19335724450647832, |
| "step": 8690 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.9624348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5914, |
| "loss/crossentropy": 2.134222483634949, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19238188080489635, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.21775, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.13515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.1801154255867004, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19446788169443607, |
| "step": 8710 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.561, |
| "loss/crossentropy": 2.1410862773656847, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20139614418148993, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.21825, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.0707682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.527, |
| "loss/crossentropy": 2.137886345386505, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2020585484802723, |
| "step": 8730 |
| }, |
| { |
| "epoch": 0.2185, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.278125, |
| "learning_rate": 0.0001, |
| "loss": 7.5367, |
| "loss/crossentropy": 2.1495157063007353, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2042895916849375, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.108072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5061, |
| "loss/crossentropy": 2.2290263891220095, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18052256256341934, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.0518229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5422, |
| "loss/crossentropy": 2.204638335108757, |
| "loss/hidden": 3.273046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1816064776852727, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.21925, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.0259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5936, |
| "loss/crossentropy": 2.173864471912384, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2168831005692482, |
| "step": 8770 |
| }, |
| { |
| "epoch": 0.2195, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.4957682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6125, |
| "loss/crossentropy": 2.2631371796131132, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1991407833993435, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.21975, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.62890625, |
| "learning_rate": 0.0001, |
| "loss": 7.486, |
| "loss/crossentropy": 2.156681847572327, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18595759831368924, |
| "step": 8790 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 33.5, |
| "grad_norm_var": 1.2202473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4714, |
| "loss/crossentropy": 2.069344013929367, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19667117949575186, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.22025, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.7875, |
| "learning_rate": 0.0001, |
| "loss": 7.6153, |
| "loss/crossentropy": 2.159384399652481, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19717039167881012, |
| "step": 8810 |
| }, |
| { |
| "epoch": 0.2205, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.7080729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5037, |
| "loss/crossentropy": 2.06855808198452, |
| "loss/hidden": 3.596875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19925107434391975, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.22075, |
| "grad_norm": 33.0, |
| "grad_norm_var": 14.142122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.61, |
| "loss/crossentropy": 2.1856627821922303, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19198091998696326, |
| "step": 8830 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 29.75, |
| "grad_norm_var": 23.12265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5265, |
| "loss/crossentropy": 2.215584135055542, |
| "loss/hidden": 3.360546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.193273763731122, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.22125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 23.365625, |
| "learning_rate": 0.0001, |
| "loss": 7.6189, |
| "loss/crossentropy": 2.217027261853218, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1961042732000351, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.2215, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.9160807291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.441, |
| "loss/crossentropy": 2.1981815114617347, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19394037276506423, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.22175, |
| "grad_norm": 27.75, |
| "grad_norm_var": 9.90390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4553, |
| "loss/crossentropy": 2.112216001749039, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18079342246055602, |
| "step": 8870 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 31.375, |
| "grad_norm_var": 7.8384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4818, |
| "loss/crossentropy": 2.101804518699646, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1867048390209675, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.22225, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.934309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5937, |
| "loss/crossentropy": 2.193597176671028, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20191702879965306, |
| "step": 8890 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 30.0, |
| "grad_norm_var": 32.552083333333336, |
| "learning_rate": 0.0001, |
| "loss": 7.4298, |
| "loss/crossentropy": 2.2074760258197785, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19682303816080093, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.22275, |
| "grad_norm": 36.0, |
| "grad_norm_var": 28.680143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5468, |
| "loss/crossentropy": 2.008703652024269, |
| "loss/hidden": 3.46015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2202893177047372, |
| "step": 8910 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 33.25, |
| "grad_norm_var": 10.238541666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5334, |
| "loss/crossentropy": 2.0829237014055253, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1849207304418087, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.22325, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.3676432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5348, |
| "loss/crossentropy": 2.1565073817968368, |
| "loss/hidden": 3.37109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19267312660813332, |
| "step": 8930 |
| }, |
| { |
| "epoch": 0.2235, |
| "grad_norm": 42.5, |
| "grad_norm_var": 14.748372395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5472, |
| "loss/crossentropy": 2.0750382035970687, |
| "loss/hidden": 3.515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1899881549179554, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.22375, |
| "grad_norm": 35.5, |
| "grad_norm_var": 12.167122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6047, |
| "loss/crossentropy": 2.1998844176530836, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992990154772997, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 39.5, |
| "grad_norm_var": 8.703125, |
| "learning_rate": 0.0001, |
| "loss": 7.5236, |
| "loss/crossentropy": 2.1398618072271347, |
| "loss/hidden": 3.38671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2044746194034815, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.22425, |
| "grad_norm": 29.5, |
| "grad_norm_var": 7.711458333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5588, |
| "loss/crossentropy": 2.0922764956951143, |
| "loss/hidden": 3.363671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18962055966258048, |
| "step": 8970 |
| }, |
| { |
| "epoch": 0.2245, |
| "grad_norm": 38.25, |
| "grad_norm_var": 8.417122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5705, |
| "loss/crossentropy": 2.2088142544031144, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2040243223309517, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.22475, |
| "grad_norm": 31.25, |
| "grad_norm_var": 6.708268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5883, |
| "loss/crossentropy": 2.0866949379444124, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21130447909235955, |
| "step": 8990 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 28.375, |
| "grad_norm_var": 7.254166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5008, |
| "loss/crossentropy": 2.117489975690842, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18843676745891572, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.22525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 18.155143229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.4718, |
| "loss/crossentropy": 2.1349531918764115, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18441876713186503, |
| "step": 9010 |
| }, |
| { |
| "epoch": 0.2255, |
| "grad_norm": 30.25, |
| "grad_norm_var": 7.642708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4722, |
| "loss/crossentropy": 2.158918860554695, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1831710446625948, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.22575, |
| "grad_norm": 30.625, |
| "grad_norm_var": 5.802083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5065, |
| "loss/crossentropy": 2.1577743917703627, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19996861293911933, |
| "step": 9030 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.050455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.54, |
| "loss/crossentropy": 2.1675800561904905, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19423422291874887, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.22625, |
| "grad_norm": 30.25, |
| "grad_norm_var": 8.01015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5053, |
| "loss/crossentropy": 2.1748566299676897, |
| "loss/hidden": 3.298046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19081774912774563, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.2265, |
| "grad_norm": 107.5, |
| "grad_norm_var": 363.3160807291667, |
| "learning_rate": 0.0001, |
| "loss": 7.5204, |
| "loss/crossentropy": 2.1024440199136736, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19131676983088255, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.22675, |
| "grad_norm": 43.0, |
| "grad_norm_var": 382.7749348958333, |
| "learning_rate": 0.0001, |
| "loss": 7.4636, |
| "loss/crossentropy": 2.0358673214912413, |
| "loss/hidden": 3.53671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20718471184372902, |
| "step": 9070 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 38.0, |
| "grad_norm_var": 34.826497395833336, |
| "learning_rate": 0.0001, |
| "loss": 7.5132, |
| "loss/crossentropy": 2.1334098488092423, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900653975084424, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.22725, |
| "grad_norm": 47.5, |
| "grad_norm_var": 35.35182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5471, |
| "loss/crossentropy": 2.158940353989601, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19873067885637283, |
| "step": 9090 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 32.5, |
| "grad_norm_var": 25.093489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4687, |
| "loss/crossentropy": 2.110513925552368, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18377724830061198, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.22775, |
| "grad_norm": 47.75, |
| "grad_norm_var": 31.540625, |
| "learning_rate": 0.0001, |
| "loss": 7.5241, |
| "loss/crossentropy": 2.323571813106537, |
| "loss/hidden": 3.30234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18574214577674866, |
| "step": 9110 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 32.0, |
| "grad_norm_var": 52.509375, |
| "learning_rate": 0.0001, |
| "loss": 7.5911, |
| "loss/crossentropy": 2.2099075824022294, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1919841269031167, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.22825, |
| "grad_norm": 28.5, |
| "grad_norm_var": 44.95104166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5406, |
| "loss/crossentropy": 2.146890181303024, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19247145019471645, |
| "step": 9130 |
| }, |
| { |
| "epoch": 0.2285, |
| "grad_norm": 39.0, |
| "grad_norm_var": 14.4625, |
| "learning_rate": 0.0001, |
| "loss": 7.4844, |
| "loss/crossentropy": 2.127192445099354, |
| "loss/hidden": 3.3484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17968443408608437, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.22875, |
| "grad_norm": 28.75, |
| "grad_norm_var": 15.970572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5806, |
| "loss/crossentropy": 2.0811379849910736, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1876799188554287, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 36.25, |
| "grad_norm_var": 16.017643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4714, |
| "loss/crossentropy": 2.1674265801906585, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19961154460906982, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.22925, |
| "grad_norm": 30.25, |
| "grad_norm_var": 14.31640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4578, |
| "loss/crossentropy": 2.1701185166835786, |
| "loss/hidden": 3.306640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19284930936992167, |
| "step": 9170 |
| }, |
| { |
| "epoch": 0.2295, |
| "grad_norm": 33.0, |
| "grad_norm_var": 24.164322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5706, |
| "loss/crossentropy": 2.2007659181952475, |
| "loss/hidden": 3.483984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20873381607234479, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.22975, |
| "grad_norm": 33.5, |
| "grad_norm_var": 167.6619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5246, |
| "loss/crossentropy": 2.013364678621292, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943978626281023, |
| "step": 9190 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 32.5, |
| "grad_norm_var": 161.8853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5824, |
| "loss/crossentropy": 2.1492466554045677, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19080942254513503, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.23025, |
| "grad_norm": 35.0, |
| "grad_norm_var": 25.631184895833332, |
| "learning_rate": 0.0001, |
| "loss": 7.4525, |
| "loss/crossentropy": 2.1856732040643694, |
| "loss/hidden": 3.508203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21295880153775215, |
| "step": 9210 |
| }, |
| { |
| "epoch": 0.2305, |
| "grad_norm": 29.625, |
| "grad_norm_var": 44.44348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5668, |
| "loss/crossentropy": 2.1970660746097566, |
| "loss/hidden": 3.31171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1991808257997036, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.23075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 29.702083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.1504436887800695, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18685424784198404, |
| "step": 9230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 29.625, |
| "grad_norm_var": 10.211393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3922, |
| "loss/crossentropy": 2.1555099219083784, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20160883199423552, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.23125, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.043489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4663, |
| "loss/crossentropy": 2.153862714767456, |
| "loss/hidden": 3.34765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18072083070874215, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.2315, |
| "grad_norm": 35.25, |
| "grad_norm_var": 5.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4845, |
| "loss/crossentropy": 2.2198209404945373, |
| "loss/hidden": 3.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19124398212879895, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.23175, |
| "grad_norm": 29.25, |
| "grad_norm_var": 8.071875, |
| "learning_rate": 0.0001, |
| "loss": 7.6265, |
| "loss/crossentropy": 2.1362643599510194, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18900094255805017, |
| "step": 9270 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 30.375, |
| "grad_norm_var": 9.327018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4949, |
| "loss/crossentropy": 2.226493775844574, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19106594361364843, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.23225, |
| "grad_norm": 42.75, |
| "grad_norm_var": 11.729622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5367, |
| "loss/crossentropy": 2.212230810523033, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20718637369573117, |
| "step": 9290 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 34.5, |
| "grad_norm_var": 13.130989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4321, |
| "loss/crossentropy": 2.021953631937504, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19819956757128238, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.23275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.062239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5474, |
| "loss/crossentropy": 2.147363981604576, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922474455088377, |
| "step": 9310 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.471875, |
| "learning_rate": 0.0001, |
| "loss": 7.5815, |
| "loss/crossentropy": 2.1315445095300674, |
| "loss/hidden": 3.546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22608386687934398, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.23325, |
| "grad_norm": 29.0, |
| "grad_norm_var": 4.233072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5392, |
| "loss/crossentropy": 2.0831587575376034, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20386858209967612, |
| "step": 9330 |
| }, |
| { |
| "epoch": 0.2335, |
| "grad_norm": 31.75, |
| "grad_norm_var": 225.54895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5913, |
| "loss/crossentropy": 2.2117609918117522, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2390334837138653, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.23375, |
| "grad_norm": 30.75, |
| "grad_norm_var": 10.939518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4838, |
| "loss/crossentropy": 2.2391141802072525, |
| "loss/hidden": 3.35, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1901895135641098, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 36.0, |
| "grad_norm_var": 8.078059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4995, |
| "loss/crossentropy": 2.179610106348991, |
| "loss/hidden": 3.396875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19948177523910998, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.23425, |
| "grad_norm": 37.0, |
| "grad_norm_var": 4.773372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.566, |
| "loss/crossentropy": 2.265758016705513, |
| "loss/hidden": 3.49140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015686921775341, |
| "step": 9370 |
| }, |
| { |
| "epoch": 0.2345, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.334309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5378, |
| "loss/crossentropy": 2.164343351125717, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2006633374840021, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.23475, |
| "grad_norm": 32.5, |
| "grad_norm_var": 29.55625, |
| "learning_rate": 0.0001, |
| "loss": 7.5607, |
| "loss/crossentropy": 2.150678759813309, |
| "loss/hidden": 3.463671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22009918540716172, |
| "step": 9390 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 31.75, |
| "grad_norm_var": 275.2309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6283, |
| "loss/crossentropy": 2.2133118510246277, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18500468730926514, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.23525, |
| "grad_norm": 29.125, |
| "grad_norm_var": 3.2697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5132, |
| "loss/crossentropy": 2.225054568052292, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20218575857579707, |
| "step": 9410 |
| }, |
| { |
| "epoch": 0.2355, |
| "grad_norm": 30.875, |
| "grad_norm_var": 7.542122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4509, |
| "loss/crossentropy": 2.1811843127012254, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19548335634171962, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.23575, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.2934895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5001, |
| "loss/crossentropy": 1.9795912995934486, |
| "loss/hidden": 3.547265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19292352311313152, |
| "step": 9430 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 31.125, |
| "grad_norm_var": 95.365625, |
| "learning_rate": 0.0001, |
| "loss": 7.5911, |
| "loss/crossentropy": 2.032686772942543, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1814481422305107, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.23625, |
| "grad_norm": 31.375, |
| "grad_norm_var": 199.14348958333332, |
| "learning_rate": 0.0001, |
| "loss": 7.4887, |
| "loss/crossentropy": 2.1834616482257845, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900908298790455, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.2365, |
| "grad_norm": 42.5, |
| "grad_norm_var": 9.148893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5461, |
| "loss/crossentropy": 2.1409550577402117, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19009583443403244, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.23675, |
| "grad_norm": 30.0, |
| "grad_norm_var": 11.903125, |
| "learning_rate": 0.0001, |
| "loss": 7.5157, |
| "loss/crossentropy": 2.0411314353346826, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18683939017355441, |
| "step": 9470 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 35.75, |
| "grad_norm_var": 14.804622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5184, |
| "loss/crossentropy": 2.0848400443792343, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2053835943341255, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.23725, |
| "grad_norm": 31.875, |
| "grad_norm_var": 4.326822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5465, |
| "loss/crossentropy": 2.198624536395073, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20486781597137452, |
| "step": 9490 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 39.75, |
| "grad_norm_var": 23.5791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5646, |
| "loss/crossentropy": 2.136742886900902, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19704439640045165, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.23775, |
| "grad_norm": 45.5, |
| "grad_norm_var": 35.976497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4662, |
| "loss/crossentropy": 2.1736690044403075, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19949170239269734, |
| "step": 9510 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 31.625, |
| "grad_norm_var": 23.130989583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5822, |
| "loss/crossentropy": 2.1334351167082786, |
| "loss/hidden": 3.53828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20324139203876257, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.23825, |
| "grad_norm": 29.25, |
| "grad_norm_var": 17.6181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5358, |
| "loss/crossentropy": 2.142412620782852, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2030354391783476, |
| "step": 9530 |
| }, |
| { |
| "epoch": 0.2385, |
| "grad_norm": 39.75, |
| "grad_norm_var": 18.976041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.494, |
| "loss/crossentropy": 2.181875669956207, |
| "loss/hidden": 3.4671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1956815965473652, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.23875, |
| "grad_norm": 34.75, |
| "grad_norm_var": 8.832747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.648, |
| "loss/crossentropy": 2.2528909504413606, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20324745066463948, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 35.5, |
| "grad_norm_var": 3.3268229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5439, |
| "loss/crossentropy": 2.2434193670749663, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20051232390105725, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.23925, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.7962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5977, |
| "loss/crossentropy": 2.0810982078313827, |
| "loss/hidden": 3.519921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1919446151703596, |
| "step": 9570 |
| }, |
| { |
| "epoch": 0.2395, |
| "grad_norm": 31.5, |
| "grad_norm_var": 18.6556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4919, |
| "loss/crossentropy": 2.11810165643692, |
| "loss/hidden": 3.272265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17909912299364805, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.23975, |
| "grad_norm": 30.25, |
| "grad_norm_var": 19.439322916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5892, |
| "loss/crossentropy": 2.2458123177289964, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2052942331880331, |
| "step": 9590 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 32.5, |
| "grad_norm_var": 5.358268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4833, |
| "loss/crossentropy": 2.294092634320259, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18821652382612228, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.24025, |
| "grad_norm": 34.0, |
| "grad_norm_var": 15.582747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.623, |
| "loss/crossentropy": 2.152405506372452, |
| "loss/hidden": 3.583203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20593744479119777, |
| "step": 9610 |
| }, |
| { |
| "epoch": 0.2405, |
| "grad_norm": 30.5, |
| "grad_norm_var": 11.377018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6211, |
| "loss/crossentropy": 2.0907988399267197, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19778051003813743, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.24075, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.201822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5435, |
| "loss/crossentropy": 2.1334788501262665, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1883639894425869, |
| "step": 9630 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.456705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4907, |
| "loss/crossentropy": 2.1615520387887956, |
| "loss/hidden": 3.52734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.212527497112751, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.24125, |
| "grad_norm": 30.875, |
| "grad_norm_var": 9.05, |
| "learning_rate": 0.0001, |
| "loss": 7.4189, |
| "loss/crossentropy": 2.182580092549324, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1890213243663311, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.2415, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.44765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5499, |
| "loss/crossentropy": 2.1858610659837723, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1913149781525135, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.24175, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.605989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4714, |
| "loss/crossentropy": 2.212157425284386, |
| "loss/hidden": 3.312890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1864764802157879, |
| "step": 9670 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 30.375, |
| "grad_norm_var": 6.686393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4193, |
| "loss/crossentropy": 2.226425829529762, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19140857569873332, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.24225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.3114583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5538, |
| "loss/crossentropy": 2.123721697926521, |
| "loss/hidden": 3.612109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21398749127984046, |
| "step": 9690 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.03515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4686, |
| "loss/crossentropy": 2.189563122391701, |
| "loss/hidden": 3.285546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19756122268736362, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.24275, |
| "grad_norm": 36.75, |
| "grad_norm_var": 4326.120768229167, |
| "learning_rate": 0.0001, |
| "loss": 7.6897, |
| "loss/crossentropy": 2.137082815170288, |
| "loss/hidden": 3.33359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18757453747093678, |
| "step": 9710 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4357.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4598, |
| "loss/crossentropy": 2.1567126482725145, |
| "loss/hidden": 3.319921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19823984801769257, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.24325, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.7192057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5681, |
| "loss/crossentropy": 2.0657265201210975, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21410678885877132, |
| "step": 9730 |
| }, |
| { |
| "epoch": 0.2435, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4632, |
| "loss/crossentropy": 2.231111526489258, |
| "loss/hidden": 3.218359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1727623924612999, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.24375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 32.34348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5244, |
| "loss/crossentropy": 2.184462660551071, |
| "loss/hidden": 3.3578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19121489115059376, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 29.5, |
| "grad_norm_var": 13.9212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5373, |
| "loss/crossentropy": 2.237063002586365, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19552523456513882, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.24425, |
| "grad_norm": 31.0, |
| "grad_norm_var": 12.137239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5296, |
| "loss/crossentropy": 2.1166174903512003, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19640736747533083, |
| "step": 9770 |
| }, |
| { |
| "epoch": 0.2445, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.8927083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4695, |
| "loss/crossentropy": 2.215598449110985, |
| "loss/hidden": 3.330078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1820572379976511, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.24475, |
| "grad_norm": 28.375, |
| "grad_norm_var": 6.012239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4857, |
| "loss/crossentropy": 2.19299538731575, |
| "loss/hidden": 3.433984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20531897619366646, |
| "step": 9790 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 36.5, |
| "grad_norm_var": 2.2799472864610222e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6354, |
| "loss/crossentropy": 2.141967089474201, |
| "loss/hidden": 3.656640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19617959037423133, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.24525, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.2799472865365197e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5381, |
| "loss/crossentropy": 2.1641202688217165, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19467307589948177, |
| "step": 9810 |
| }, |
| { |
| "epoch": 0.2455, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.9330729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4407, |
| "loss/crossentropy": 2.1282627910375593, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1983230970799923, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.24575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.5020833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5029, |
| "loss/crossentropy": 2.1581913977861404, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20057737156748773, |
| "step": 9830 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.36640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5631, |
| "loss/crossentropy": 2.111289617419243, |
| "loss/hidden": 3.56328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19521272610872983, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.24625, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.7947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4617, |
| "loss/crossentropy": 2.1353485763072966, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18625408709049224, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.2465, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.5582682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4291, |
| "loss/crossentropy": 2.2995183020830154, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992196377366781, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.24675, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.32890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6033, |
| "loss/crossentropy": 2.2881169497966765, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1932061992585659, |
| "step": 9870 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 30.0, |
| "grad_norm_var": 14.017643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5277, |
| "loss/crossentropy": 2.1658580511808396, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19005278386175634, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.24725, |
| "grad_norm": 29.625, |
| "grad_norm_var": 11.984830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4924, |
| "loss/crossentropy": 2.2023943603038787, |
| "loss/hidden": 3.339453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19262171424925328, |
| "step": 9890 |
| }, |
| { |
| "epoch": 0.2475, |
| "grad_norm": 31.625, |
| "grad_norm_var": 6.539518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3901, |
| "loss/crossentropy": 2.1616971135139464, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19074857234954834, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.24775, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.436393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4839, |
| "loss/crossentropy": 2.21129602342844, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1898047223687172, |
| "step": 9910 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.9580729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5074, |
| "loss/crossentropy": 2.1486201629042627, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19970939867198467, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.24825, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.9666666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4634, |
| "loss/crossentropy": 2.073464626073837, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21017258744686843, |
| "step": 9930 |
| }, |
| { |
| "epoch": 0.2485, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.4166666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4696, |
| "loss/crossentropy": 2.156512539088726, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20021300427615643, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.24875, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.6681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5618, |
| "loss/crossentropy": 2.098437860608101, |
| "loss/hidden": 3.4296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2031120091676712, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.5259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5073, |
| "loss/crossentropy": 2.054112070798874, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18417572602629662, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.24925, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.677083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4506, |
| "loss/crossentropy": 2.173594242334366, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1788831666111946, |
| "step": 9970 |
| }, |
| { |
| "epoch": 0.2495, |
| "grad_norm": 31.625, |
| "grad_norm_var": 12.448958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5566, |
| "loss/crossentropy": 2.1865617662668226, |
| "loss/hidden": 3.54375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.219557130523026, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.24975, |
| "grad_norm": 31.125, |
| "grad_norm_var": 7.601822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6451, |
| "loss/crossentropy": 2.161116376519203, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20050363764166831, |
| "step": 9990 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.603125, |
| "learning_rate": 0.0001, |
| "loss": 7.4863, |
| "loss/crossentropy": 2.056240776181221, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21083669643849134, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8575100320088064e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|