| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.25, |
| "eval_steps": 2000, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00025, |
| "grad_norm": 32.5, |
| "learning_rate": 0.0001, |
| "loss": 7.9852, |
| "loss/crossentropy": 2.2558943748474123, |
| "loss/hidden": 3.53671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22032691352069378, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 39.0, |
| "grad_norm_var": 6.1306640625, |
| "learning_rate": 0.0001, |
| "loss": 8.0827, |
| "loss/crossentropy": 2.219619666039944, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20763051956892015, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 30.375, |
| "grad_norm_var": 7.383072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.8479, |
| "loss/crossentropy": 2.185364603996277, |
| "loss/hidden": 3.59296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23391152992844583, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 32.75, |
| "grad_norm_var": 187.0322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.8341, |
| "loss/crossentropy": 2.083733668923378, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2040597340092063, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 5.111393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6815, |
| "loss/crossentropy": 2.182037726044655, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20423058047890663, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.3853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6919, |
| "loss/crossentropy": 2.1419573068618774, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19463330563157796, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.228125, |
| "learning_rate": 0.0001, |
| "loss": 7.838, |
| "loss/crossentropy": 2.242894622683525, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22062199115753173, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 32.5, |
| "grad_norm_var": 8.371809895833334, |
| "learning_rate": 0.0001, |
| "loss": 8.018, |
| "loss/crossentropy": 2.0408543169498445, |
| "loss/hidden": 3.506640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20080858804285526, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 7.322916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.8807, |
| "loss/crossentropy": 2.0654082030057905, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2109438929706812, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 34.5, |
| "grad_norm_var": 2805.3603515625, |
| "learning_rate": 0.0001, |
| "loss": 8.0497, |
| "loss/crossentropy": 2.0930048365145923, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2103888330049813, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2798.430143229167, |
| "learning_rate": 0.0001, |
| "loss": 7.8583, |
| "loss/crossentropy": 2.3308374524116515, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2045755073428154, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 34.0, |
| "grad_norm_var": 1.6999348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7088, |
| "loss/crossentropy": 2.120428466796875, |
| "loss/hidden": 3.56796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21561774536967276, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 31.0, |
| "grad_norm_var": 40.73899739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.762, |
| "loss/crossentropy": 2.1464689180254934, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2084670951589942, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 33.0, |
| "grad_norm_var": 42.67265625, |
| "learning_rate": 0.0001, |
| "loss": 7.711, |
| "loss/crossentropy": 2.1301105961203577, |
| "loss/hidden": 3.512890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21644905991852284, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.066080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.7295, |
| "loss/crossentropy": 2.115240353345871, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2064087452366948, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 32.5, |
| "grad_norm_var": 422.72057291666664, |
| "learning_rate": 0.0001, |
| "loss": 7.9367, |
| "loss/crossentropy": 2.141779786348343, |
| "loss/hidden": 3.457421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21747791785746812, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.705208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.756, |
| "loss/crossentropy": 2.103468084335327, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027540819719434, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 32.25, |
| "grad_norm_var": 17.8556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.7008, |
| "loss/crossentropy": 2.1950977832078933, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2186649737879634, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 30.5, |
| "grad_norm_var": 66.61608072916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6552, |
| "loss/crossentropy": 2.029752139747143, |
| "loss/hidden": 3.585546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22110425475984813, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 36.0, |
| "grad_norm_var": 13.160872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6964, |
| "loss/crossentropy": 2.169684535264969, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22122678495943546, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.442643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7135, |
| "loss/crossentropy": 2.1366169154644012, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19148335698992014, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 35.25, |
| "grad_norm_var": 4.151822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.654, |
| "loss/crossentropy": 2.196457767486572, |
| "loss/hidden": 3.573828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22240160517394542, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 34.5, |
| "grad_norm_var": 61.07057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7286, |
| "loss/crossentropy": 2.099469523131847, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19720815271139144, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 33.75, |
| "grad_norm_var": 5.479166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6841, |
| "loss/crossentropy": 2.0730464071035386, |
| "loss/hidden": 3.480078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20290055498480797, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 4.9197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6999, |
| "loss/crossentropy": 2.1514100462198256, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013280361890793, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 33.25, |
| "grad_norm_var": 150.86712239583332, |
| "learning_rate": 0.0001, |
| "loss": 7.7617, |
| "loss/crossentropy": 2.0071112543344496, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1923872010782361, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 32.25, |
| "grad_norm_var": 41.373958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7968, |
| "loss/crossentropy": 2.254330241680145, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20844143405556678, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 30.25, |
| "grad_norm_var": 5.9625, |
| "learning_rate": 0.0001, |
| "loss": 7.7437, |
| "loss/crossentropy": 2.2058747708797455, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19603454861789943, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.4306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.7308, |
| "loss/crossentropy": 2.158563455939293, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20696840062737465, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 35.25, |
| "grad_norm_var": 4.163997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7781, |
| "loss/crossentropy": 2.1728759124875068, |
| "loss/hidden": 3.6984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2232737574726343, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.7212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7039, |
| "loss/crossentropy": 2.1304744452238085, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21460633352398872, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 34.25, |
| "grad_norm_var": 102.21979166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7386, |
| "loss/crossentropy": 2.2150494754314423, |
| "loss/hidden": 3.470703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2204894032329321, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 34.5, |
| "grad_norm_var": 103.0806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6843, |
| "loss/crossentropy": 2.1483607694506643, |
| "loss/hidden": 3.525, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2168185070157051, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.245768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6824, |
| "loss/crossentropy": 2.2668254554271696, |
| "loss/hidden": 3.36171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1978676740080118, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 30.75, |
| "grad_norm_var": 7.785416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6476, |
| "loss/crossentropy": 2.247351437807083, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2044550308957696, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 33.0, |
| "grad_norm_var": 36.1150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6707, |
| "loss/crossentropy": 2.2508403569459916, |
| "loss/hidden": 3.351171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20586735829710961, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 27.875, |
| "grad_norm_var": 42.02805989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.787, |
| "loss/crossentropy": 2.2524363905191422, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20639074668288232, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 29.625, |
| "grad_norm_var": 19.65625, |
| "learning_rate": 0.0001, |
| "loss": 7.7258, |
| "loss/crossentropy": 2.2901588469743728, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20220000632107257, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 29.875, |
| "grad_norm_var": 15.653059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6445, |
| "loss/crossentropy": 2.1256834477186204, |
| "loss/hidden": 3.337109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.190180828794837, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.7712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7719, |
| "loss/crossentropy": 2.2237626880407335, |
| "loss/hidden": 3.484765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2264560218900442, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.9247395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7631, |
| "loss/crossentropy": 2.060544753074646, |
| "loss/hidden": 3.470703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2202487275004387, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.6712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.8193, |
| "loss/crossentropy": 2.270045906305313, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22449300419539214, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 40.0, |
| "grad_norm_var": 5.551822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7502, |
| "loss/crossentropy": 2.220390594005585, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20708895958960055, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 33.75, |
| "grad_norm_var": 6.554622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6386, |
| "loss/crossentropy": 2.0412577211856844, |
| "loss/hidden": 3.52109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20780573673546315, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 39.5, |
| "grad_norm_var": 9.018684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7522, |
| "loss/crossentropy": 2.085190561413765, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22059339918196202, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 31.75, |
| "grad_norm_var": 5.703580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.604, |
| "loss/crossentropy": 1.9451062515378, |
| "loss/hidden": 3.527734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2082566052675247, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 34.25, |
| "grad_norm_var": 1.4416666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7286, |
| "loss/crossentropy": 2.23127267062664, |
| "loss/hidden": 3.560546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22645943984389305, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 43.5, |
| "grad_norm_var": 61.358333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5857, |
| "loss/crossentropy": 2.0275631666183473, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19867698503658177, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 30.5, |
| "grad_norm_var": 62.425455729166664, |
| "learning_rate": 0.0001, |
| "loss": 7.7023, |
| "loss/crossentropy": 2.1564531326293945, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21295207217335702, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 39.0, |
| "grad_norm_var": 7.580208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.8035, |
| "loss/crossentropy": 2.1716607972979545, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21046865545213223, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 35.25, |
| "grad_norm_var": 5.121809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.764, |
| "loss/crossentropy": 2.181091034412384, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20339491367340087, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.820572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6112, |
| "loss/crossentropy": 2.012446442246437, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20432959645986556, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.92890625, |
| "learning_rate": 0.0001, |
| "loss": 7.73, |
| "loss/crossentropy": 2.0826023176312445, |
| "loss/hidden": 3.51171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21190985422581435, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.0229166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7527, |
| "loss/crossentropy": 2.191486455500126, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943425141274929, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.983268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7035, |
| "loss/crossentropy": 2.0664332896471023, |
| "loss/hidden": 3.475, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20404404532164336, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.381705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5679, |
| "loss/crossentropy": 2.0470397621393204, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21556914187967777, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.508072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7909, |
| "loss/crossentropy": 2.21784293949604, |
| "loss/hidden": 3.4671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20632803943008185, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.6837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6449, |
| "loss/crossentropy": 2.291385439038277, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19818231668323277, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 30.875, |
| "grad_norm_var": 106.909375, |
| "learning_rate": 0.0001, |
| "loss": 7.7009, |
| "loss/crossentropy": 2.102944087982178, |
| "loss/hidden": 3.532421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20003035496920346, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 33.0, |
| "grad_norm_var": 11.816080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5907, |
| "loss/crossentropy": 2.239013722538948, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22811597101390363, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.2811848958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6578, |
| "loss/crossentropy": 2.1826944231986998, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19526719450950622, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 32.0, |
| "grad_norm_var": 29.54140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6592, |
| "loss/crossentropy": 2.0491475805640222, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19470291025936604, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.458268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7237, |
| "loss/crossentropy": 2.171899539232254, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19583625346422195, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 41.5, |
| "grad_norm_var": 835.4395182291667, |
| "learning_rate": 0.0001, |
| "loss": 7.6493, |
| "loss/crossentropy": 2.019927790760994, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889604590833187, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 33.25, |
| "grad_norm_var": 816.1582682291667, |
| "learning_rate": 0.0001, |
| "loss": 7.7628, |
| "loss/crossentropy": 2.146556834876537, |
| "loss/hidden": 3.53046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21917179077863694, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 34.0, |
| "grad_norm_var": 13.574739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6199, |
| "loss/crossentropy": 2.2131199680268763, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20602547163143753, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 34.75, |
| "grad_norm_var": 28.6125, |
| "learning_rate": 0.0001, |
| "loss": 7.6113, |
| "loss/crossentropy": 2.0343705236911775, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18553176671266555, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 31.625, |
| "grad_norm_var": 19.8369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6443, |
| "loss/crossentropy": 2.1528817296028135, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20593271851539613, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 36.25, |
| "grad_norm_var": 5.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7476, |
| "loss/crossentropy": 2.2202903479337692, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2035485502332449, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 31.5, |
| "grad_norm_var": 7.5431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.621, |
| "loss/crossentropy": 2.0744683638215067, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2023961789906025, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 29.875, |
| "grad_norm_var": 26.912239583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.7058, |
| "loss/crossentropy": 2.1098077327013014, |
| "loss/hidden": 3.48359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20867060720920563, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 34.0, |
| "grad_norm_var": 5.291080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5224, |
| "loss/crossentropy": 2.0663078971207143, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1999094202183187, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 36.75, |
| "grad_norm_var": 34.112239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5679, |
| "loss/crossentropy": 2.264209559559822, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.213060562312603, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 31.875, |
| "grad_norm_var": 31.756705729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.6714, |
| "loss/crossentropy": 2.128282290697098, |
| "loss/hidden": 3.494921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21533375550061465, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.314518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6825, |
| "loss/crossentropy": 2.0806978911161425, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19196727648377418, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 31.625, |
| "grad_norm_var": 9.520833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6058, |
| "loss/crossentropy": 2.2315777271986006, |
| "loss/hidden": 3.484765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21225934717804193, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 33.25, |
| "grad_norm_var": 29.381705729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.8377, |
| "loss/crossentropy": 2.200978134572506, |
| "loss/hidden": 3.52890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22721426151692867, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 32.75, |
| "grad_norm_var": 11.095247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7305, |
| "loss/crossentropy": 2.2799030035734176, |
| "loss/hidden": 3.512890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2418980894610286, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 46.5, |
| "grad_norm_var": 22.12265625, |
| "learning_rate": 0.0001, |
| "loss": 7.663, |
| "loss/crossentropy": 2.0916543275117876, |
| "loss/hidden": 3.4765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20638740565627814, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 53.75, |
| "grad_norm_var": 2.3053503983968586e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6743, |
| "loss/crossentropy": 2.2010276943445204, |
| "loss/hidden": 3.33671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011238183826208, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 34.75, |
| "grad_norm_var": 2.3053503962269005e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.7365, |
| "loss/crossentropy": 2.2651585280895232, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22051467839628458, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 32.25, |
| "grad_norm_var": 69.03854166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5788, |
| "loss/crossentropy": 2.1270318403840065, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20940354652702808, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 33.0, |
| "grad_norm_var": 8.0603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6237, |
| "loss/crossentropy": 2.1726820170879364, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20644128862768413, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 33.5, |
| "grad_norm_var": 9.085416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.455, |
| "loss/crossentropy": 2.166350546479225, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20701032150536774, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 37.75, |
| "grad_norm_var": 6.362239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5725, |
| "loss/crossentropy": 2.095318245887756, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20427223704755307, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 29.625, |
| "grad_norm_var": 15.934830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5728, |
| "loss/crossentropy": 2.1999073296785356, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012161746621132, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.681184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6168, |
| "loss/crossentropy": 2.2957967817783356, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20650502648204566, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 30.0, |
| "grad_norm_var": 10.9181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6929, |
| "loss/crossentropy": 2.2093090921640397, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18699637930840254, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 35.0, |
| "grad_norm_var": 11.192122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5316, |
| "loss/crossentropy": 2.0586251467466354, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21464286223053933, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 48.0, |
| "grad_norm_var": 22.92265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5857, |
| "loss/crossentropy": 2.1724458605051042, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19797445200383662, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 33.0, |
| "grad_norm_var": 38.4837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6184, |
| "loss/crossentropy": 2.15934486836195, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19419073052704333, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 31.0, |
| "grad_norm_var": 29.8869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6209, |
| "loss/crossentropy": 2.1601561695337295, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20290262177586554, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.5143229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5969, |
| "loss/crossentropy": 2.184823766350746, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2016730338335037, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.5764973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5772, |
| "loss/crossentropy": 2.2380657255649568, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19638443663716315, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 28.875, |
| "grad_norm_var": 5.199934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.183762513846159, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19700763542205096, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 33.0, |
| "grad_norm_var": 20.1666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6351, |
| "loss/crossentropy": 2.1142914414405825, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2082229983061552, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 31.5, |
| "grad_norm_var": 18.738541666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6302, |
| "loss/crossentropy": 2.315229868888855, |
| "loss/hidden": 3.32734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20684304945170878, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.501822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6816, |
| "loss/crossentropy": 2.094369947910309, |
| "loss/hidden": 3.59921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2428071454167366, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6289, |
| "loss/crossentropy": 2.183109185099602, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21173047311604024, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.9593098958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6417, |
| "loss/crossentropy": 2.1311034083366396, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1998819222673774, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.510872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6299, |
| "loss/crossentropy": 2.1297010451555254, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20842864252626897, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.9893229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6593, |
| "loss/crossentropy": 2.2224902719259263, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21827217563986778, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 38.25, |
| "grad_norm_var": 6.010416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6712, |
| "loss/crossentropy": 2.1976612359285355, |
| "loss/hidden": 3.506640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.210753770545125, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 33.5, |
| "grad_norm_var": 9.435872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.748, |
| "loss/crossentropy": 2.1889317661523817, |
| "loss/hidden": 3.398828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2081079863011837, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 31.0, |
| "grad_norm_var": 6.206184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5803, |
| "loss/crossentropy": 2.0802227064967154, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19740718584507705, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.3770182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6098, |
| "loss/crossentropy": 1.969551184773445, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954148853197694, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.09375, |
| "learning_rate": 0.0001, |
| "loss": 7.7826, |
| "loss/crossentropy": 2.160974936187267, |
| "loss/hidden": 3.581640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2183740811422467, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.6520182291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6186, |
| "loss/crossentropy": 2.179084411263466, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19020346291363238, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 56.5, |
| "grad_norm_var": 48.91868489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.7756, |
| "loss/crossentropy": 2.1456103891134264, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2178689869120717, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 32.0, |
| "grad_norm_var": 42.19557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6714, |
| "loss/crossentropy": 2.2156156271696092, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013509316369891, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 31.625, |
| "grad_norm_var": 29.3775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6833, |
| "loss/crossentropy": 2.0683740943670275, |
| "loss/hidden": 3.5046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21320818569511174, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 27.5, |
| "grad_norm_var": 35.483333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7302, |
| "loss/crossentropy": 2.098052313923836, |
| "loss/hidden": 3.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19890500828623772, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 29.0, |
| "grad_norm_var": 10.8775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6716, |
| "loss/crossentropy": 2.0999813921749593, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18992104195058346, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.67890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5401, |
| "loss/crossentropy": 2.07411085665226, |
| "loss/hidden": 3.433984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018830729648471, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 31.125, |
| "grad_norm_var": 18.053580729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.645, |
| "loss/crossentropy": 2.0945233553647995, |
| "loss/hidden": 3.4859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21366582233458759, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 31.25, |
| "grad_norm_var": 16.978125, |
| "learning_rate": 0.0001, |
| "loss": 7.6514, |
| "loss/crossentropy": 2.0980678737163543, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19850811325013637, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 30.2556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6021, |
| "loss/crossentropy": 2.155895306169987, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19983574748039246, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.733072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6343, |
| "loss/crossentropy": 2.1906268298625946, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19584416709840297, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.89765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7077, |
| "loss/crossentropy": 2.163237012922764, |
| "loss/hidden": 3.562109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21741114580072463, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 27.875, |
| "grad_norm_var": 3.6639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6695, |
| "loss/crossentropy": 2.1346954315900804, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19178961254656315, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 30.5, |
| "grad_norm_var": 24.9125, |
| "learning_rate": 0.0001, |
| "loss": 7.6108, |
| "loss/crossentropy": 2.2493597716093063, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.209975734166801, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 30.75, |
| "grad_norm_var": 5.566080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5902, |
| "loss/crossentropy": 2.046968361735344, |
| "loss/hidden": 3.451171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1867401722818613, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 29.75, |
| "grad_norm_var": 5.396875, |
| "learning_rate": 0.0001, |
| "loss": 7.6659, |
| "loss/crossentropy": 2.0429708033800127, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18792454693466426, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.8827473958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5425, |
| "loss/crossentropy": 2.124686148762703, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23028194047510625, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.220833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7766, |
| "loss/crossentropy": 2.1431034594774245, |
| "loss/hidden": 3.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852023523300886, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.3080729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6547, |
| "loss/crossentropy": 2.208024913072586, |
| "loss/hidden": 3.32265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19009452145546674, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.7249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6331, |
| "loss/crossentropy": 2.0774734795093535, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19277823474258185, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.6393229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6147, |
| "loss/crossentropy": 2.244540962576866, |
| "loss/hidden": 3.444921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20150573179125786, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.2280598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6724, |
| "loss/crossentropy": 2.1218355029821394, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19727950319647788, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.6113932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5881, |
| "loss/crossentropy": 2.048227934539318, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19401397118344904, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 35.5, |
| "grad_norm_var": 2.8384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6481, |
| "loss/crossentropy": 2.0217724472284315, |
| "loss/hidden": 3.398828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19338970091193913, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.3671223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5734, |
| "loss/crossentropy": 2.123840129375458, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19704403057694436, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.8207682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5379, |
| "loss/crossentropy": 2.1691371381282805, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20379403475672006, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.162239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5824, |
| "loss/crossentropy": 2.0320975854992867, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204788769595325, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 29.875, |
| "grad_norm_var": 29.69140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6835, |
| "loss/crossentropy": 2.1799038141965865, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21532316971570253, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.4837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6461, |
| "loss/crossentropy": 2.075017270445824, |
| "loss/hidden": 3.483984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21185822309926153, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.6806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6084, |
| "loss/crossentropy": 2.2061389327049254, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19864549599587916, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 46.25, |
| "grad_norm_var": 16.428125, |
| "learning_rate": 0.0001, |
| "loss": 7.5773, |
| "loss/crossentropy": 2.1411263316869737, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18519967906177043, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 33.5, |
| "grad_norm_var": 312.2525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6651, |
| "loss/crossentropy": 2.1760765284299852, |
| "loss/hidden": 3.457421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19441114887595176, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 30.375, |
| "grad_norm_var": 7.5697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6384, |
| "loss/crossentropy": 2.148920811712742, |
| "loss/hidden": 3.503515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20845398511737584, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.2301432291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6503, |
| "loss/crossentropy": 2.2241507709026336, |
| "loss/hidden": 3.478515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013774536550045, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.67265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6107, |
| "loss/crossentropy": 2.151585003733635, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19790932536125183, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.192643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6089, |
| "loss/crossentropy": 2.106749549508095, |
| "loss/hidden": 3.555078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20702882390469313, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.8622395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6081, |
| "loss/crossentropy": 2.0727885022759436, |
| "loss/hidden": 3.425, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21057205237448215, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.32890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7005, |
| "loss/crossentropy": 2.3014074742794035, |
| "loss/hidden": 3.35859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20723759960383176, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 28.625, |
| "grad_norm_var": 4.240625, |
| "learning_rate": 0.0001, |
| "loss": 7.5677, |
| "loss/crossentropy": 2.12650800794363, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20236929692327976, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 31.0, |
| "grad_norm_var": 8.195768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7299, |
| "loss/crossentropy": 2.187662351131439, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20965678989887238, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.1666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5942, |
| "loss/crossentropy": 2.174471014738083, |
| "loss/hidden": 3.534765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21107099391520023, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 40.5, |
| "grad_norm_var": 7.112239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5362, |
| "loss/crossentropy": 2.069592148065567, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19253854881972074, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.4869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5941, |
| "loss/crossentropy": 2.0679068714380264, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20371587071567773, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.5686848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6557, |
| "loss/crossentropy": 2.1364961892366408, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20447015166282653, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.6738932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.622, |
| "loss/crossentropy": 2.1275009989738463, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18973923586308955, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.3177083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7366, |
| "loss/crossentropy": 2.212426933646202, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21735910680145026, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 27.75, |
| "grad_norm_var": 2.7348307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6356, |
| "loss/crossentropy": 2.1074824020266534, |
| "loss/hidden": 3.39609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19886015299707652, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.234830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.7567, |
| "loss/crossentropy": 2.177932971715927, |
| "loss/hidden": 3.5609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2221821215003729, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.8791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5766, |
| "loss/crossentropy": 2.0759232968091963, |
| "loss/hidden": 3.386328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19086231291294098, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.7046223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.537, |
| "loss/crossentropy": 2.2487298890948297, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19235554365441204, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.5927083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5167, |
| "loss/crossentropy": 2.2023983120918276, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20318181458860635, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.4436848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5528, |
| "loss/crossentropy": 1.9812082558870316, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19151438660919667, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.6098307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5232, |
| "loss/crossentropy": 2.0910235196352005, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20488944984972476, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.5854166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7078, |
| "loss/crossentropy": 2.079045358300209, |
| "loss/hidden": 3.483203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20172932054847478, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.4368798010809257e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5441, |
| "loss/crossentropy": 2.146761792898178, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20237026009708642, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.4368798024404393e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.7162, |
| "loss/crossentropy": 2.1575208425521852, |
| "loss/hidden": 3.35703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1914537126198411, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 29.0, |
| "grad_norm_var": 37.940625, |
| "learning_rate": 0.0001, |
| "loss": 7.5673, |
| "loss/crossentropy": 2.2059059768915175, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1920375470072031, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 30.75, |
| "grad_norm_var": 61.631184895833336, |
| "learning_rate": 0.0001, |
| "loss": 7.5788, |
| "loss/crossentropy": 2.1531882882118225, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.192771671153605, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 35.25, |
| "grad_norm_var": 47.353125, |
| "learning_rate": 0.0001, |
| "loss": 7.5891, |
| "loss/crossentropy": 2.1217811673879625, |
| "loss/hidden": 3.4046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19420051630586385, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 29.875, |
| "grad_norm_var": 21.580989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6361, |
| "loss/crossentropy": 2.0970900297164916, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21125762071460485, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 36.5, |
| "grad_norm_var": 72.77916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7045, |
| "loss/crossentropy": 2.1662445843219755, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21249269619584082, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 72.42473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5754, |
| "loss/crossentropy": 2.1745404630899428, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20168567337095739, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 9.149739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5555, |
| "loss/crossentropy": 2.126581160724163, |
| "loss/hidden": 3.34296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1933064443990588, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 32.0, |
| "grad_norm_var": 10.1369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5377, |
| "loss/crossentropy": 2.1726800590753554, |
| "loss/hidden": 3.441796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20911512654274703, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6106, |
| "loss/crossentropy": 1.9999253153800964, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18493321686983108, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 38.0, |
| "grad_norm_var": 6.673958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7146, |
| "loss/crossentropy": 2.1180618047714233, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2076649811118841, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 34.0, |
| "grad_norm_var": 7.339322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5464, |
| "loss/crossentropy": 2.0583778262138366, |
| "loss/hidden": 3.322265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18418019600212573, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.749739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5528, |
| "loss/crossentropy": 2.2259044647216797, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2031107559800148, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.1389973958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5321, |
| "loss/crossentropy": 2.124616578221321, |
| "loss/hidden": 3.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2149519257247448, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.442708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.63, |
| "loss/crossentropy": 2.1418310686945916, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19125681575387715, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.011458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5307, |
| "loss/crossentropy": 2.144363935291767, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19334549438208343, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.6559895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.627, |
| "loss/crossentropy": 2.2538854971528055, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2010333575308323, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6471, |
| "loss/crossentropy": 2.091226762533188, |
| "loss/hidden": 3.450390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2152847982943058, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 28.875, |
| "grad_norm_var": 18.03515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5626, |
| "loss/crossentropy": 2.295862782001495, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20952691733837128, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 30.75, |
| "grad_norm_var": 18.4541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6872, |
| "loss/crossentropy": 2.0833657890558244, |
| "loss/hidden": 3.4875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2070673793554306, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.812239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.647, |
| "loss/crossentropy": 2.1021256439387797, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18996517434716226, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.7009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5203, |
| "loss/crossentropy": 2.2004275381565095, |
| "loss/hidden": 3.314453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.187726416811347, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 30.0, |
| "grad_norm_var": 4.067643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6201, |
| "loss/crossentropy": 2.1705893486738206, |
| "loss/hidden": 3.522265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21116435080766677, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 32.5, |
| "grad_norm_var": 5.030989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6235, |
| "loss/crossentropy": 2.1204057022929192, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19528388790786266, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 45.0, |
| "grad_norm_var": 15.6603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4562, |
| "loss/crossentropy": 2.154220977425575, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19214881088119, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 32.5, |
| "grad_norm_var": 15.614518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6258, |
| "loss/crossentropy": 2.1000912792980673, |
| "loss/hidden": 3.459375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20185065623372794, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.931184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4863, |
| "loss/crossentropy": 2.2270338363945483, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19286383930593728, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.311393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6001, |
| "loss/crossentropy": 2.1998794853687285, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19646506551653148, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.577018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.653, |
| "loss/crossentropy": 2.104053999483585, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.201510801166296, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 30.5, |
| "grad_norm_var": 17.68125, |
| "learning_rate": 0.0001, |
| "loss": 7.5273, |
| "loss/crossentropy": 2.1184800997376443, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248204957693815, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 33.5, |
| "grad_norm_var": 1.3, |
| "learning_rate": 0.0001, |
| "loss": 7.655, |
| "loss/crossentropy": 2.1734499007463457, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1961166137829423, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.5205729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6186, |
| "loss/crossentropy": 2.237542712688446, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20616979897022247, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.810416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5436, |
| "loss/crossentropy": 2.1811093270778654, |
| "loss/hidden": 3.40234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1928004425019026, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 34.25, |
| "grad_norm_var": 1.5181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6175, |
| "loss/crossentropy": 2.045266591012478, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21615136358886958, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 33.5, |
| "grad_norm_var": 26.225455729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.7083, |
| "loss/crossentropy": 2.086462992429733, |
| "loss/hidden": 3.469921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2192224683240056, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 29.75, |
| "grad_norm_var": 8.340625, |
| "learning_rate": 0.0001, |
| "loss": 7.5032, |
| "loss/crossentropy": 2.0052025958895685, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18473264537751674, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 44.5, |
| "grad_norm_var": 4635.33515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5567, |
| "loss/crossentropy": 2.04089385792613, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19560968028381467, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 51.0, |
| "grad_norm_var": 4555.746875, |
| "learning_rate": 0.0001, |
| "loss": 7.6029, |
| "loss/crossentropy": 2.2241372987627983, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2209097046405077, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 32.5, |
| "grad_norm_var": 49.11608072916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5781, |
| "loss/crossentropy": 2.2160742044448853, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19539013858884574, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 28.625, |
| "grad_norm_var": 73.70201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6375, |
| "loss/crossentropy": 2.094863271713257, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20763578601181507, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 30.125, |
| "grad_norm_var": 22.494791666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.6164, |
| "loss/crossentropy": 2.121490868926048, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19190637897700072, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 34.5, |
| "grad_norm_var": 20.549739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5717, |
| "loss/crossentropy": 2.0418698236346247, |
| "loss/hidden": 3.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18076165094971658, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 29.125, |
| "grad_norm_var": 14.407291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5867, |
| "loss/crossentropy": 2.1989556729793547, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011225748807192, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 32.75, |
| "grad_norm_var": 17.120572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6813, |
| "loss/crossentropy": 2.073545518517494, |
| "loss/hidden": 3.546484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22323863469064237, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 42.25, |
| "grad_norm_var": 76.27291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6323, |
| "loss/crossentropy": 2.1424515694379807, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19750071745365858, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 30.625, |
| "grad_norm_var": 74.925, |
| "learning_rate": 0.0001, |
| "loss": 7.6198, |
| "loss/crossentropy": 2.006666135787964, |
| "loss/hidden": 3.463671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19839615989476442, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 41.5, |
| "grad_norm_var": 30.869791666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5671, |
| "loss/crossentropy": 2.2227450221776963, |
| "loss/hidden": 3.451171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20443473970517517, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 34.75, |
| "grad_norm_var": 24.609309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5928, |
| "loss/crossentropy": 2.186553081870079, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18928833175450563, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 29.125, |
| "grad_norm_var": 18.37265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7389, |
| "loss/crossentropy": 2.0680640071630476, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.201955908536911, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 33.5, |
| "grad_norm_var": 9.276497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6148, |
| "loss/crossentropy": 2.1981600403785704, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1901057105511427, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 31.75, |
| "grad_norm_var": 5.308072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.48, |
| "loss/crossentropy": 2.2574460208415985, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19361322987824678, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 29.625, |
| "grad_norm_var": 8.5900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6644, |
| "loss/crossentropy": 2.154197073727846, |
| "loss/hidden": 3.50625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2086074635386467, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 30.5, |
| "grad_norm_var": 8.333072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5279, |
| "loss/crossentropy": 2.1245498836040495, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19000006280839443, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 31.375, |
| "grad_norm_var": 10.856184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7098, |
| "loss/crossentropy": 2.1968549311161043, |
| "loss/hidden": 3.469140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21285793352872134, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 34.25, |
| "grad_norm_var": 17.820572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7095, |
| "loss/crossentropy": 2.2533976465463637, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20727334953844548, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 36.0, |
| "grad_norm_var": 22.128580729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5933, |
| "loss/crossentropy": 2.227231651544571, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19693543761968613, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 29.125, |
| "grad_norm_var": 11.466666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5348, |
| "loss/crossentropy": 2.1581582985818386, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19432583590969443, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 37.25, |
| "grad_norm_var": 10.514518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6264, |
| "loss/crossentropy": 2.0908095851540565, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20568534098565577, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.6494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.671, |
| "loss/crossentropy": 2.0827252097427844, |
| "loss/hidden": 3.54921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20747530292719601, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 31.875, |
| "grad_norm_var": 47.234375, |
| "learning_rate": 0.0001, |
| "loss": 7.5823, |
| "loss/crossentropy": 2.2815075665712357, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20698099359869956, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 37.5, |
| "grad_norm_var": 30.277018229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5812, |
| "loss/crossentropy": 2.2282338082790374, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19244133178144693, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 31.0, |
| "grad_norm_var": 27.68515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6485, |
| "loss/crossentropy": 2.184544026851654, |
| "loss/hidden": 3.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996760057285428, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.223372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6003, |
| "loss/crossentropy": 2.1572179198265076, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20082181300967933, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 63.5, |
| "grad_norm_var": 240.61295572916666, |
| "learning_rate": 0.0001, |
| "loss": 7.6129, |
| "loss/crossentropy": 2.1616927281022074, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21196384327486156, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 29.875, |
| "grad_norm_var": 276.98932291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5683, |
| "loss/crossentropy": 2.1882350742816925, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2052389807999134, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 33.5, |
| "grad_norm_var": 11.6681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6076, |
| "loss/crossentropy": 2.1808354407548904, |
| "loss/hidden": 3.540625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21027475781738758, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.6872395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5295, |
| "loss/crossentropy": 2.1488232225179673, |
| "loss/hidden": 3.496875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2111368477344513, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.0603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6875, |
| "loss/crossentropy": 2.1594634115695954, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20478933807462454, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.408072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6636, |
| "loss/crossentropy": 2.0932443618774412, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21733616031706332, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.5416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5737, |
| "loss/crossentropy": 2.1831247925758364, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1989585768431425, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.2393229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5009, |
| "loss/crossentropy": 2.2847615987062455, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20479252003133297, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.374934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7313, |
| "loss/crossentropy": 2.1928456306457518, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20380632691085337, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.7697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5476, |
| "loss/crossentropy": 2.1438629984855653, |
| "loss/hidden": 3.321875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19849517289549112, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.8893229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6545, |
| "loss/crossentropy": 2.1132273241877555, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21593583207577466, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 38.0, |
| "grad_norm_var": 3.9934895833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6438, |
| "loss/crossentropy": 2.1729103833436967, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204007020406425, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 28.875, |
| "grad_norm_var": 4.180989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5261, |
| "loss/crossentropy": 2.1972236961126326, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21200564429163932, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.9854166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6233, |
| "loss/crossentropy": 2.130857673287392, |
| "loss/hidden": 3.398828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2043182048946619, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.552018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5626, |
| "loss/crossentropy": 2.1780240714550017, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19501971434801818, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.4978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.075847564637661, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19856880297884344, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 30.375, |
| "grad_norm_var": 8.763541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5161, |
| "loss/crossentropy": 1.960635770857334, |
| "loss/hidden": 3.521484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2014876109547913, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.85390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6453, |
| "loss/crossentropy": 2.235329329967499, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22043778821825982, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 35.0, |
| "grad_norm_var": 2.928465629875169e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5153, |
| "loss/crossentropy": 2.0534446865320204, |
| "loss/hidden": 3.469140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20287865065038205, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.928465629810996e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.1208725392818453, |
| "loss/hidden": 3.534765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22640016246587039, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.5614583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5751, |
| "loss/crossentropy": 2.206133508682251, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19071156904101372, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 28.625, |
| "grad_norm_var": 7.667643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5604, |
| "loss/crossentropy": 2.227986590564251, |
| "loss/hidden": 3.30859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19392532519996167, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 34.0, |
| "grad_norm_var": 5.81015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6077, |
| "loss/crossentropy": 2.1948474526405333, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2092638686299324, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 34.5, |
| "grad_norm_var": 7.656184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4338, |
| "loss/crossentropy": 1.935844713449478, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.184383431263268, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 42.75, |
| "grad_norm_var": 17.27890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5645, |
| "loss/crossentropy": 2.2957077413797378, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2223764518275857, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.06275, |
| "grad_norm": 30.125, |
| "grad_norm_var": 31.021809895833332, |
| "learning_rate": 0.0001, |
| "loss": 7.5539, |
| "loss/crossentropy": 2.2163919866085053, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2129704337567091, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 33.25, |
| "grad_norm_var": 40.36764322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.08260739967227, |
| "loss/hidden": 3.4765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20597089193761348, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.06325, |
| "grad_norm": 35.5, |
| "grad_norm_var": 34.153125, |
| "learning_rate": 0.0001, |
| "loss": 7.559, |
| "loss/crossentropy": 2.0747475802898405, |
| "loss/hidden": 3.383203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1930603832937777, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 31.25, |
| "grad_norm_var": 12.302018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.609, |
| "loss/crossentropy": 2.1542711734771727, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19414089974015952, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 29.125, |
| "grad_norm_var": 34.333072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5103, |
| "loss/crossentropy": 2.1697633042931557, |
| "loss/hidden": 3.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18199986461549997, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 30.75, |
| "grad_norm_var": 21.8322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6335, |
| "loss/crossentropy": 2.20294488966465, |
| "loss/hidden": 3.35703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20988359525799752, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.06425, |
| "grad_norm": 36.5, |
| "grad_norm_var": 21.339322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.665, |
| "loss/crossentropy": 2.1367219746112824, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19768773801624775, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 42.5, |
| "grad_norm_var": 30.690625, |
| "learning_rate": 0.0001, |
| "loss": 7.6476, |
| "loss/crossentropy": 2.2177338540554046, |
| "loss/hidden": 3.605078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22041462864726782, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.06475, |
| "grad_norm": 34.0, |
| "grad_norm_var": 31.164583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6158, |
| "loss/crossentropy": 2.2423059731721877, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21196699403226377, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 34.25, |
| "grad_norm_var": 23.593684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5107, |
| "loss/crossentropy": 2.241020438075066, |
| "loss/hidden": 3.25859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18191274981945754, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.06525, |
| "grad_norm": 28.75, |
| "grad_norm_var": 30.975455729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5492, |
| "loss/crossentropy": 2.052956056594849, |
| "loss/hidden": 3.447265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021762602031231, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 32.5, |
| "grad_norm_var": 31.032291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.503, |
| "loss/crossentropy": 2.17630957365036, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19023955501616002, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.06575, |
| "grad_norm": 34.5, |
| "grad_norm_var": 6.083072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5728, |
| "loss/crossentropy": 2.178831994533539, |
| "loss/hidden": 3.38046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1946109678596258, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.01640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5928, |
| "loss/crossentropy": 2.0746659457683565, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19469854161143302, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 31.0, |
| "grad_norm_var": 21.718489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6008, |
| "loss/crossentropy": 2.202178010344505, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2072868559509516, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 31.375, |
| "grad_norm_var": 24.743489583333332, |
| "learning_rate": 0.0001, |
| "loss": 7.5532, |
| "loss/crossentropy": 2.0579031884670256, |
| "loss/hidden": 3.33671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18248077742755414, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.06675, |
| "grad_norm": 29.5, |
| "grad_norm_var": 6.5181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6173, |
| "loss/crossentropy": 2.0323975652456285, |
| "loss/hidden": 3.47265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20279558952897786, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 30.125, |
| "grad_norm_var": 9.7931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6202, |
| "loss/crossentropy": 2.170803511887789, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21887149531394243, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.06725, |
| "grad_norm": 28.75, |
| "grad_norm_var": 6.42890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6042, |
| "loss/crossentropy": 2.129136848449707, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21714741103351115, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 28.5, |
| "grad_norm_var": 5.76640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5307, |
| "loss/crossentropy": 2.2333458453416823, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19759367052465676, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.06775, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.925, |
| "learning_rate": 0.0001, |
| "loss": 7.5684, |
| "loss/crossentropy": 2.1930347234010696, |
| "loss/hidden": 3.3265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19194095116108656, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.0291666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6177, |
| "loss/crossentropy": 2.1858380883932114, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18830202352255582, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.06825, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.19765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6231, |
| "loss/crossentropy": 2.1380992412567137, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19919480197131634, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 34.75, |
| "grad_norm_var": 2.5872395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.7166, |
| "loss/crossentropy": 2.299161267280579, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21277474984526634, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 28.25, |
| "grad_norm_var": 10.6525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6385, |
| "loss/crossentropy": 2.0595856219530106, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20196862574666738, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.1885416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5306, |
| "loss/crossentropy": 2.1047334015369414, |
| "loss/hidden": 3.527734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20494798701256514, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.06925, |
| "grad_norm": 35.5, |
| "grad_norm_var": 2.5136418842603423e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5758, |
| "loss/crossentropy": 1.9750292956829072, |
| "loss/hidden": 3.49453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1999529790133238, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.513641880435452e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5901, |
| "loss/crossentropy": 2.1417267471551895, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20058641098439695, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.06975, |
| "grad_norm": 28.25, |
| "grad_norm_var": 155.97024739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5485, |
| "loss/crossentropy": 2.1048891723155974, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19070767909288405, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.499739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5996, |
| "loss/crossentropy": 2.176823168247938, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20276176873594523, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.07025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 4.684309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4426, |
| "loss/crossentropy": 2.041793665289879, |
| "loss/hidden": 3.3265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17560106106102466, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.005989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5811, |
| "loss/crossentropy": 2.1094699330627917, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19798500649631023, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.07075, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.320572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5475, |
| "loss/crossentropy": 2.1311514347791674, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21825175136327743, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.7549465266724797e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5823, |
| "loss/crossentropy": 2.120433983206749, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20115374326705932, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.7549465268743306e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5444, |
| "loss/crossentropy": 2.1518867701292037, |
| "loss/hidden": 3.398046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19730570819228888, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.317122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6362, |
| "loss/crossentropy": 2.137280356884003, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20695031639188527, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.07175, |
| "grad_norm": 34.0, |
| "grad_norm_var": 157.31015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6643, |
| "loss/crossentropy": 2.067914080619812, |
| "loss/hidden": 3.5015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22938031535595654, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 32.5, |
| "grad_norm_var": 151.77057291666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5762, |
| "loss/crossentropy": 2.0962916165590286, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19928023852407933, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.07225, |
| "grad_norm": 35.75, |
| "grad_norm_var": 104.17962239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.7066, |
| "loss/crossentropy": 2.1184468276798727, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1974171632900834, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 32.25, |
| "grad_norm_var": 116.02493489583334, |
| "learning_rate": 0.0001, |
| "loss": 7.6075, |
| "loss/crossentropy": 2.1381339877843857, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19655965138226747, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.07275, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9551432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5247, |
| "loss/crossentropy": 2.141963595151901, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19582534320652484, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.3014973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5588, |
| "loss/crossentropy": 2.077046422660351, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20551967658102513, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.07325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 8.734375, |
| "learning_rate": 0.0001, |
| "loss": 7.6195, |
| "loss/crossentropy": 2.0110410653054713, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1805876674130559, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4838.416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6623, |
| "loss/crossentropy": 2.1130379527807235, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2008265011012554, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 39.0, |
| "grad_norm_var": 57.90807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.59, |
| "loss/crossentropy": 2.1667733818292616, |
| "loss/hidden": 3.3109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19806477334350348, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 31.875, |
| "grad_norm_var": 31.228059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5131, |
| "loss/crossentropy": 2.2757086992263793, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.197940625064075, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.07425, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.996809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5841, |
| "loss/crossentropy": 2.143115535378456, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20177703239023687, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.1546223958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5363, |
| "loss/crossentropy": 2.2850147604942324, |
| "loss/hidden": 3.283984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19140432458370923, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.07475, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.310416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5339, |
| "loss/crossentropy": 2.104042625427246, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.211691821180284, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 32.5, |
| "grad_norm_var": 8.506705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5448, |
| "loss/crossentropy": 2.2270253866910936, |
| "loss/hidden": 3.4390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20328052509576083, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.07525, |
| "grad_norm": 46.75, |
| "grad_norm_var": 17.10390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6114, |
| "loss/crossentropy": 2.2623429775238035, |
| "loss/hidden": 3.32265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19479246698319913, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 31.75, |
| "grad_norm_var": 18.981184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5739, |
| "loss/crossentropy": 2.1012531995773314, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20767469964921476, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.07575, |
| "grad_norm": 45.25, |
| "grad_norm_var": 15.55, |
| "learning_rate": 0.0001, |
| "loss": 7.6153, |
| "loss/crossentropy": 2.090057593584061, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1894347405061126, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 32.0, |
| "grad_norm_var": 29.878125, |
| "learning_rate": 0.0001, |
| "loss": 7.6585, |
| "loss/crossentropy": 2.200800988078117, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19270651414990425, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 31.0, |
| "grad_norm_var": 10.167708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5696, |
| "loss/crossentropy": 2.0458697080612183, |
| "loss/hidden": 3.53046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20314501021057368, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 30.375, |
| "grad_norm_var": 10.0181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5747, |
| "loss/crossentropy": 2.2118399769067763, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22158339023590087, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.07675, |
| "grad_norm": 32.25, |
| "grad_norm_var": 0.9650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5631, |
| "loss/crossentropy": 2.1812553733587263, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.217013025470078, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 32.75, |
| "grad_norm_var": 0.9171223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.578, |
| "loss/crossentropy": 2.181536224484444, |
| "loss/hidden": 3.36953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19952490609139203, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.07725, |
| "grad_norm": 31.5, |
| "grad_norm_var": 19.015559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4878, |
| "loss/crossentropy": 2.122265163064003, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19191155321896075, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.199739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7471, |
| "loss/crossentropy": 2.210584083199501, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21269479542970657, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.07775, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.490625, |
| "learning_rate": 0.0001, |
| "loss": 7.6586, |
| "loss/crossentropy": 2.090894425660372, |
| "loss/hidden": 3.46328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19853799045085907, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 31.375, |
| "grad_norm_var": 27.370833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5818, |
| "loss/crossentropy": 2.044330509006977, |
| "loss/hidden": 3.50078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20974230151623488, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.07825, |
| "grad_norm": 31.75, |
| "grad_norm_var": 6.688997395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.654, |
| "loss/crossentropy": 2.1380004197359086, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20455153118818997, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.715625, |
| "learning_rate": 0.0001, |
| "loss": 7.6551, |
| "loss/crossentropy": 2.1576705113053323, |
| "loss/hidden": 3.4484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20728315506130457, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.2864583333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5478, |
| "loss/crossentropy": 2.233546493947506, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20845879297703504, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 34.25, |
| "grad_norm_var": 15.4728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6264, |
| "loss/crossentropy": 2.0873878210783006, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19576258175075054, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.07925, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.12265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5192, |
| "loss/crossentropy": 2.132347696274519, |
| "loss/hidden": 3.345703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1958464809693396, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.370247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5169, |
| "loss/crossentropy": 2.2280534476041796, |
| "loss/hidden": 3.35546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412651136517525, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.07975, |
| "grad_norm": 28.25, |
| "grad_norm_var": 4.989518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6576, |
| "loss/crossentropy": 2.138143754005432, |
| "loss/hidden": 3.58828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23302078600972892, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 30.375, |
| "grad_norm_var": 6.858072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5707, |
| "loss/crossentropy": 2.1512865126132965, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1957532402127981, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.08025, |
| "grad_norm": 39.0, |
| "grad_norm_var": 34.83515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6814, |
| "loss/crossentropy": 2.113873428106308, |
| "loss/hidden": 3.52890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22486987188458443, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 30.625, |
| "grad_norm_var": 33.244205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6542, |
| "loss/crossentropy": 2.153007471561432, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20893471594899893, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.08075, |
| "grad_norm": 31.25, |
| "grad_norm_var": 619.4145182291667, |
| "learning_rate": 0.0001, |
| "loss": 7.5091, |
| "loss/crossentropy": 2.169908273220062, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20535510070621968, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 30.0, |
| "grad_norm_var": 599.9247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5568, |
| "loss/crossentropy": 2.048283484578133, |
| "loss/hidden": 3.593359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21250182073563337, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.982291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5569, |
| "loss/crossentropy": 2.0828478574752807, |
| "loss/hidden": 3.496484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20137296654284, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 28.25, |
| "grad_norm_var": 17.9900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5869, |
| "loss/crossentropy": 2.28429861664772, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21088685300201176, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.08175, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.5587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6037, |
| "loss/crossentropy": 2.1336950808763504, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20057348478585482, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 33.75, |
| "grad_norm_var": 34.57076822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6129, |
| "loss/crossentropy": 2.0662791609764097, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22612145096063613, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.08225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 18.8181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5469, |
| "loss/crossentropy": 2.069541847705841, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1844556663185358, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 3.0712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6025, |
| "loss/crossentropy": 2.2499852567911147, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21941267363727093, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.08275, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.29140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6334, |
| "loss/crossentropy": 2.1277933359146117, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19795072823762894, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.41015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6326, |
| "loss/crossentropy": 2.214772176742554, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20066177062690257, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.08325, |
| "grad_norm": 29.875, |
| "grad_norm_var": 18.768489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.569, |
| "loss/crossentropy": 2.093920087814331, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021244278177619, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 30.25, |
| "grad_norm_var": 18.695833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5041, |
| "loss/crossentropy": 2.0556397035717966, |
| "loss/hidden": 3.34765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17018966227769852, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 28.875, |
| "grad_norm_var": 2.854622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5837, |
| "loss/crossentropy": 2.0227382972836496, |
| "loss/hidden": 3.4890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21123163159936667, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.894791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5374, |
| "loss/crossentropy": 2.146658593416214, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18999559991061687, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.08425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.5759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6197, |
| "loss/crossentropy": 1.9798088841140271, |
| "loss/hidden": 3.569921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20031734639778734, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.6436848958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5738, |
| "loss/crossentropy": 2.1442878276109694, |
| "loss/hidden": 3.458984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20664554908871652, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.08475, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.9875, |
| "learning_rate": 0.0001, |
| "loss": 7.4965, |
| "loss/crossentropy": 2.150037130713463, |
| "loss/hidden": 3.3625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19511928483843805, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.2122395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5151, |
| "loss/crossentropy": 2.0501646161079408, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19615819547325372, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.08525, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.720833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6432, |
| "loss/crossentropy": 2.2328430742025374, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19816372059285642, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 31.375, |
| "grad_norm_var": 7.519791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.665, |
| "loss/crossentropy": 2.1567111521959306, |
| "loss/hidden": 3.500390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19839788805693387, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.08575, |
| "grad_norm": 40.25, |
| "grad_norm_var": 7.808072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5919, |
| "loss/crossentropy": 2.2465982705354692, |
| "loss/hidden": 3.46328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21547308284789324, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 33.0, |
| "grad_norm_var": 8.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6239, |
| "loss/crossentropy": 2.1680932968854902, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21484404131770135, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.070572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4802, |
| "loss/crossentropy": 2.050510385632515, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18613745234906673, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.2143229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5311, |
| "loss/crossentropy": 2.0699805982410906, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1896037317812443, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.08675, |
| "grad_norm": 31.875, |
| "grad_norm_var": 5.651822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.596, |
| "loss/crossentropy": 2.0576356425881386, |
| "loss/hidden": 3.522265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20867941789329053, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 29.625, |
| "grad_norm_var": 5.998372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4738, |
| "loss/crossentropy": 2.2489717990159988, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18595699593424797, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.08725, |
| "grad_norm": 28.5, |
| "grad_norm_var": 5.2056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5246, |
| "loss/crossentropy": 2.178911143541336, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204429741948843, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 28.25, |
| "grad_norm_var": 1.8134765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4901, |
| "loss/crossentropy": 2.1816289871931076, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20369651466608046, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.08775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 59.895572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6056, |
| "loss/crossentropy": 2.124045217037201, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2361205333843827, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 31.375, |
| "grad_norm_var": 58.15358072916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6185, |
| "loss/crossentropy": 2.0576027542352677, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19163726801052688, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.08825, |
| "grad_norm": 30.875, |
| "grad_norm_var": 19.622330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.608, |
| "loss/crossentropy": 2.2166375398635862, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20556784830987454, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.76875, |
| "learning_rate": 0.0001, |
| "loss": 7.5655, |
| "loss/crossentropy": 2.215288892388344, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21560241151601076, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.316666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4909, |
| "loss/crossentropy": 2.0867604553699493, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992955395951867, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 33.25, |
| "grad_norm_var": 286.81555989583336, |
| "learning_rate": 0.0001, |
| "loss": 7.716, |
| "loss/crossentropy": 2.2209325939416886, |
| "loss/hidden": 3.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19783683270215988, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.08925, |
| "grad_norm": 34.75, |
| "grad_norm_var": 286.8197916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5905, |
| "loss/crossentropy": 2.098256954550743, |
| "loss/hidden": 3.309765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.184653827175498, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.868489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.625, |
| "loss/crossentropy": 2.0555992782115937, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19005871675908564, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.08975, |
| "grad_norm": 37.0, |
| "grad_norm_var": 7.585416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.542, |
| "loss/crossentropy": 2.0350914053618907, |
| "loss/hidden": 3.396484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.192917075753212, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 34.25, |
| "grad_norm_var": 26.8322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6028, |
| "loss/crossentropy": 2.249291920661926, |
| "loss/hidden": 3.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20490463990718127, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.09025, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.9457682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5738, |
| "loss/crossentropy": 2.201851597428322, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20158183835446836, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 30.125, |
| "grad_norm_var": 14.446809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5562, |
| "loss/crossentropy": 2.188534340262413, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22296689171344042, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.09075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 15.792643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5509, |
| "loss/crossentropy": 2.1134337186813354, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19732800796627997, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 29.875, |
| "grad_norm_var": 14.366080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6349, |
| "loss/crossentropy": 2.0555363953113557, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19681458938866853, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 33.25, |
| "grad_norm_var": 13.740559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6513, |
| "loss/crossentropy": 2.2402344048023224, |
| "loss/hidden": 3.46171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20878477580845356, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 32.5, |
| "grad_norm_var": 30.0150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5723, |
| "loss/crossentropy": 2.14640394449234, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20020358953624964, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.09175, |
| "grad_norm": 36.0, |
| "grad_norm_var": 49.2619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5782, |
| "loss/crossentropy": 2.2063133299350737, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19907979741692544, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 32.0, |
| "grad_norm_var": 15.4400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5501, |
| "loss/crossentropy": 2.1933626160025597, |
| "loss/hidden": 3.332421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19394716806709766, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.09225, |
| "grad_norm": 28.375, |
| "grad_norm_var": 13.159375, |
| "learning_rate": 0.0001, |
| "loss": 7.5258, |
| "loss/crossentropy": 2.0778674989938737, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19343881569802762, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 15.096809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4597, |
| "loss/crossentropy": 2.0565507017076015, |
| "loss/hidden": 3.389453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19122497290372847, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.09275, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.729622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5714, |
| "loss/crossentropy": 2.082234078645706, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2102882768958807, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.6254557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6155, |
| "loss/crossentropy": 2.2312920093536377, |
| "loss/hidden": 3.34609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18575988691300155, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.09325, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.6181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5344, |
| "loss/crossentropy": 2.2397233605384828, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20670964010059834, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.3124348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5839, |
| "loss/crossentropy": 2.2065307170152666, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20138480551540852, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 36.25, |
| "grad_norm_var": 9.07265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6468, |
| "loss/crossentropy": 2.192644628882408, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20751077253371478, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 32.25, |
| "grad_norm_var": 18.11015625, |
| "learning_rate": 0.0001, |
| "loss": 7.731, |
| "loss/crossentropy": 2.205154886841774, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20712865255773066, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.09425, |
| "grad_norm": 30.75, |
| "grad_norm_var": 30.259309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4267, |
| "loss/crossentropy": 2.0174816213548183, |
| "loss/hidden": 3.363671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1771852731704712, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.687239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5078, |
| "loss/crossentropy": 2.142752841114998, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2145843595266342, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.09475, |
| "grad_norm": 41.5, |
| "grad_norm_var": 28.512239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5586, |
| "loss/crossentropy": 2.2184203058481216, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2169014524668455, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 32.5, |
| "grad_norm_var": 29.637955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5796, |
| "loss/crossentropy": 2.0424424298107624, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19471338465809823, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.09525, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.846809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4697, |
| "loss/crossentropy": 2.2178969264030455, |
| "loss/hidden": 3.215625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18599100317806005, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 35.75, |
| "grad_norm_var": 5.571809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5899, |
| "loss/crossentropy": 2.147325333952904, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19797458127141, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.09575, |
| "grad_norm": 31.0, |
| "grad_norm_var": 19.242708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4524, |
| "loss/crossentropy": 2.222689136862755, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20618323031812907, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.659309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5371, |
| "loss/crossentropy": 2.045217160880566, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21105091590434313, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.6624348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4832, |
| "loss/crossentropy": 2.1246922612190247, |
| "loss/hidden": 3.37109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19520843997597695, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.840625, |
| "learning_rate": 0.0001, |
| "loss": 7.5247, |
| "loss/crossentropy": 2.1101455599069596, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19655006285756826, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.09675, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.0759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7123, |
| "loss/crossentropy": 2.1925098091363906, |
| "loss/hidden": 3.57265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21303071565926074, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.940559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6194, |
| "loss/crossentropy": 2.0254321210086346, |
| "loss/hidden": 3.504296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21549067068845035, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.09725, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.6895833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5341, |
| "loss/crossentropy": 2.18448192179203, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18761022239923478, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 14.326822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6378, |
| "loss/crossentropy": 2.0941652059555054, |
| "loss/hidden": 3.45703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20138136427849532, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.09775, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.1186848958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6588, |
| "loss/crossentropy": 2.09155390933156, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2071656842716038, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.729622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6663, |
| "loss/crossentropy": 2.1858886659145353, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20006632767617702, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.09825, |
| "grad_norm": 40.25, |
| "grad_norm_var": 15.908072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5567, |
| "loss/crossentropy": 2.2354005187749864, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20157534964382648, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 31.625, |
| "grad_norm_var": 17.745768229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5189, |
| "loss/crossentropy": 2.015377716720104, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1868499366566539, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.66640625, |
| "learning_rate": 0.0001, |
| "loss": 7.533, |
| "loss/crossentropy": 2.0591939449310304, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20258171651512386, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 41.25, |
| "grad_norm_var": 3.3442041663272433e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4832, |
| "loss/crossentropy": 2.1117863088846205, |
| "loss/hidden": 3.559375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2428264247253537, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.09925, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.3442041639803904e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6253, |
| "loss/crossentropy": 2.1176558315753935, |
| "loss/hidden": 3.55546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22726768516004087, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 31.25, |
| "grad_norm_var": 19.5384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4413, |
| "loss/crossentropy": 2.088257111608982, |
| "loss/hidden": 3.34609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19193989606574177, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.09975, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.28125, |
| "learning_rate": 0.0001, |
| "loss": 7.6017, |
| "loss/crossentropy": 2.134918417036533, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19602251183241606, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.172916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5138, |
| "loss/crossentropy": 2.1410045489668845, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20588791109621524, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.10025, |
| "grad_norm": 30.0, |
| "grad_norm_var": 11.4306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5549, |
| "loss/crossentropy": 2.241489386558533, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20121449399739505, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 33.75, |
| "grad_norm_var": 5.220247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6037, |
| "loss/crossentropy": 2.177129751443863, |
| "loss/hidden": 3.4484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.195396139472723, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.10075, |
| "grad_norm": 31.375, |
| "grad_norm_var": 25.279622395833332, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.1074128076434135, |
| "loss/hidden": 3.40859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18998505976051092, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 30.375, |
| "grad_norm_var": 12.564518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4628, |
| "loss/crossentropy": 2.031256873905659, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19107761420309544, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 29.25, |
| "grad_norm_var": 8.242643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5906, |
| "loss/crossentropy": 2.2593255966901777, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20071447864174843, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 28.875, |
| "grad_norm_var": 7.72890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5008, |
| "loss/crossentropy": 2.1623566120862963, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20114662442356349, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.10175, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.5291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4901, |
| "loss/crossentropy": 2.1303680926561355, |
| "loss/hidden": 3.458984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19380100946873427, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.9583333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6067, |
| "loss/crossentropy": 2.180470046401024, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20080227889120578, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.10225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 43.7791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3949, |
| "loss/crossentropy": 2.04951853454113, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1938589910045266, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 31.5, |
| "grad_norm_var": 40.233333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5, |
| "loss/crossentropy": 1.9436089858412742, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1772780598141253, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.10275, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.14765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5926, |
| "loss/crossentropy": 2.1081591993570328, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20672952029854058, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 31.875, |
| "grad_norm_var": 11.225455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6334, |
| "loss/crossentropy": 2.0973087579011915, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20208985283970832, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.10325, |
| "grad_norm": 29.375, |
| "grad_norm_var": 25.839583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4644, |
| "loss/crossentropy": 2.205477836728096, |
| "loss/hidden": 3.3546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19515758529305458, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.6567057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4746, |
| "loss/crossentropy": 2.1042226657271383, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850555408746004, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.5952473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5805, |
| "loss/crossentropy": 2.1080187141895292, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20642356667667627, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 34.75, |
| "grad_norm_var": 12.11875, |
| "learning_rate": 0.0001, |
| "loss": 7.6864, |
| "loss/crossentropy": 2.1868012815713884, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19968394786119462, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.10425, |
| "grad_norm": 32.25, |
| "grad_norm_var": 11.406184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6461, |
| "loss/crossentropy": 2.0963368862867355, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21419077794998884, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 31.0, |
| "grad_norm_var": 11.670833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6029, |
| "loss/crossentropy": 2.100097879767418, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18958222791552543, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.10475, |
| "grad_norm": 29.375, |
| "grad_norm_var": 7.664518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5954, |
| "loss/crossentropy": 2.2243005722761153, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19990855641663074, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.333072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6671, |
| "loss/crossentropy": 2.250749832391739, |
| "loss/hidden": 3.308203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19082491770386695, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.10525, |
| "grad_norm": 29.25, |
| "grad_norm_var": 10.568489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6118, |
| "loss/crossentropy": 2.155411234498024, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2127680890262127, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 34.25, |
| "grad_norm_var": 8.6556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4391, |
| "loss/crossentropy": 2.037487879395485, |
| "loss/hidden": 3.31015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17465929109603168, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.10575, |
| "grad_norm": 30.625, |
| "grad_norm_var": 8.183072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6537, |
| "loss/crossentropy": 2.067080709338188, |
| "loss/hidden": 3.316796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857584908604622, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.4171223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.608, |
| "loss/crossentropy": 2.279679241776466, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.206354571133852, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 31.75, |
| "grad_norm_var": 31.7681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5259, |
| "loss/crossentropy": 2.1007855504751207, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1838985349982977, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 28.625, |
| "grad_norm_var": 2.5104166666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5786, |
| "loss/crossentropy": 2.1117694169282912, |
| "loss/hidden": 3.465625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2134232448413968, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.10675, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.585416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5424, |
| "loss/crossentropy": 2.1906253546476364, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19627480674535036, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 33.75, |
| "grad_norm_var": 16.780208333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7039, |
| "loss/crossentropy": 2.167680537700653, |
| "loss/hidden": 3.469921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21632006093859674, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.10725, |
| "grad_norm": 32.75, |
| "grad_norm_var": 14.226497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6197, |
| "loss/crossentropy": 2.2265933483839033, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20777842812240124, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.06640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5019, |
| "loss/crossentropy": 2.0171881064772608, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1733700342476368, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.10775, |
| "grad_norm": 32.75, |
| "grad_norm_var": 39.37473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6624, |
| "loss/crossentropy": 2.052209459245205, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2088288875296712, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.460724588873515e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5801, |
| "loss/crossentropy": 2.096518099308014, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093063434585929, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.10825, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.4607245888865874e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4793, |
| "loss/crossentropy": 2.1164163142442702, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19930963944643737, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 33.5, |
| "grad_norm_var": 46.587239583333336, |
| "learning_rate": 0.0001, |
| "loss": 7.5915, |
| "loss/crossentropy": 2.1831407219171526, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21474861968308687, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 31.0, |
| "grad_norm_var": 17.534375, |
| "learning_rate": 0.0001, |
| "loss": 7.4743, |
| "loss/crossentropy": 2.1157304018735887, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2028682116419077, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 28.625, |
| "grad_norm_var": 4.282291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6076, |
| "loss/crossentropy": 1.980335572361946, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20984734632074833, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.10925, |
| "grad_norm": 31.25, |
| "grad_norm_var": 30.812955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5349, |
| "loss/crossentropy": 2.107056123018265, |
| "loss/hidden": 3.25859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18440892472863196, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 30.375, |
| "grad_norm_var": 21.462239583333332, |
| "learning_rate": 0.0001, |
| "loss": 7.5459, |
| "loss/crossentropy": 2.2860846698284147, |
| "loss/hidden": 3.297265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1911198776215315, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.10975, |
| "grad_norm": 30.0, |
| "grad_norm_var": 5.008333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.59, |
| "loss/crossentropy": 2.150055022537708, |
| "loss/hidden": 3.396875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20779079720377922, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 37.25, |
| "grad_norm_var": 24.933268229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.6256, |
| "loss/crossentropy": 2.225931641459465, |
| "loss/hidden": 3.569921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23082431070506573, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.11025, |
| "grad_norm": 29.375, |
| "grad_norm_var": 26.770572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5585, |
| "loss/crossentropy": 2.1318808451294897, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.205277425237, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 28.25, |
| "grad_norm_var": 6.009830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5037, |
| "loss/crossentropy": 2.108688759803772, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19271691460162402, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.11075, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.85390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5599, |
| "loss/crossentropy": 2.1696368783712385, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19752040579915048, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 30.25, |
| "grad_norm_var": 18.476822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5321, |
| "loss/crossentropy": 2.167906680703163, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2087454443797469, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.6634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6168, |
| "loss/crossentropy": 2.0555444791913033, |
| "loss/hidden": 3.325, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17809300348162652, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.618489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.669, |
| "loss/crossentropy": 2.2362961381673814, |
| "loss/hidden": 3.480078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21373681984841825, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.11175, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.8311848958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.469, |
| "loss/crossentropy": 2.207303923368454, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21020539589226245, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 66.0, |
| "grad_norm_var": 76.79973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.170773930847645, |
| "loss/hidden": 3.4703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20324019938707352, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.11225, |
| "grad_norm": 35.0, |
| "grad_norm_var": 76.34895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5149, |
| "loss/crossentropy": 2.047274041175842, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18544426914304496, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 28.25, |
| "grad_norm_var": 4.081705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5745, |
| "loss/crossentropy": 2.1270942091941833, |
| "loss/hidden": 3.311328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19201683439314365, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.11275, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.6192057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.632, |
| "loss/crossentropy": 2.0815075978636743, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20613169986754656, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 33.25, |
| "grad_norm_var": 166.34212239583334, |
| "learning_rate": 0.0001, |
| "loss": 7.5383, |
| "loss/crossentropy": 2.144553080201149, |
| "loss/hidden": 3.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19253196399658917, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.11325, |
| "grad_norm": 35.75, |
| "grad_norm_var": 1.734519148252968e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5876, |
| "loss/crossentropy": 2.212298333644867, |
| "loss/hidden": 3.34765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19719784446060656, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 30.625, |
| "grad_norm_var": 32.881184895833336, |
| "learning_rate": 0.0001, |
| "loss": 7.4563, |
| "loss/crossentropy": 2.095240616798401, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19202221632003785, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 31.625, |
| "grad_norm_var": 87.4853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5184, |
| "loss/crossentropy": 2.134147650748491, |
| "loss/hidden": 3.340625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19104634067043663, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 32.25, |
| "grad_norm_var": 78.95618489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.4668, |
| "loss/crossentropy": 2.1294006586074827, |
| "loss/hidden": 3.326953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1853051505982876, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.11425, |
| "grad_norm": 30.5, |
| "grad_norm_var": 4.706705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5114, |
| "loss/crossentropy": 2.0886638939380644, |
| "loss/hidden": 3.452734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20859680250287055, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 35.5, |
| "grad_norm_var": 5.934830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.56, |
| "loss/crossentropy": 2.1498399868607523, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18966037435457112, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.11475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.358268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6013, |
| "loss/crossentropy": 2.199784816801548, |
| "loss/hidden": 3.347265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18477672804147005, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 31.375, |
| "grad_norm_var": 17.598372395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5806, |
| "loss/crossentropy": 2.165570431947708, |
| "loss/hidden": 3.505859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2113142903894186, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.11525, |
| "grad_norm": 31.75, |
| "grad_norm_var": 11.908268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6265, |
| "loss/crossentropy": 2.07996127307415, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21436248067766428, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 28.125, |
| "grad_norm_var": 4.520833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.57, |
| "loss/crossentropy": 2.1484976023435594, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1913412597030401, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.11575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 4.1875, |
| "learning_rate": 0.0001, |
| "loss": 7.4701, |
| "loss/crossentropy": 2.0956287920475005, |
| "loss/hidden": 3.39140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19558896012604238, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.106184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4538, |
| "loss/crossentropy": 2.1474092990159988, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2039597311988473, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.520768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6657, |
| "loss/crossentropy": 2.1128339886665346, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21120875477790832, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.133072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5441, |
| "loss/crossentropy": 2.1643586844205855, |
| "loss/hidden": 3.4765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20440610516816377, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.11675, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.504622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6051, |
| "loss/crossentropy": 2.195269528031349, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20059011913836003, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 30.0, |
| "grad_norm_var": 31.235416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4526, |
| "loss/crossentropy": 2.082709529995918, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19617473538964986, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.11725, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.45625, |
| "learning_rate": 0.0001, |
| "loss": 7.5593, |
| "loss/crossentropy": 2.1473275452852247, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20148523468524218, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.145768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4609, |
| "loss/crossentropy": 2.14631325006485, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20976583026349543, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.11775, |
| "grad_norm": 32.5, |
| "grad_norm_var": 33.20826822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5877, |
| "loss/crossentropy": 2.1093395471572878, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20207433141767978, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 30.75, |
| "grad_norm_var": 10.86875, |
| "learning_rate": 0.0001, |
| "loss": 7.5437, |
| "loss/crossentropy": 2.1447531282901764, |
| "loss/hidden": 3.353515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1963239062577486, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.11825, |
| "grad_norm": 34.5, |
| "grad_norm_var": 4.473372395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6248, |
| "loss/crossentropy": 2.1142423778772352, |
| "loss/hidden": 3.283203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19059138614684343, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 27.875, |
| "grad_norm_var": 3.098372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5631, |
| "loss/crossentropy": 2.07417613863945, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19420854579657315, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.734375, |
| "learning_rate": 0.0001, |
| "loss": 7.6499, |
| "loss/crossentropy": 2.0995796024799347, |
| "loss/hidden": 3.518359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21285411342978477, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.7228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.169099047780037, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22147531677037477, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.11925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 34.799739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.435, |
| "loss/crossentropy": 2.0821361050009726, |
| "loss/hidden": 3.2953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17594938166439533, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 33.75, |
| "grad_norm_var": 23.00390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6152, |
| "loss/crossentropy": 2.183753404021263, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19579742290079594, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.11975, |
| "grad_norm": 29.25, |
| "grad_norm_var": 23.6541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5295, |
| "loss/crossentropy": 2.1669270396232605, |
| "loss/hidden": 3.505859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21605710163712502, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.624739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6351, |
| "loss/crossentropy": 2.116096779704094, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21523005720227956, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.12025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 6.009375, |
| "learning_rate": 0.0001, |
| "loss": 7.5683, |
| "loss/crossentropy": 2.051844981312752, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17574754767119885, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.0087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5892, |
| "loss/crossentropy": 2.2223299980163573, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19802077617496253, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.12075, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.1080729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5951, |
| "loss/crossentropy": 2.1982655793428423, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2064062263816595, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 30.0, |
| "grad_norm_var": 12.412434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5254, |
| "loss/crossentropy": 1.9753010511398315, |
| "loss/hidden": 3.4484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19706314485520124, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 30.125, |
| "grad_norm_var": 13.1447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5351, |
| "loss/crossentropy": 2.0562877766788006, |
| "loss/hidden": 3.396875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17667807769030333, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.465625, |
| "learning_rate": 0.0001, |
| "loss": 7.6024, |
| "loss/crossentropy": 2.1578008987009527, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18234786652028562, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.12175, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.343489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4726, |
| "loss/crossentropy": 2.1330083698034286, |
| "loss/hidden": 3.333203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18738476932048798, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.2322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.605, |
| "loss/crossentropy": 2.1732089832425117, |
| "loss/hidden": 3.460546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19949225690215827, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.12225, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.238997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5382, |
| "loss/crossentropy": 2.188927575945854, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19776681065559387, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 29.0, |
| "grad_norm_var": 15.819791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5728, |
| "loss/crossentropy": 2.202793037891388, |
| "loss/hidden": 3.36328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992744604125619, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.12275, |
| "grad_norm": 30.25, |
| "grad_norm_var": 16.1775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.426, |
| "loss/crossentropy": 2.177696394920349, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19968770015984774, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 35.5, |
| "grad_norm_var": 7.707291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7485, |
| "loss/crossentropy": 2.040875867009163, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929738214239478, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.12325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.771809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5517, |
| "loss/crossentropy": 2.199721184372902, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20861553251743317, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 28.75, |
| "grad_norm_var": 4.235416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5576, |
| "loss/crossentropy": 2.133353302627802, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21631606128066777, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 40.0, |
| "grad_norm_var": 15.1650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5129, |
| "loss/crossentropy": 2.2380867928266523, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2078275766223669, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 32.5, |
| "grad_norm_var": 9.342643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4421, |
| "loss/crossentropy": 2.1563473463058473, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852986102923751, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.12425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.3916666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5503, |
| "loss/crossentropy": 2.124380439519882, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20270956568419934, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 30.875, |
| "grad_norm_var": 5.837434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4857, |
| "loss/crossentropy": 2.0482941284775733, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18650466352701187, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.12475, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.0858723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5446, |
| "loss/crossentropy": 2.160063311457634, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18700905814766883, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 28.75, |
| "grad_norm_var": 5.533333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5333, |
| "loss/crossentropy": 2.056824396550655, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18965724110603333, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.12525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 32.25807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5864, |
| "loss/crossentropy": 2.2407308876514436, |
| "loss/hidden": 3.51328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22782632596790792, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.1255, |
| "grad_norm": 31.375, |
| "grad_norm_var": 32.7353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5671, |
| "loss/crossentropy": 2.18543721139431, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19135277662426234, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.12575, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.814322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.578, |
| "loss/crossentropy": 2.1418938025832177, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20001544915139674, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.3238932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5594, |
| "loss/crossentropy": 2.1838817209005357, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2107716018334031, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.12625, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.47890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5275, |
| "loss/crossentropy": 2.2816754072904586, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.201267159730196, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.1265, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.2087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6079, |
| "loss/crossentropy": 2.0686806365847588, |
| "loss/hidden": 3.603125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2172813605517149, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.12675, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.19765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5561, |
| "loss/crossentropy": 2.1800751775503158, |
| "loss/hidden": 3.389453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20199469216167926, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.8643229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4702, |
| "loss/crossentropy": 2.2943209201097488, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1958466824144125, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.12725, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.9030598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4383, |
| "loss/crossentropy": 2.291040873527527, |
| "loss/hidden": 3.308203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18960105255246162, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.4504557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6139, |
| "loss/crossentropy": 2.116188834607601, |
| "loss/hidden": 3.5015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19662482757121325, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.12775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.5395182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5074, |
| "loss/crossentropy": 2.200083887577057, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19970641760155558, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.990625, |
| "learning_rate": 0.0001, |
| "loss": 7.5949, |
| "loss/crossentropy": 2.080552561581135, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19426519125699998, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.12825, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.200455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5036, |
| "loss/crossentropy": 2.138309660553932, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20698665007948874, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.1285, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.530989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6081, |
| "loss/crossentropy": 2.0686957597732545, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22128727175295354, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.12875, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.120833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5236, |
| "loss/crossentropy": 2.0967576891183852, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19716181065887212, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.8468098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5679, |
| "loss/crossentropy": 2.172739614546299, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1919636652804911, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.12925, |
| "grad_norm": 43.75, |
| "grad_norm_var": 17.77890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5027, |
| "loss/crossentropy": 2.1427868396043777, |
| "loss/hidden": 3.51953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20654744990170001, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.1295, |
| "grad_norm": 33.25, |
| "grad_norm_var": 61.1962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.52, |
| "loss/crossentropy": 2.1186717480421065, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1938714198768139, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.12975, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.07890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6231, |
| "loss/crossentropy": 2.072993017733097, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18984810579568148, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 36.25, |
| "grad_norm_var": 20.777083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.597, |
| "loss/crossentropy": 2.206225660443306, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2030201606452465, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.13025, |
| "grad_norm": 43.75, |
| "grad_norm_var": 24.43515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5075, |
| "loss/crossentropy": 2.228325179219246, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20732430163770915, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.1305, |
| "grad_norm": 30.5, |
| "grad_norm_var": 11.99765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5295, |
| "loss/crossentropy": 2.129982355237007, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19593548215925694, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.13075, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.292708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5261, |
| "loss/crossentropy": 2.1533152967691422, |
| "loss/hidden": 3.3296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19487107992172242, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 30.875, |
| "grad_norm_var": 12.989583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5566, |
| "loss/crossentropy": 2.195969894528389, |
| "loss/hidden": 3.48046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2189876638352871, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.13125, |
| "grad_norm": 33.25, |
| "grad_norm_var": 22.575, |
| "learning_rate": 0.0001, |
| "loss": 7.482, |
| "loss/crossentropy": 2.074573493748903, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19280508980154992, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.1315, |
| "grad_norm": 29.125, |
| "grad_norm_var": 15.599739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5395, |
| "loss/crossentropy": 2.227403500676155, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20514250732958317, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.13175, |
| "grad_norm": 31.625, |
| "grad_norm_var": 113.15618489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5209, |
| "loss/crossentropy": 2.055370827019215, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18272479642182587, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4798, |
| "loss/crossentropy": 2.1356967806816103, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737351574003698, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.13225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.879622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4715, |
| "loss/crossentropy": 2.177875056862831, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20798433478921652, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 29.5, |
| "grad_norm_var": 8.760872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5419, |
| "loss/crossentropy": 2.182955250144005, |
| "loss/hidden": 3.354296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18478553090244532, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.13275, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.260416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5816, |
| "loss/crossentropy": 2.0469735309481623, |
| "loss/hidden": 3.55859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18724320270121098, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.8801432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7815, |
| "loss/crossentropy": 2.1846155911684035, |
| "loss/hidden": 3.499609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21609773077070712, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.13325, |
| "grad_norm": 30.875, |
| "grad_norm_var": 5.584375, |
| "learning_rate": 0.0001, |
| "loss": 7.7067, |
| "loss/crossentropy": 2.0075640469789504, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20185804851353167, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.1335, |
| "grad_norm": 27.375, |
| "grad_norm_var": 6.120247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5345, |
| "loss/crossentropy": 2.058630608022213, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19536744449287652, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.13375, |
| "grad_norm": 52.5, |
| "grad_norm_var": 30.692643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6655, |
| "loss/crossentropy": 2.050611114501953, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19592730849981307, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 31.125, |
| "grad_norm_var": 28.90390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5509, |
| "loss/crossentropy": 2.1665500849485397, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19354073759168386, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.13425, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.9931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6343, |
| "loss/crossentropy": 2.1793171644210814, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20293663591146469, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.1345, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.8098307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6, |
| "loss/crossentropy": 2.152762657403946, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21694996021687984, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.13475, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.4301432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5121, |
| "loss/crossentropy": 2.083320555835962, |
| "loss/hidden": 3.3375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18746816255152227, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 29.625, |
| "grad_norm_var": 5.109830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4172, |
| "loss/crossentropy": 2.0639937698841093, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18618469405919313, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.13525, |
| "grad_norm": 31.125, |
| "grad_norm_var": 12.337434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5749, |
| "loss/crossentropy": 2.068472331762314, |
| "loss/hidden": 3.524609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18952864613384007, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.1355, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.9541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5385, |
| "loss/crossentropy": 2.085490897297859, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1854228163138032, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.13575, |
| "grad_norm": 30.0, |
| "grad_norm_var": 7.678125, |
| "learning_rate": 0.0001, |
| "loss": 7.5583, |
| "loss/crossentropy": 2.1747709423303605, |
| "loss/hidden": 3.444921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20177022367715836, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.6541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4911, |
| "loss/crossentropy": 2.167823739349842, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2082101447507739, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.13625, |
| "grad_norm": 30.625, |
| "grad_norm_var": 11.2087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.639, |
| "loss/crossentropy": 2.1367167800664904, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20548735409975052, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.1365, |
| "grad_norm": 28.375, |
| "grad_norm_var": 15.654166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4951, |
| "loss/crossentropy": 2.109811532497406, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1994595667347312, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.13675, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.463997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5263, |
| "loss/crossentropy": 2.2288592010736465, |
| "loss/hidden": 3.4140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20487434454262257, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.8400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5681, |
| "loss/crossentropy": 2.0951829612255097, |
| "loss/hidden": 3.455078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20034745894372463, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.13725, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.299934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4772, |
| "loss/crossentropy": 2.022165683656931, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18401878620497883, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 31.625, |
| "grad_norm_var": 4.205989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6869, |
| "loss/crossentropy": 2.095045933127403, |
| "loss/hidden": 3.609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21982598043978213, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.13775, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.177018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4483, |
| "loss/crossentropy": 2.07875951230526, |
| "loss/hidden": 3.2890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1898285737261176, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.4181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5258, |
| "loss/crossentropy": 2.194947564601898, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21397264376282693, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.13825, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.5541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6574, |
| "loss/crossentropy": 2.2282156944274902, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19613583907485008, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.1385, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.927018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5361, |
| "loss/crossentropy": 2.1092587068676947, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1894394876435399, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.13875, |
| "grad_norm": 30.875, |
| "grad_norm_var": 6.080989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6223, |
| "loss/crossentropy": 2.189313694834709, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.210250511392951, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 41.25, |
| "grad_norm_var": 8.976497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4876, |
| "loss/crossentropy": 2.046143325418234, |
| "loss/hidden": 3.457421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19476603530347347, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.13925, |
| "grad_norm": 31.25, |
| "grad_norm_var": 9.076497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5338, |
| "loss/crossentropy": 2.1882025837898254, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2058427443727851, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.1395, |
| "grad_norm": 30.25, |
| "grad_norm_var": 182.91875, |
| "learning_rate": 0.0001, |
| "loss": 7.6344, |
| "loss/crossentropy": 2.160731779038906, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22695957981050013, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.13975, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.140559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5419, |
| "loss/crossentropy": 1.8924851581454276, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1802460763603449, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.084830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5498, |
| "loss/crossentropy": 2.242933538556099, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22144901510328055, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.14025, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.5622395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5133, |
| "loss/crossentropy": 2.132666201889515, |
| "loss/hidden": 3.492578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19677439089864493, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.1405, |
| "grad_norm": 30.5, |
| "grad_norm_var": 14.234375, |
| "learning_rate": 0.0001, |
| "loss": 7.6226, |
| "loss/crossentropy": 2.0671760708093645, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20731164887547493, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.14075, |
| "grad_norm": 35.5, |
| "grad_norm_var": 14.44375, |
| "learning_rate": 0.0001, |
| "loss": 7.5063, |
| "loss/crossentropy": 2.060420323908329, |
| "loss/hidden": 3.346484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18428542967885733, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 38.5, |
| "grad_norm_var": 19.1509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5322, |
| "loss/crossentropy": 2.2572733104228973, |
| "loss/hidden": 3.291015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19184892270714043, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.14125, |
| "grad_norm": 32.0, |
| "grad_norm_var": 20.373893229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5473, |
| "loss/crossentropy": 2.1559308364987375, |
| "loss/hidden": 3.237109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18201812207698823, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.1415, |
| "grad_norm": 28.5, |
| "grad_norm_var": 10.033268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4529, |
| "loss/crossentropy": 2.1479016572237013, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20532293617725372, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.14175, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.6332682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6001, |
| "loss/crossentropy": 2.140682426095009, |
| "loss/hidden": 3.321484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18856783863157034, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.84140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.25071659386158, |
| "loss/hidden": 3.312109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2029257183894515, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.14225, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.643684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5323, |
| "loss/crossentropy": 2.149606391787529, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2141027996316552, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.32890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5615, |
| "loss/crossentropy": 2.2273970007896424, |
| "loss/hidden": 3.4703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22348886616528035, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.14275, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.8285807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4366, |
| "loss/crossentropy": 2.2010063380002975, |
| "loss/hidden": 3.289453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18594364672899247, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.6020182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.508, |
| "loss/crossentropy": 2.0483784288167954, |
| "loss/hidden": 3.40234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2091974811628461, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.14325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.6744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6646, |
| "loss/crossentropy": 2.025853230059147, |
| "loss/hidden": 3.589453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2227605242282152, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.1435, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.8327473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6064, |
| "loss/crossentropy": 2.075891287624836, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1890827091410756, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.14375, |
| "grad_norm": 31.625, |
| "grad_norm_var": 13.943489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4294, |
| "loss/crossentropy": 2.0676268830895426, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20394732002168894, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 30.0, |
| "grad_norm_var": 14.658072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5958, |
| "loss/crossentropy": 2.1388431686908005, |
| "loss/hidden": 3.499609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21454429486766458, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.14425, |
| "grad_norm": 32.0, |
| "grad_norm_var": 13.958072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5342, |
| "loss/crossentropy": 2.0246446818113326, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18158567249774932, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.1445, |
| "grad_norm": 28.375, |
| "grad_norm_var": 14.469205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5141, |
| "loss/crossentropy": 2.103305173665285, |
| "loss/hidden": 3.3234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1842075905762613, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.14475, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.160872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6213, |
| "loss/crossentropy": 2.096319726109505, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19167389422655107, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.4916666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4461, |
| "loss/crossentropy": 2.0800597339868547, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18269639350473882, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.14525, |
| "grad_norm": 45.5, |
| "grad_norm_var": 22.3306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4855, |
| "loss/crossentropy": 2.036512078344822, |
| "loss/hidden": 3.462890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1910469362512231, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.1455, |
| "grad_norm": 33.0, |
| "grad_norm_var": 23.284309895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.4838, |
| "loss/crossentropy": 2.1922324389219283, |
| "loss/hidden": 3.48671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2080824663862586, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.14575, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.7171223958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4657, |
| "loss/crossentropy": 2.0417690485715867, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19891292043030262, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5864, |
| "loss/crossentropy": 2.12301287651062, |
| "loss/hidden": 3.2953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19781142249703407, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.14625, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.574934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6264, |
| "loss/crossentropy": 2.1600560665130617, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22095333114266397, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.1465, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.3822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5231, |
| "loss/crossentropy": 2.185663253068924, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2029089467599988, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.14675, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.020247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4938, |
| "loss/crossentropy": 2.052996274828911, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19343450702726842, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.112239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5855, |
| "loss/crossentropy": 2.2062906324863434, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20249157436192036, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.14725, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.4330729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5644, |
| "loss/crossentropy": 2.0493671208620072, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19113321453332902, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.6968098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5941, |
| "loss/crossentropy": 2.20456420481205, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20127719175070524, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.14775, |
| "grad_norm": 29.625, |
| "grad_norm_var": 1.8268229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4958, |
| "loss/crossentropy": 2.2630672723054888, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20422303341329098, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.45, |
| "learning_rate": 0.0001, |
| "loss": 7.4893, |
| "loss/crossentropy": 2.130161929130554, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2029714561998844, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.14825, |
| "grad_norm": 31.375, |
| "grad_norm_var": 0.9728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6087, |
| "loss/crossentropy": 2.2219777315855027, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20015078466385602, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.1485, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.9514973958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5908, |
| "loss/crossentropy": 2.2077976912260056, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996950874105096, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.14875, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.3619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5402, |
| "loss/crossentropy": 2.1866880118846894, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19317448064684867, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.6968098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4202, |
| "loss/crossentropy": 2.1943173080682756, |
| "loss/hidden": 3.2390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18432049825787544, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.14925, |
| "grad_norm": 30.75, |
| "grad_norm_var": 16.936393229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5056, |
| "loss/crossentropy": 2.17142014503479, |
| "loss/hidden": 3.4046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1876060863956809, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.1495, |
| "grad_norm": 31.5, |
| "grad_norm_var": 4.03515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5442, |
| "loss/crossentropy": 2.2047942698001863, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1988227294757962, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.14975, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.332291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5693, |
| "loss/crossentropy": 2.281469625234604, |
| "loss/hidden": 3.337109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19730366189032794, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.7603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4556, |
| "loss/crossentropy": 2.256136628985405, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1971954844892025, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.15025, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.1285807291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5146, |
| "loss/crossentropy": 2.105835199356079, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19158641248941422, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.1505, |
| "grad_norm": 40.0, |
| "grad_norm_var": 18.542708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4687, |
| "loss/crossentropy": 2.095852518081665, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18887621480971575, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.15075, |
| "grad_norm": 30.5, |
| "grad_norm_var": 19.169791666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6293, |
| "loss/crossentropy": 2.0981243371963503, |
| "loss/hidden": 3.4875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22096464578062297, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.716080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5935, |
| "loss/crossentropy": 2.212895154953003, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22050899863243104, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.15125, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.042122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5, |
| "loss/crossentropy": 2.1280829131603243, |
| "loss/hidden": 3.337109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18497859146445988, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.1515, |
| "grad_norm": 29.25, |
| "grad_norm_var": 47.15807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5469, |
| "loss/crossentropy": 2.1731619790196417, |
| "loss/hidden": 3.4578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22194090783596038, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.15175, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.218489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4378, |
| "loss/crossentropy": 2.047077566385269, |
| "loss/hidden": 3.33359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1876080146059394, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 33.0, |
| "grad_norm_var": 5.45, |
| "learning_rate": 0.0001, |
| "loss": 7.584, |
| "loss/crossentropy": 2.1896773248910906, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19448568094521762, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.15225, |
| "grad_norm": 31.125, |
| "grad_norm_var": 3.7634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6047, |
| "loss/crossentropy": 2.1123378753662108, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1931258851662278, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.5488932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5301, |
| "loss/crossentropy": 2.2152541011571882, |
| "loss/hidden": 3.363671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19427606835961342, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.15275, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.7421223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5936, |
| "loss/crossentropy": 2.109275272488594, |
| "loss/hidden": 3.531640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2041168488562107, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.723893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5107, |
| "loss/crossentropy": 2.0911407291889192, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19639678560197354, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.15325, |
| "grad_norm": 36.0, |
| "grad_norm_var": 3.2561848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4788, |
| "loss/crossentropy": 2.1557929456233977, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2058223508298397, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.1535, |
| "grad_norm": 30.0, |
| "grad_norm_var": 84.3337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5656, |
| "loss/crossentropy": 2.0974055036902426, |
| "loss/hidden": 3.4140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19299208922311664, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.15375, |
| "grad_norm": 30.625, |
| "grad_norm_var": 85.70149739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.4607, |
| "loss/crossentropy": 2.0354842752218247, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2099142510443926, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 29.75, |
| "grad_norm_var": 9.5134765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4221, |
| "loss/crossentropy": 1.9916995614767075, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17939293198287487, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.15425, |
| "grad_norm": 33.5, |
| "grad_norm_var": 10.60390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5836, |
| "loss/crossentropy": 2.097960978746414, |
| "loss/hidden": 3.353515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1908213123679161, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.1545, |
| "grad_norm": 31.125, |
| "grad_norm_var": 3.0942057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5073, |
| "loss/crossentropy": 2.253797325491905, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19871626775711776, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.15475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.3900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5189, |
| "loss/crossentropy": 2.1756670624017715, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19654417987912892, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.4181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.3147, |
| "loss/crossentropy": 2.1110276162624357, |
| "loss/hidden": 3.294921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1801340376958251, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.15525, |
| "grad_norm": 28.5, |
| "grad_norm_var": 1.6223307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6015, |
| "loss/crossentropy": 2.1547244489192963, |
| "loss/hidden": 3.31171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18071307614445686, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.1555, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.36875, |
| "learning_rate": 0.0001, |
| "loss": 7.5559, |
| "loss/crossentropy": 2.1889162242412565, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.185367326810956, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.15575, |
| "grad_norm": 38.75, |
| "grad_norm_var": 25.381184895833332, |
| "learning_rate": 0.0001, |
| "loss": 7.5323, |
| "loss/crossentropy": 2.023681116104126, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19041543006896972, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 31.5, |
| "grad_norm_var": 12.545833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6086, |
| "loss/crossentropy": 2.260247975587845, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19489852003753186, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 30.5, |
| "grad_norm_var": 6.642122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5419, |
| "loss/crossentropy": 2.1487904608249666, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1957969294860959, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.1565, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.493489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.588, |
| "loss/crossentropy": 2.279403430223465, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20682295374572277, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.15675, |
| "grad_norm": 30.125, |
| "grad_norm_var": 8.14765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5894, |
| "loss/crossentropy": 2.1407866299152376, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20337907522916793, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1314.8994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6872, |
| "loss/crossentropy": 2.1802447110414507, |
| "loss/hidden": 3.540625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22727628983557224, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.15725, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1378.0379557291667, |
| "learning_rate": 0.0001, |
| "loss": 7.5596, |
| "loss/crossentropy": 2.1761491730809213, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18902392424643039, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 42.25, |
| "grad_norm_var": 146.03020833333332, |
| "learning_rate": 0.0001, |
| "loss": 7.6132, |
| "loss/crossentropy": 2.106969301402569, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2025001836940646, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.15775, |
| "grad_norm": 29.25, |
| "grad_norm_var": 14.539322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.514, |
| "loss/crossentropy": 2.0654825627803803, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19361731074750424, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 31.75, |
| "grad_norm_var": 8.153580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.025081543624401, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2247559091076255, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.15825, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.0431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4657, |
| "loss/crossentropy": 2.136493813991547, |
| "loss/hidden": 3.35546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19502629730850457, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.1585, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.189322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5454, |
| "loss/crossentropy": 2.176800549030304, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.201618373952806, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.15875, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5116, |
| "loss/crossentropy": 2.2223973661661147, |
| "loss/hidden": 3.43359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2056947773322463, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.0580729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5133, |
| "loss/crossentropy": 2.147826671600342, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19723634477704763, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.15925, |
| "grad_norm": 35.0, |
| "grad_norm_var": 18.772916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.679, |
| "loss/crossentropy": 2.079222206771374, |
| "loss/hidden": 3.47890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2045620433986187, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.1595, |
| "grad_norm": 28.75, |
| "grad_norm_var": 20.639518229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5245, |
| "loss/crossentropy": 2.163569709658623, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20100143328309059, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.15975, |
| "grad_norm": 36.75, |
| "grad_norm_var": 18.972916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5705, |
| "loss/crossentropy": 2.0975339651107787, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19847188219428064, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 31.375, |
| "grad_norm_var": 64.11770833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6051, |
| "loss/crossentropy": 2.0802886128425597, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20291686560958624, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.16025, |
| "grad_norm": 32.25, |
| "grad_norm_var": 74.28958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4922, |
| "loss/crossentropy": 2.1224244251847266, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.194674601405859, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.1605, |
| "grad_norm": 62.5, |
| "grad_norm_var": 69.86041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6584, |
| "loss/crossentropy": 2.1936039954423903, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19804685264825822, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.16075, |
| "grad_norm": 36.5, |
| "grad_norm_var": 66.04576822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.4329, |
| "loss/crossentropy": 2.103436988592148, |
| "loss/hidden": 3.4296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19700358025729656, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 28.875, |
| "grad_norm_var": 8.187239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5397, |
| "loss/crossentropy": 2.107574874162674, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1931281227618456, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.16125, |
| "grad_norm": 32.75, |
| "grad_norm_var": 20.245768229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5289, |
| "loss/crossentropy": 2.251564306020737, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20565700754523278, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.1615, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.187239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4563, |
| "loss/crossentropy": 2.0432717867195604, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17879284229129552, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.16175, |
| "grad_norm": 40.0, |
| "grad_norm_var": 8.5416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5052, |
| "loss/crossentropy": 2.130405417084694, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1868636028841138, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 30.625, |
| "grad_norm_var": 8.284375, |
| "learning_rate": 0.0001, |
| "loss": 7.4387, |
| "loss/crossentropy": 2.1113599717617033, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20160141140222548, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.16225, |
| "grad_norm": 30.5, |
| "grad_norm_var": 10.103125, |
| "learning_rate": 0.0001, |
| "loss": 7.4812, |
| "loss/crossentropy": 2.177107959985733, |
| "loss/hidden": 3.363671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19870908968150616, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 29.0, |
| "grad_norm_var": 12.816080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5634, |
| "loss/crossentropy": 2.2469130218029023, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22473043091595174, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.16275, |
| "grad_norm": 62.75, |
| "grad_norm_var": 67.71555989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.4504, |
| "loss/crossentropy": 2.1134796291589737, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20341392308473588, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 38.5, |
| "grad_norm_var": 132.73723958333332, |
| "learning_rate": 0.0001, |
| "loss": 7.6099, |
| "loss/crossentropy": 2.0655704945325852, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2016578745096922, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.16325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 117.9541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.668, |
| "loss/crossentropy": 2.171022225916386, |
| "loss/hidden": 3.575, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23951137959957122, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.1635, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.513541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5344, |
| "loss/crossentropy": 2.1631700932979583, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19134688563644886, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.16375, |
| "grad_norm": 31.25, |
| "grad_norm_var": 10.718489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5036, |
| "loss/crossentropy": 2.154293045401573, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2056902015581727, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 30.0, |
| "grad_norm_var": 4.233268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4959, |
| "loss/crossentropy": 2.077804160118103, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18005848359316587, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.16425, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.2343098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3855, |
| "loss/crossentropy": 2.238145849108696, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20202569160610437, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.1645, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.284375, |
| "learning_rate": 0.0001, |
| "loss": 7.5453, |
| "loss/crossentropy": 2.1169906362891195, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1980356415733695, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.16475, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.6848307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5151, |
| "loss/crossentropy": 2.165386658906937, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19807699434459208, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.905989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4242, |
| "loss/crossentropy": 2.2100414454936983, |
| "loss/hidden": 3.452734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21563767176121473, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.16525, |
| "grad_norm": 28.25, |
| "grad_norm_var": 21.574934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4757, |
| "loss/crossentropy": 2.1352153360843658, |
| "loss/hidden": 3.277734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.176172699034214, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.1655, |
| "grad_norm": 32.0, |
| "grad_norm_var": 9.960872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5596, |
| "loss/crossentropy": 2.1773680597543716, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18952815048396587, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.16575, |
| "grad_norm": 28.625, |
| "grad_norm_var": 4.380143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.56, |
| "loss/crossentropy": 2.1314379185438157, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19869489334523677, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 32.75, |
| "grad_norm_var": 65.24765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5285, |
| "loss/crossentropy": 2.1807857722043993, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20406192895025016, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.16625, |
| "grad_norm": 36.75, |
| "grad_norm_var": 61.514322916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.5774, |
| "loss/crossentropy": 2.1761706352233885, |
| "loss/hidden": 3.490625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22523548007011412, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.1665, |
| "grad_norm": 40.0, |
| "grad_norm_var": 10.858072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4837, |
| "loss/crossentropy": 2.0422482162714006, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19435476139187813, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.16675, |
| "grad_norm": 31.125, |
| "grad_norm_var": 9.977083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6686, |
| "loss/crossentropy": 2.182368849217892, |
| "loss/hidden": 3.3578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20404126346111298, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 32.0, |
| "grad_norm_var": 11.715625, |
| "learning_rate": 0.0001, |
| "loss": 7.5378, |
| "loss/crossentropy": 2.246191081404686, |
| "loss/hidden": 3.28359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18508785720914603, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.16725, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5093, |
| "loss/crossentropy": 2.1791825108230114, |
| "loss/hidden": 3.303515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18074298240244388, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 30.75, |
| "grad_norm_var": 8.992708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5574, |
| "loss/crossentropy": 2.1036971658468246, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1981421067379415, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.16775, |
| "grad_norm": 29.5, |
| "grad_norm_var": 14.267643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3344, |
| "loss/crossentropy": 2.001809497177601, |
| "loss/hidden": 3.39609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1832934280857444, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 61.25, |
| "grad_norm_var": 63.89375, |
| "learning_rate": 0.0001, |
| "loss": 7.5556, |
| "loss/crossentropy": 2.1391955494880674, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20318875089287758, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.16825, |
| "grad_norm": 33.75, |
| "grad_norm_var": 56.9791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.556, |
| "loss/crossentropy": 2.155173195898533, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19701551645994186, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.1685, |
| "grad_norm": 30.5, |
| "grad_norm_var": 9.07890625, |
| "learning_rate": 0.0001, |
| "loss": 7.444, |
| "loss/crossentropy": 2.1643510669469834, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19851317517459394, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.16875, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.964518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4711, |
| "loss/crossentropy": 2.1824121534824372, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20471413098275662, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.1009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4304, |
| "loss/crossentropy": 2.0600946068763735, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18951213210821152, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.16925, |
| "grad_norm": 34.25, |
| "grad_norm_var": 6.034309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.1303545042872427, |
| "loss/hidden": 3.32578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19376144148409366, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.1695, |
| "grad_norm": 35.75, |
| "grad_norm_var": 8.519205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4825, |
| "loss/crossentropy": 2.223961615562439, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1993227696046233, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.16975, |
| "grad_norm": 52.5, |
| "grad_norm_var": 31.4837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4688, |
| "loss/crossentropy": 2.1645863845944406, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19716022843495012, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 32.75, |
| "grad_norm_var": 29.0400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5153, |
| "loss/crossentropy": 2.251170714199543, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21400584764778613, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.17025, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.945833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4593, |
| "loss/crossentropy": 2.1392437756061553, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19613203704357146, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.1705, |
| "grad_norm": 31.5, |
| "grad_norm_var": 9.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4589, |
| "loss/crossentropy": 2.1152432590723036, |
| "loss/hidden": 3.330078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1848454039543867, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.17075, |
| "grad_norm": 33.5, |
| "grad_norm_var": 15.155208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5984, |
| "loss/crossentropy": 2.208776918053627, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20963791087269784, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 31.0, |
| "grad_norm_var": 14.967708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5348, |
| "loss/crossentropy": 2.1368553161621096, |
| "loss/hidden": 3.47734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2036329936236143, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.17125, |
| "grad_norm": 33.25, |
| "grad_norm_var": 11.615625, |
| "learning_rate": 0.0001, |
| "loss": 7.4543, |
| "loss/crossentropy": 2.2001425683498383, |
| "loss/hidden": 3.291015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18403711169958115, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.1715, |
| "grad_norm": 31.875, |
| "grad_norm_var": 4.045247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4924, |
| "loss/crossentropy": 2.0965635985136033, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19117785301059484, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.17175, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.587434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4879, |
| "loss/crossentropy": 2.106214761734009, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19950247537344695, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 30.75, |
| "grad_norm_var": 8.289518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4561, |
| "loss/crossentropy": 2.1776740878820418, |
| "loss/hidden": 3.386328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19133987911045552, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.17225, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.77265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4422, |
| "loss/crossentropy": 2.082959216833115, |
| "loss/hidden": 3.544921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2135216325521469, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.1556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6053, |
| "loss/crossentropy": 2.2018980890512467, |
| "loss/hidden": 3.495703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21013570427894593, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.17275, |
| "grad_norm": 39.5, |
| "grad_norm_var": 6.5322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5447, |
| "loss/crossentropy": 2.2833516895771027, |
| "loss/hidden": 3.3125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19603817090392112, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 31.125, |
| "grad_norm_var": 12.677018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4817, |
| "loss/crossentropy": 2.166572627425194, |
| "loss/hidden": 3.38671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19017961733043193, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.17325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.78125, |
| "learning_rate": 0.0001, |
| "loss": 7.6572, |
| "loss/crossentropy": 2.1658833861351012, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22312654759734868, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.1735, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.5747395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4706, |
| "loss/crossentropy": 2.1533073887228964, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20075356736779212, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.17375, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.611393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6678, |
| "loss/crossentropy": 2.2583969831466675, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2209668504074216, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 30.0, |
| "grad_norm_var": 11.0087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3805, |
| "loss/crossentropy": 2.1416700780391693, |
| "loss/hidden": 3.31953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20209096949547528, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.17425, |
| "grad_norm": 29.0, |
| "grad_norm_var": 5.171875, |
| "learning_rate": 0.0001, |
| "loss": 7.5472, |
| "loss/crossentropy": 2.1253055185079575, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19859984703361988, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.1745, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.6603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5648, |
| "loss/crossentropy": 2.182023701816797, |
| "loss/hidden": 3.29296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18740688636898994, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.17475, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.4947916666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4586, |
| "loss/crossentropy": 2.1650559276342394, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1944490736350417, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.6587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.569, |
| "loss/crossentropy": 2.177471086382866, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19053987860679628, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.17525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 26.516080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4753, |
| "loss/crossentropy": 2.154606765508652, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19910989850759506, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.1755, |
| "grad_norm": 34.5, |
| "grad_norm_var": 39.670572916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.631, |
| "loss/crossentropy": 2.197327023744583, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21263533756136893, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.17575, |
| "grad_norm": 32.25, |
| "grad_norm_var": 19.970833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5712, |
| "loss/crossentropy": 2.143507385253906, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20819550901651382, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.755989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6156, |
| "loss/crossentropy": 2.160289117693901, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2082687295973301, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.17625, |
| "grad_norm": 34.0, |
| "grad_norm_var": 1.6035807291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5106, |
| "loss/crossentropy": 2.096612122654915, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2005817864090204, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.1765, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.7921223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4678, |
| "loss/crossentropy": 2.150402194261551, |
| "loss/hidden": 3.31953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18803946152329445, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.17675, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.7754557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5601, |
| "loss/crossentropy": 2.111455664038658, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900298461318016, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.0497395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5888, |
| "loss/crossentropy": 2.172677582502365, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1932773429900408, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.17725, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.131705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.527, |
| "loss/crossentropy": 2.2066776901483536, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20891736317425966, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 33.0, |
| "grad_norm_var": 5.386713240151209e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6693, |
| "loss/crossentropy": 2.1923015132546424, |
| "loss/hidden": 3.3640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2022488683462143, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.17775, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.9284656292120494e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.557, |
| "loss/crossentropy": 2.0847490578889847, |
| "loss/hidden": 3.552734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2148862171918154, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.3369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4864, |
| "loss/crossentropy": 2.1249582156538964, |
| "loss/hidden": 3.296484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19168496485799552, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.17825, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.878125, |
| "learning_rate": 0.0001, |
| "loss": 7.36, |
| "loss/crossentropy": 2.173662793636322, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1978321423754096, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.1785, |
| "grad_norm": 34.5, |
| "grad_norm_var": 3.9723307291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5663, |
| "loss/crossentropy": 2.1832578271627425, |
| "loss/hidden": 3.4484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22951248846948147, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.17875, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.022330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5795, |
| "loss/crossentropy": 2.1229765862226486, |
| "loss/hidden": 3.28203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857854513451457, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 30.25, |
| "grad_norm_var": 42.5978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4524, |
| "loss/crossentropy": 2.1213574737310408, |
| "loss/hidden": 3.4671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22352912444621326, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.17925, |
| "grad_norm": 29.625, |
| "grad_norm_var": 44.90774739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5702, |
| "loss/crossentropy": 2.2016272962093355, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19035741928964853, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.1795, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.08515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5485, |
| "loss/crossentropy": 2.1106868594884873, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19449420645833015, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.17975, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.414322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4434, |
| "loss/crossentropy": 2.1368062049150467, |
| "loss/hidden": 3.389453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1976731013506651, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.5483723958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4368, |
| "loss/crossentropy": 2.1658909559249877, |
| "loss/hidden": 3.476171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897917227819562, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.18025, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.39765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5235, |
| "loss/crossentropy": 2.0003643825650217, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1881322082132101, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.1805, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.2369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5351, |
| "loss/crossentropy": 2.1550097078084947, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1885005658492446, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.18075, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.6809895833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3615, |
| "loss/crossentropy": 2.089699313044548, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18596011865884066, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.780989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.521, |
| "loss/crossentropy": 2.1315871119499206, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2100736267864704, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.18125, |
| "grad_norm": 34.75, |
| "grad_norm_var": 4.068684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4489, |
| "loss/crossentropy": 2.0509297475218773, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21798528153449298, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.1815, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.546875, |
| "learning_rate": 0.0001, |
| "loss": 7.5147, |
| "loss/crossentropy": 2.097806680202484, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954873636364937, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.18175, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.9270833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5335, |
| "loss/crossentropy": 2.1867170676589014, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20327441468834878, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.0434895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5098, |
| "loss/crossentropy": 2.059905408322811, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984263988211751, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.18225, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.0004557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6258, |
| "loss/crossentropy": 2.19571368098259, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20168307051062584, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.7051432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5923, |
| "loss/crossentropy": 2.12765002399683, |
| "loss/hidden": 3.518359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19315951522439717, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.18275, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.746875, |
| "learning_rate": 0.0001, |
| "loss": 7.5356, |
| "loss/crossentropy": 2.2317890375852585, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19801790416240692, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.206184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4805, |
| "loss/crossentropy": 2.115889000892639, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20432772096246482, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.18325, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.782291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5582, |
| "loss/crossentropy": 2.20442833006382, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083660595119, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.1835, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.6947916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6294, |
| "loss/crossentropy": 2.210834649205208, |
| "loss/hidden": 3.363671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19781918153166772, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.18375, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.7791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5733, |
| "loss/crossentropy": 2.094112278521061, |
| "loss/hidden": 3.41171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1902735486626625, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.3796223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4645, |
| "loss/crossentropy": 2.1219489932060243, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1875064203515649, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.18425, |
| "grad_norm": 31.0, |
| "grad_norm_var": 5.262955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5375, |
| "loss/crossentropy": 2.1768325984478, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20637014620006083, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.1845, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.51015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5972, |
| "loss/crossentropy": 2.088278591632843, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19868529513478278, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.18475, |
| "grad_norm": 28.125, |
| "grad_norm_var": 7.376041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4234, |
| "loss/crossentropy": 2.162215715646744, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20086740422993898, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 34.0, |
| "grad_norm_var": 141.65201822916666, |
| "learning_rate": 0.0001, |
| "loss": 7.4958, |
| "loss/crossentropy": 2.076205277442932, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904180521145463, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.18525, |
| "grad_norm": 31.125, |
| "grad_norm_var": 138.825, |
| "learning_rate": 0.0001, |
| "loss": 7.574, |
| "loss/crossentropy": 2.1355942092835902, |
| "loss/hidden": 3.47734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2197154738008976, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.1855, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.405847189186238e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4496, |
| "loss/crossentropy": 2.152878683805466, |
| "loss/hidden": 3.36953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18649988640099763, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.18575, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.40584718949382e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4482, |
| "loss/crossentropy": 2.100820633769035, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1907477645203471, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.299934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.436, |
| "loss/crossentropy": 2.1752007991075515, |
| "loss/hidden": 3.323828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19189766012132167, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.18625, |
| "grad_norm": 30.25, |
| "grad_norm_var": 5.689322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5691, |
| "loss/crossentropy": 2.242092598974705, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19479812681674957, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.1865, |
| "grad_norm": 30.625, |
| "grad_norm_var": 5.6369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6412, |
| "loss/crossentropy": 2.1058298379182814, |
| "loss/hidden": 3.4734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22841914147138595, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.18675, |
| "grad_norm": 29.0, |
| "grad_norm_var": 4.435872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3797, |
| "loss/crossentropy": 2.0960675440728664, |
| "loss/hidden": 3.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850858678109944, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.34765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5432, |
| "loss/crossentropy": 2.180506870150566, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20674382094293833, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.18725, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.718489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5242, |
| "loss/crossentropy": 2.101006045937538, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19774735551327466, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.7791666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5166, |
| "loss/crossentropy": 2.069665388762951, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19810831565409898, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.18775, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.07890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4959, |
| "loss/crossentropy": 2.0420530915260313, |
| "loss/hidden": 3.340625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18399292901158332, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 30.875, |
| "grad_norm_var": 4.0587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5325, |
| "loss/crossentropy": 2.1312338694930077, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19504649545997382, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.18825, |
| "grad_norm": 43.0, |
| "grad_norm_var": 9.308333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6141, |
| "loss/crossentropy": 2.2894584849476813, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20183086302131414, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.1885, |
| "grad_norm": 34.0, |
| "grad_norm_var": 10.281705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5206, |
| "loss/crossentropy": 2.1557983949780466, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19000910818576813, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.18875, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5249, |
| "loss/crossentropy": 2.124994584918022, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19348742607980968, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.334830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4858, |
| "loss/crossentropy": 2.154696786403656, |
| "loss/hidden": 3.33359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19012149404734374, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.18925, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.9872395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.487, |
| "loss/crossentropy": 2.1938005700707435, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19971045572310686, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.1895, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.5624176423088947e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4698, |
| "loss/crossentropy": 2.120267179608345, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19198095686733724, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.18975, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.756184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4743, |
| "loss/crossentropy": 2.1569736182689665, |
| "loss/hidden": 3.53984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21590765863656997, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.9166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4344, |
| "loss/crossentropy": 2.1558305487036704, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18447545487433673, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.19025, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.1025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4889, |
| "loss/crossentropy": 2.0612099058926105, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19961078523192555, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.1905, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.3087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6558, |
| "loss/crossentropy": 2.04723744392395, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22622217051684856, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.19075, |
| "grad_norm": 29.25, |
| "grad_norm_var": 5.770833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4133, |
| "loss/crossentropy": 2.1897004157304765, |
| "loss/hidden": 3.312109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1826148485764861, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.260416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4913, |
| "loss/crossentropy": 2.157025161385536, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22510031294077634, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.19125, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.2270182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.517, |
| "loss/crossentropy": 2.090579715371132, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889673013240099, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.1915, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.0770833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6179, |
| "loss/crossentropy": 2.1681595921516417, |
| "loss/hidden": 3.555078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2365275712683797, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.19175, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.4280598958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5348, |
| "loss/crossentropy": 2.207126745581627, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19211382810026406, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.796875, |
| "learning_rate": 0.0001, |
| "loss": 7.553, |
| "loss/crossentropy": 2.161148224771023, |
| "loss/hidden": 3.52734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22304172217845916, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.19225, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.8889973958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5009, |
| "loss/crossentropy": 2.0679882526397706, |
| "loss/hidden": 3.46015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2086691778153181, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.2552083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5372, |
| "loss/crossentropy": 2.0694913983345034, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19288846384733915, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.19275, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.8072916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5833, |
| "loss/crossentropy": 2.2242089927196504, |
| "loss/hidden": 3.306640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1934451697394252, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 34.5, |
| "grad_norm_var": 12.787955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6402, |
| "loss/crossentropy": 2.157019394636154, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19785581808537245, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.19325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 12.268489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3766, |
| "loss/crossentropy": 2.226413035392761, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20680981017649175, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.1935, |
| "grad_norm": 35.5, |
| "grad_norm_var": 6.34140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5761, |
| "loss/crossentropy": 2.0320019692182543, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19264598488807677, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.19375, |
| "grad_norm": 34.5, |
| "grad_norm_var": 5.897916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.453, |
| "loss/crossentropy": 2.0598455399274824, |
| "loss/hidden": 3.284375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1851517017930746, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 29.0, |
| "grad_norm_var": 6.7025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4278, |
| "loss/crossentropy": 2.1927490830421448, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20509992372244595, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.19425, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.700455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4874, |
| "loss/crossentropy": 2.154856140911579, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20458675045520067, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.1945, |
| "grad_norm": 37.5, |
| "grad_norm_var": 6.71015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5452, |
| "loss/crossentropy": 2.1202862530946733, |
| "loss/hidden": 3.3078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985650166869164, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.19475, |
| "grad_norm": 36.25, |
| "grad_norm_var": 6.918684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5427, |
| "loss/crossentropy": 2.1549450784921644, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18924216069281102, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 35.25, |
| "grad_norm_var": 6.2291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4733, |
| "loss/crossentropy": 2.0557655200362204, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1919045101851225, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.19525, |
| "grad_norm": 46.75, |
| "grad_norm_var": 50.040625, |
| "learning_rate": 0.0001, |
| "loss": 7.4911, |
| "loss/crossentropy": 2.171146012097597, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20546058733016254, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.1955, |
| "grad_norm": 33.25, |
| "grad_norm_var": 15.828580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5692, |
| "loss/crossentropy": 2.1003697514533997, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20100659225136042, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.19575, |
| "grad_norm": 36.0, |
| "grad_norm_var": 29.226497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5267, |
| "loss/crossentropy": 2.1091859377920628, |
| "loss/hidden": 3.35546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19987453976646066, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 48.5, |
| "grad_norm_var": 42.45201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.4625, |
| "loss/crossentropy": 2.111352452635765, |
| "loss/hidden": 3.31171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18483758307993411, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.19625, |
| "grad_norm": 30.5, |
| "grad_norm_var": 25.65390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5415, |
| "loss/crossentropy": 2.0419967979192735, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2173704007640481, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.1965, |
| "grad_norm": 30.875, |
| "grad_norm_var": 21.546875, |
| "learning_rate": 0.0001, |
| "loss": 7.4951, |
| "loss/crossentropy": 2.084785957634449, |
| "loss/hidden": 3.304296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17884304635226728, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.19675, |
| "grad_norm": 31.75, |
| "grad_norm_var": 18.153059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4509, |
| "loss/crossentropy": 2.1023138776421546, |
| "loss/hidden": 3.479296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20335167981684207, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 36.25, |
| "grad_norm_var": 12.093489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5467, |
| "loss/crossentropy": 2.171536546945572, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21221924368292094, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.19725, |
| "grad_norm": 28.75, |
| "grad_norm_var": 9.983333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3785, |
| "loss/crossentropy": 2.1119479715824125, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18313394635915756, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 37.25, |
| "grad_norm_var": 11.928580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4945, |
| "loss/crossentropy": 1.9945223838090897, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19973532520234585, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.19775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 11.839583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4542, |
| "loss/crossentropy": 2.1550634860992433, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19187356438487768, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 31.5, |
| "grad_norm_var": 12.584309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4245, |
| "loss/crossentropy": 2.092045524716377, |
| "loss/hidden": 3.315625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18641792330890894, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.19825, |
| "grad_norm": 28.875, |
| "grad_norm_var": 12.959375, |
| "learning_rate": 0.0001, |
| "loss": 7.4616, |
| "loss/crossentropy": 2.1302111998200415, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2105752557516098, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.1985, |
| "grad_norm": 30.75, |
| "grad_norm_var": 14.9431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4869, |
| "loss/crossentropy": 2.1739411771297457, |
| "loss/hidden": 3.455078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19759192429482936, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.19875, |
| "grad_norm": 33.25, |
| "grad_norm_var": 12.703580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4435, |
| "loss/crossentropy": 2.0400329381227493, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18711388055235148, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 40.0, |
| "grad_norm_var": 10.454622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3235, |
| "loss/crossentropy": 2.1173346668481825, |
| "loss/hidden": 3.383203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.200854654237628, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.19925, |
| "grad_norm": 31.125, |
| "grad_norm_var": 18.376822916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.551, |
| "loss/crossentropy": 2.256546127796173, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2098003163933754, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.1995, |
| "grad_norm": 30.625, |
| "grad_norm_var": 11.159375, |
| "learning_rate": 0.0001, |
| "loss": 7.5863, |
| "loss/crossentropy": 2.0988403081893923, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2052627831697464, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.19975, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.539322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5501, |
| "loss/crossentropy": 2.157486143708229, |
| "loss/hidden": 3.4296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19415573356673121, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 28.75, |
| "grad_norm_var": 5.282291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5487, |
| "loss/crossentropy": 2.204603946208954, |
| "loss/hidden": 3.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19015649259090422, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.20025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.59375, |
| "learning_rate": 0.0001, |
| "loss": 7.4113, |
| "loss/crossentropy": 2.1252332285046576, |
| "loss/hidden": 3.287109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1751571564003825, |
| "step": 8010 |
| }, |
| { |
| "epoch": 0.2005, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.603125, |
| "learning_rate": 0.0001, |
| "loss": 7.498, |
| "loss/crossentropy": 2.048645743727684, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1930268170312047, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.20075, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.3822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4496, |
| "loss/crossentropy": 2.1435810789465903, |
| "loss/hidden": 3.340625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19256617035716772, |
| "step": 8030 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 31.875, |
| "grad_norm_var": 5.8837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5991, |
| "loss/crossentropy": 2.06428082883358, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18795610759407283, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.20125, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.08515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5106, |
| "loss/crossentropy": 2.2008739590644835, |
| "loss/hidden": 3.329296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1890289282426238, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.2015, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.7708333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4302, |
| "loss/crossentropy": 2.1280903786420824, |
| "loss/hidden": 3.511328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20874056480824948, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.20175, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.2035807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4529, |
| "loss/crossentropy": 2.1864554077386855, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19697167426347734, |
| "step": 8070 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 35.0, |
| "grad_norm_var": 2.278125, |
| "learning_rate": 0.0001, |
| "loss": 7.5744, |
| "loss/crossentropy": 2.211556833982468, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20347684845328332, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.20225, |
| "grad_norm": 28.875, |
| "grad_norm_var": 33.37890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5018, |
| "loss/crossentropy": 2.185184845328331, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2022322118282318, |
| "step": 8090 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.3931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5218, |
| "loss/crossentropy": 2.1828758120536804, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1962041450664401, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.20275, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.9488932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4982, |
| "loss/crossentropy": 2.159992370009422, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20945403575897217, |
| "step": 8110 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.748958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4657, |
| "loss/crossentropy": 2.1400886207818983, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19184730034321545, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.20325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.0733723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5559, |
| "loss/crossentropy": 2.112087991833687, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20091654695570468, |
| "step": 8130 |
| }, |
| { |
| "epoch": 0.2035, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.921875, |
| "learning_rate": 0.0001, |
| "loss": 7.5644, |
| "loss/crossentropy": 2.139628532528877, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19385650400072335, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.20375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.379166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5002, |
| "loss/crossentropy": 2.091212958097458, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2040353151038289, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.2228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4494, |
| "loss/crossentropy": 2.2567614316940308, |
| "loss/hidden": 3.35078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18545747231692075, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.20425, |
| "grad_norm": 31.125, |
| "grad_norm_var": 229.56666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4626, |
| "loss/crossentropy": 2.1603645354509355, |
| "loss/hidden": 3.539453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21300703901797532, |
| "step": 8170 |
| }, |
| { |
| "epoch": 0.2045, |
| "grad_norm": 28.625, |
| "grad_norm_var": 4.2806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5117, |
| "loss/crossentropy": 2.1204755783081053, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18535650707781315, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.20475, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.1072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5637, |
| "loss/crossentropy": 2.1639575958251953, |
| "loss/hidden": 3.398828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20935236364603044, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 28.0, |
| "grad_norm_var": 2.2494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.3019, |
| "loss/crossentropy": 2.056389382481575, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1874819153919816, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.20525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.274934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5162, |
| "loss/crossentropy": 2.1393348813056945, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20481376871466636, |
| "step": 8210 |
| }, |
| { |
| "epoch": 0.2055, |
| "grad_norm": 29.625, |
| "grad_norm_var": 5.553125, |
| "learning_rate": 0.0001, |
| "loss": 7.5046, |
| "loss/crossentropy": 2.1898138865828516, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965622954070568, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.20575, |
| "grad_norm": 34.5, |
| "grad_norm_var": 2.899825551450536e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6057, |
| "loss/crossentropy": 2.139435464143753, |
| "loss/hidden": 3.51328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20182280670851468, |
| "step": 8230 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 37.75, |
| "grad_norm_var": 2.899825549761839e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5603, |
| "loss/crossentropy": 2.248063361644745, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21399107333272696, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.20625, |
| "grad_norm": 33.5, |
| "grad_norm_var": 114.23118489583334, |
| "learning_rate": 0.0001, |
| "loss": 7.4545, |
| "loss/crossentropy": 2.140778873860836, |
| "loss/hidden": 3.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18022785130888225, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.2065, |
| "grad_norm": 30.5, |
| "grad_norm_var": 110.5072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5364, |
| "loss/crossentropy": 2.259766247868538, |
| "loss/hidden": 3.312109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19342643208801746, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.20675, |
| "grad_norm": 31.75, |
| "grad_norm_var": 0.7666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6531, |
| "loss/crossentropy": 2.158949288725853, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19808744061738254, |
| "step": 8270 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.4311848958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6092, |
| "loss/crossentropy": 2.111227548122406, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20227425117045642, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.20725, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.56640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5372, |
| "loss/crossentropy": 2.0839753821492195, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18574036825448276, |
| "step": 8290 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 31.375, |
| "grad_norm_var": 0.8384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.536, |
| "loss/crossentropy": 2.1465844094753264, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2083981443196535, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.20775, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.8416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4722, |
| "loss/crossentropy": 2.1032681286334993, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19353157542645932, |
| "step": 8310 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.1395833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4453, |
| "loss/crossentropy": 2.092882976680994, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083120074123145, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.20825, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4658, |
| "loss/crossentropy": 2.047010327875614, |
| "loss/hidden": 3.26484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17298451280221344, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.2085, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.8114583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6227, |
| "loss/crossentropy": 2.1705314934253694, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2104344118386507, |
| "step": 8340 |
| }, |
| { |
| "epoch": 0.20875, |
| "grad_norm": 30.75, |
| "grad_norm_var": 15.13515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5292, |
| "loss/crossentropy": 2.251467025279999, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.206053901091218, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 30.25, |
| "grad_norm_var": 9.75, |
| "learning_rate": 0.0001, |
| "loss": 7.5297, |
| "loss/crossentropy": 2.2416324824094773, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18910795077681541, |
| "step": 8360 |
| }, |
| { |
| "epoch": 0.20925, |
| "grad_norm": 49.75, |
| "grad_norm_var": 177.46145833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4604, |
| "loss/crossentropy": 1.9895980581641197, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19730695113539695, |
| "step": 8370 |
| }, |
| { |
| "epoch": 0.2095, |
| "grad_norm": 56.25, |
| "grad_norm_var": 334.9145182291667, |
| "learning_rate": 0.0001, |
| "loss": 7.5698, |
| "loss/crossentropy": 2.2018245279788973, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21164307594299317, |
| "step": 8380 |
| }, |
| { |
| "epoch": 0.20975, |
| "grad_norm": 32.75, |
| "grad_norm_var": 72.22890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5902, |
| "loss/crossentropy": 2.1519288808107375, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19217228647321463, |
| "step": 8390 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 30.0, |
| "grad_norm_var": 7.153059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.451, |
| "loss/crossentropy": 2.1779698938131333, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1994494464248419, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.21025, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.124739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5019, |
| "loss/crossentropy": 2.140475983917713, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20144703481346368, |
| "step": 8410 |
| }, |
| { |
| "epoch": 0.2105, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.2639973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4268, |
| "loss/crossentropy": 2.2145755022764204, |
| "loss/hidden": 3.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19062853194773197, |
| "step": 8420 |
| }, |
| { |
| "epoch": 0.21075, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.620247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5149, |
| "loss/crossentropy": 2.3047141253948213, |
| "loss/hidden": 3.34140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20336541533470154, |
| "step": 8430 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.7514973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5319, |
| "loss/crossentropy": 2.232821634411812, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21666326355189086, |
| "step": 8440 |
| }, |
| { |
| "epoch": 0.21125, |
| "grad_norm": 33.5, |
| "grad_norm_var": 1.49765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5718, |
| "loss/crossentropy": 2.092620450258255, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20793931670486926, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.2115, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.1306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4344, |
| "loss/crossentropy": 2.151240213960409, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2046731175854802, |
| "step": 8460 |
| }, |
| { |
| "epoch": 0.21175, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.81640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5581, |
| "loss/crossentropy": 2.1520677715539933, |
| "loss/hidden": 3.379296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19603268206119537, |
| "step": 8470 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.092643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5382, |
| "loss/crossentropy": 2.1510326713323593, |
| "loss/hidden": 3.33359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18077819589525462, |
| "step": 8480 |
| }, |
| { |
| "epoch": 0.21225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.753059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4642, |
| "loss/crossentropy": 2.0811485938727854, |
| "loss/hidden": 3.351171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18939817287027835, |
| "step": 8490 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.855989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5158, |
| "loss/crossentropy": 2.0734725266695024, |
| "loss/hidden": 3.316796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18070020200684667, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.21275, |
| "grad_norm": 29.25, |
| "grad_norm_var": 5.5462890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4723, |
| "loss/crossentropy": 2.144814722239971, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19649898763746024, |
| "step": 8510 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.2309895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3777, |
| "loss/crossentropy": 2.0877407863736153, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19544295389205218, |
| "step": 8520 |
| }, |
| { |
| "epoch": 0.21325, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.6583333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5179, |
| "loss/crossentropy": 2.1433625385165214, |
| "loss/hidden": 3.4734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19359017722308636, |
| "step": 8530 |
| }, |
| { |
| "epoch": 0.2135, |
| "grad_norm": 34.25, |
| "grad_norm_var": 1.6510416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4658, |
| "loss/crossentropy": 2.221453693509102, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20547619834542274, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.21375, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.5791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5163, |
| "loss/crossentropy": 2.213938871026039, |
| "loss/hidden": 3.328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20213278364390136, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.784375, |
| "learning_rate": 0.0001, |
| "loss": 7.6361, |
| "loss/crossentropy": 2.133354830741882, |
| "loss/hidden": 3.512109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.207733928412199, |
| "step": 8560 |
| }, |
| { |
| "epoch": 0.21425, |
| "grad_norm": 34.0, |
| "grad_norm_var": 7.219205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.0587344974279405, |
| "loss/hidden": 3.550390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22256299555301667, |
| "step": 8570 |
| }, |
| { |
| "epoch": 0.2145, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.805989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5549, |
| "loss/crossentropy": 2.0949306935071945, |
| "loss/hidden": 3.4796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19931861553341151, |
| "step": 8580 |
| }, |
| { |
| "epoch": 0.21475, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.095247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.478, |
| "loss/crossentropy": 2.1063201159238814, |
| "loss/hidden": 3.459375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20397470220923425, |
| "step": 8590 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 33.5, |
| "grad_norm_var": 28.859375, |
| "learning_rate": 0.0001, |
| "loss": 7.5632, |
| "loss/crossentropy": 2.177382430434227, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2048982411623001, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.21525, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.7955729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5325, |
| "loss/crossentropy": 2.0719266816973687, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19100363925099373, |
| "step": 8610 |
| }, |
| { |
| "epoch": 0.2155, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.7372395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5877, |
| "loss/crossentropy": 2.147695118188858, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19447963647544383, |
| "step": 8620 |
| }, |
| { |
| "epoch": 0.21575, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.406184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5509, |
| "loss/crossentropy": 2.1029836043715475, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19613635465502738, |
| "step": 8630 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 29.0, |
| "grad_norm_var": 10.32265625, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.1027876287698746, |
| "loss/hidden": 3.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18709390722215175, |
| "step": 8640 |
| }, |
| { |
| "epoch": 0.21625, |
| "grad_norm": 31.75, |
| "grad_norm_var": 7.4978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.481, |
| "loss/crossentropy": 2.078503981232643, |
| "loss/hidden": 3.3578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19765212927013637, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.2165, |
| "grad_norm": 28.375, |
| "grad_norm_var": 6.212434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5951, |
| "loss/crossentropy": 2.1425766557455064, |
| "loss/hidden": 3.522265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2285369848832488, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.21675, |
| "grad_norm": 37.0, |
| "grad_norm_var": 5.02890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4619, |
| "loss/crossentropy": 2.094095268845558, |
| "loss/hidden": 3.33671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18905023839324714, |
| "step": 8670 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.205989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6255, |
| "loss/crossentropy": 2.0726657152175902, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19401923939585686, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.21725, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.8240281960963725e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.476, |
| "loss/crossentropy": 2.1063093028962614, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18630194589495658, |
| "step": 8690 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 33.0, |
| "grad_norm_var": 9.2947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6565, |
| "loss/crossentropy": 2.188694643974304, |
| "loss/hidden": 3.561328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21225023418664932, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.21775, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.644791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4697, |
| "loss/crossentropy": 2.1252513483166693, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19880297109484674, |
| "step": 8710 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 30.875, |
| "grad_norm_var": 11.028059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4975, |
| "loss/crossentropy": 2.1700349181890486, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20139609836041927, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.21825, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.4189153379885778e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.616, |
| "loss/crossentropy": 2.032346047461033, |
| "loss/hidden": 3.425, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1754102895502001, |
| "step": 8730 |
| }, |
| { |
| "epoch": 0.2185, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.25390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5088, |
| "loss/crossentropy": 2.1633499920368195, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996068175882101, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.6122395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5088, |
| "loss/crossentropy": 2.1443722933530807, |
| "loss/hidden": 3.501171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2017217763699591, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 32.25, |
| "grad_norm_var": 14.970572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5224, |
| "loss/crossentropy": 2.176077055931091, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2048908168449998, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.21925, |
| "grad_norm": 31.0, |
| "grad_norm_var": 14.087955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5417, |
| "loss/crossentropy": 2.183764386177063, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20730934757739305, |
| "step": 8770 |
| }, |
| { |
| "epoch": 0.2195, |
| "grad_norm": 46.5, |
| "grad_norm_var": 23.55390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5464, |
| "loss/crossentropy": 2.0760655224323274, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18335200790315867, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.21975, |
| "grad_norm": 31.125, |
| "grad_norm_var": 16.1369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5001, |
| "loss/crossentropy": 2.215297505259514, |
| "loss/hidden": 3.33671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19053485784679652, |
| "step": 8790 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 41.75, |
| "grad_norm_var": 10.250455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.1419076189398765, |
| "loss/hidden": 3.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1872857511974871, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.22025, |
| "grad_norm": 31.625, |
| "grad_norm_var": 13.099934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5152, |
| "loss/crossentropy": 2.136802351474762, |
| "loss/hidden": 3.368359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19524938464164734, |
| "step": 8810 |
| }, |
| { |
| "epoch": 0.2205, |
| "grad_norm": 29.0, |
| "grad_norm_var": 18.0634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6215, |
| "loss/crossentropy": 2.231374368071556, |
| "loss/hidden": 3.314453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18930297438055277, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.22075, |
| "grad_norm": 30.125, |
| "grad_norm_var": 18.358333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5354, |
| "loss/crossentropy": 2.2225961208343508, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19855661746114492, |
| "step": 8830 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 31.625, |
| "grad_norm_var": 18.312239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6216, |
| "loss/crossentropy": 2.1328855097293853, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20832530297338964, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.22125, |
| "grad_norm": 28.375, |
| "grad_norm_var": 8.085416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3697, |
| "loss/crossentropy": 2.0526474595069883, |
| "loss/hidden": 3.5015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21284359116107227, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.2215, |
| "grad_norm": 37.25, |
| "grad_norm_var": 9.942708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4967, |
| "loss/crossentropy": 2.1749773651361464, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20905132126063108, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.22175, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.322330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.503, |
| "loss/crossentropy": 2.108053085952997, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20318386573344469, |
| "step": 8870 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 34.0, |
| "grad_norm_var": 7.685416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3618, |
| "loss/crossentropy": 2.1234827637672424, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18484860006719828, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.22225, |
| "grad_norm": 29.75, |
| "grad_norm_var": 8.220247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.393, |
| "loss/crossentropy": 2.099182015657425, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19038616605103015, |
| "step": 8890 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.569205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4768, |
| "loss/crossentropy": 2.148054042458534, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2065868368372321, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.22275, |
| "grad_norm": 28.75, |
| "grad_norm_var": 6.009375, |
| "learning_rate": 0.0001, |
| "loss": 7.5001, |
| "loss/crossentropy": 2.126410482823849, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.210067143663764, |
| "step": 8910 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 37.5, |
| "grad_norm_var": 7.870572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4678, |
| "loss/crossentropy": 2.1258621901273727, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19669083636254073, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.22325, |
| "grad_norm": 30.0, |
| "grad_norm_var": 8.908268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5039, |
| "loss/crossentropy": 2.1579898312687873, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18557702358812095, |
| "step": 8930 |
| }, |
| { |
| "epoch": 0.2235, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.448372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5477, |
| "loss/crossentropy": 2.2186635404825212, |
| "loss/hidden": 3.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19810736775398255, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.22375, |
| "grad_norm": 36.25, |
| "grad_norm_var": 7.304166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5522, |
| "loss/crossentropy": 2.1860512644052505, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18810464218258857, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 35.75, |
| "grad_norm_var": 4.498372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5105, |
| "loss/crossentropy": 2.244818753004074, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19993619099259377, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.22425, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.34375, |
| "learning_rate": 0.0001, |
| "loss": 7.6077, |
| "loss/crossentropy": 2.1209320187568665, |
| "loss/hidden": 3.47421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2167982429265976, |
| "step": 8970 |
| }, |
| { |
| "epoch": 0.2245, |
| "grad_norm": 31.0, |
| "grad_norm_var": 18.992122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5306, |
| "loss/crossentropy": 2.008339713513851, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19748742282390594, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.22475, |
| "grad_norm": 29.5, |
| "grad_norm_var": 20.906705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5837, |
| "loss/crossentropy": 2.245701877772808, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19117865581065416, |
| "step": 8990 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.1119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5251, |
| "loss/crossentropy": 2.050145834684372, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412143267691134, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.22525, |
| "grad_norm": 32.75, |
| "grad_norm_var": 7.176822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5851, |
| "loss/crossentropy": 2.190397572517395, |
| "loss/hidden": 3.349609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20755842179059983, |
| "step": 9010 |
| }, |
| { |
| "epoch": 0.2255, |
| "grad_norm": 30.75, |
| "grad_norm_var": 8.977083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4264, |
| "loss/crossentropy": 2.284222900867462, |
| "loss/hidden": 3.332421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21356079690158367, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.22575, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.0639973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5988, |
| "loss/crossentropy": 2.1513551443815233, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19452885985374452, |
| "step": 9030 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.7830729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4915, |
| "loss/crossentropy": 2.2022497206926346, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1973773717880249, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.22625, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.584830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5443, |
| "loss/crossentropy": 2.1286503240466117, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18610758930444718, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.2265, |
| "grad_norm": 27.75, |
| "grad_norm_var": 3.345572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5659, |
| "loss/crossentropy": 2.1127908319234847, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19702097605913876, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.22675, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.716080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5575, |
| "loss/crossentropy": 2.0748134687542916, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19112884253263474, |
| "step": 9070 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.0764973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6302, |
| "loss/crossentropy": 2.1477744698524477, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.198454162850976, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.22725, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.3197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4827, |
| "loss/crossentropy": 2.1293207883834837, |
| "loss/hidden": 3.55625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21182843409478663, |
| "step": 9090 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.1333333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5027, |
| "loss/crossentropy": 2.145271519571543, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18950196234509348, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.22775, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.9337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5571, |
| "loss/crossentropy": 2.1665233090519904, |
| "loss/hidden": 3.48046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20093025229871272, |
| "step": 9110 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9718098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4992, |
| "loss/crossentropy": 2.119904951751232, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19110188409686088, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.22825, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.21875, |
| "learning_rate": 0.0001, |
| "loss": 7.5861, |
| "loss/crossentropy": 2.169039398431778, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18969318978488445, |
| "step": 9130 |
| }, |
| { |
| "epoch": 0.2285, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.0134765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4457, |
| "loss/crossentropy": 2.202808904647827, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21587605867534876, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.22875, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.9014973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5869, |
| "loss/crossentropy": 2.188230502605438, |
| "loss/hidden": 3.32578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20162757411599158, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.6268229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5041, |
| "loss/crossentropy": 2.166210785508156, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19896902665495872, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.22925, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.3582682291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4795, |
| "loss/crossentropy": 2.1638981848955154, |
| "loss/hidden": 3.28046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18999446034431458, |
| "step": 9170 |
| }, |
| { |
| "epoch": 0.2295, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.6177083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4529, |
| "loss/crossentropy": 2.170040412247181, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22052502054721118, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.22975, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.2400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4625, |
| "loss/crossentropy": 2.114721930027008, |
| "loss/hidden": 3.40859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19851209484040738, |
| "step": 9190 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.2457682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.1541184708476067, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1960083631798625, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.23025, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.676041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4881, |
| "loss/crossentropy": 2.1309454582631586, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19452987853437662, |
| "step": 9210 |
| }, |
| { |
| "epoch": 0.2305, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.671875, |
| "learning_rate": 0.0001, |
| "loss": 7.4938, |
| "loss/crossentropy": 2.18781658411026, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2072685670107603, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.23075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.2556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4355, |
| "loss/crossentropy": 2.1591438576579094, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18837961312383414, |
| "step": 9230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.2087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4499, |
| "loss/crossentropy": 2.133663722872734, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18666253574192523, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.23125, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.1806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5916, |
| "loss/crossentropy": 2.12740980386734, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19293057937175034, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.2315, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.1947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5307, |
| "loss/crossentropy": 2.1379271537065505, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19501187074929477, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.23175, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.8264973958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4995, |
| "loss/crossentropy": 2.220619598031044, |
| "loss/hidden": 3.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19177700616419316, |
| "step": 9270 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.855208333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.461, |
| "loss/crossentropy": 2.1386048540472986, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19117407780140638, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.23225, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.1122395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5308, |
| "loss/crossentropy": 2.201369822025299, |
| "loss/hidden": 3.293359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18732962422072888, |
| "step": 9290 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.4559895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4868, |
| "loss/crossentropy": 2.172528564929962, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19425926376134156, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.23275, |
| "grad_norm": 27.5, |
| "grad_norm_var": 2.83515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4503, |
| "loss/crossentropy": 2.021423862874508, |
| "loss/hidden": 3.48125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20009233951568603, |
| "step": 9310 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.4125, |
| "learning_rate": 0.0001, |
| "loss": 7.5898, |
| "loss/crossentropy": 2.197957542538643, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21913623586297035, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.23325, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.70390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5196, |
| "loss/crossentropy": 2.00966841802001, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21357689071446656, |
| "step": 9330 |
| }, |
| { |
| "epoch": 0.2335, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.5629557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5156, |
| "loss/crossentropy": 2.200144296884537, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19718017429113388, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.23375, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.6541666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4899, |
| "loss/crossentropy": 1.991941288113594, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1878137281164527, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.6934895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5461, |
| "loss/crossentropy": 2.0645473077893257, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20249587260186672, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.23425, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.3462890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4449, |
| "loss/crossentropy": 2.203160837292671, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20446450710296632, |
| "step": 9370 |
| }, |
| { |
| "epoch": 0.2345, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.572330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.451, |
| "loss/crossentropy": 2.1672566562891005, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22198531460016965, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.23475, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6011, |
| "loss/crossentropy": 2.0229784891009333, |
| "loss/hidden": 3.34609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18394104316830634, |
| "step": 9390 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.330208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4942, |
| "loss/crossentropy": 2.1677876338362694, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22315683960914612, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.23525, |
| "grad_norm": 31.375, |
| "grad_norm_var": 19.045572916666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5569, |
| "loss/crossentropy": 2.2457904130220414, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20503853298723698, |
| "step": 9410 |
| }, |
| { |
| "epoch": 0.2355, |
| "grad_norm": 31.0, |
| "grad_norm_var": 18.926822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4461, |
| "loss/crossentropy": 2.142396827042103, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1910883378237486, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.23575, |
| "grad_norm": 31.25, |
| "grad_norm_var": 8.741666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.475, |
| "loss/crossentropy": 2.063981272280216, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20397737752646208, |
| "step": 9430 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 30.0, |
| "grad_norm_var": 9.062434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4995, |
| "loss/crossentropy": 1.9866131611168385, |
| "loss/hidden": 3.516015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19697492588311433, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.23625, |
| "grad_norm": 40.25, |
| "grad_norm_var": 12.64140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5931, |
| "loss/crossentropy": 2.168704579770565, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19889446310698985, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.2365, |
| "grad_norm": 29.25, |
| "grad_norm_var": 51.16223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4255, |
| "loss/crossentropy": 2.241490375995636, |
| "loss/hidden": 3.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19278081078082324, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.23675, |
| "grad_norm": 31.125, |
| "grad_norm_var": 49.08118489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.44, |
| "loss/crossentropy": 2.0530834168195726, |
| "loss/hidden": 3.3125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1845833698287606, |
| "step": 9470 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 35.25, |
| "grad_norm_var": 5.190559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4893, |
| "loss/crossentropy": 2.1680017322301866, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19126822520047426, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.23725, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.1728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5044, |
| "loss/crossentropy": 2.1760893553495406, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1902306279167533, |
| "step": 9490 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 30.5, |
| "grad_norm_var": 7.610872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5567, |
| "loss/crossentropy": 2.169127979874611, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897792138159275, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.23775, |
| "grad_norm": 34.25, |
| "grad_norm_var": 31.978059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5548, |
| "loss/crossentropy": 2.0474624037742615, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897742820903659, |
| "step": 9510 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 28.875, |
| "grad_norm_var": 33.4025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.46, |
| "loss/crossentropy": 2.167206625640392, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1971887955442071, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.23825, |
| "grad_norm": 38.0, |
| "grad_norm_var": 10.677018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4721, |
| "loss/crossentropy": 2.160684567689896, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2026047335937619, |
| "step": 9530 |
| }, |
| { |
| "epoch": 0.2385, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.915559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4439, |
| "loss/crossentropy": 2.101367045938969, |
| "loss/hidden": 3.316796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18591827657073737, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.23875, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.285416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6017, |
| "loss/crossentropy": 2.144346782565117, |
| "loss/hidden": 3.458203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22905603051185608, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.608333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.391, |
| "loss/crossentropy": 2.2163674563169478, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20480377711355685, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.23925, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.554166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4754, |
| "loss/crossentropy": 2.05806076079607, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1894669458270073, |
| "step": 9570 |
| }, |
| { |
| "epoch": 0.2395, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.116666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.64, |
| "loss/crossentropy": 2.2116071820259093, |
| "loss/hidden": 3.38046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20237535443156957, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.23975, |
| "grad_norm": 27.625, |
| "grad_norm_var": 6.3166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4412, |
| "loss/crossentropy": 2.0997579991817474, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18707791212946176, |
| "step": 9590 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 31.125, |
| "grad_norm_var": 4.391666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.1704364523291586, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20182078052312136, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.24025, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.959375, |
| "learning_rate": 0.0001, |
| "loss": 7.5494, |
| "loss/crossentropy": 2.1063621580600738, |
| "loss/hidden": 3.4671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2080043438822031, |
| "step": 9610 |
| }, |
| { |
| "epoch": 0.2405, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.5872395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4507, |
| "loss/crossentropy": 2.1176367297768595, |
| "loss/hidden": 3.31015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19296143669635057, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.24075, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.3082682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5035, |
| "loss/crossentropy": 2.216022199392319, |
| "loss/hidden": 3.486328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2059534139931202, |
| "step": 9630 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 31.5, |
| "grad_norm_var": 10.676041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5155, |
| "loss/crossentropy": 2.209742319583893, |
| "loss/hidden": 3.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19856051169335842, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.24125, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.10390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4983, |
| "loss/crossentropy": 2.040982872247696, |
| "loss/hidden": 3.367578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19508841075003147, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.2415, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.1697916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4469, |
| "loss/crossentropy": 2.1563887119293215, |
| "loss/hidden": 3.2921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18813634980469943, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.24175, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.5768229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5616, |
| "loss/crossentropy": 2.056604099273682, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19395184125751258, |
| "step": 9670 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.067643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4244, |
| "loss/crossentropy": 2.174482125043869, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20453137308359146, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.24225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.905989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5003, |
| "loss/crossentropy": 2.119745084643364, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20908331871032715, |
| "step": 9690 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.4785807291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3362, |
| "loss/crossentropy": 2.122311297059059, |
| "loss/hidden": 3.3046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18319036178290843, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.24275, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.9541666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4396, |
| "loss/crossentropy": 1.9478351414203643, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20451803579926492, |
| "step": 9710 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.3997395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4967, |
| "loss/crossentropy": 2.0708641201257705, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19053229298442603, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.24325, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.635872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5016, |
| "loss/crossentropy": 2.0913224294781685, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1940496889874339, |
| "step": 9730 |
| }, |
| { |
| "epoch": 0.2435, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.3291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5077, |
| "loss/crossentropy": 2.221079145371914, |
| "loss/hidden": 3.315625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1924398820847273, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.24375, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.56875, |
| "learning_rate": 0.0001, |
| "loss": 7.4463, |
| "loss/crossentropy": 2.148006671667099, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20756886284798384, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 30.25, |
| "grad_norm_var": 8.2978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4544, |
| "loss/crossentropy": 2.2184116363525392, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19135836903005837, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.24425, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.690625, |
| "learning_rate": 0.0001, |
| "loss": 7.4095, |
| "loss/crossentropy": 2.1005363285541536, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19765366949141025, |
| "step": 9770 |
| }, |
| { |
| "epoch": 0.2445, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.5327473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5002, |
| "loss/crossentropy": 2.167180609703064, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19258719477802516, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.24475, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.4697916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3885, |
| "loss/crossentropy": 1.9792610332369804, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18384026363492012, |
| "step": 9790 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.5447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5547, |
| "loss/crossentropy": 2.0596527442336083, |
| "loss/hidden": 3.569921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20697965249419212, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.24525, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.042122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4887, |
| "loss/crossentropy": 2.132730546593666, |
| "loss/hidden": 3.498828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19533955454826354, |
| "step": 9810 |
| }, |
| { |
| "epoch": 0.2455, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.31015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5278, |
| "loss/crossentropy": 2.151031643152237, |
| "loss/hidden": 3.450390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19322178810834884, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.24575, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.6059895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5534, |
| "loss/crossentropy": 2.143332117795944, |
| "loss/hidden": 3.337109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897228866815567, |
| "step": 9830 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.4395182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5657, |
| "loss/crossentropy": 2.1097486332058906, |
| "loss/hidden": 3.45390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20945656076073646, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.24625, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.9452473958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5013, |
| "loss/crossentropy": 2.065919445455074, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18999069584533573, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.2465, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.2322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5501, |
| "loss/crossentropy": 2.183481493592262, |
| "loss/hidden": 3.504296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21892244052141904, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.24675, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.4926432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.478, |
| "loss/crossentropy": 2.1966081470251084, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20018710754811764, |
| "step": 9870 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.2244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5127, |
| "loss/crossentropy": 1.9914133831858636, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19236233234405517, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.24725, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.8900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5007, |
| "loss/crossentropy": 2.1605743557214736, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1983661765232682, |
| "step": 9890 |
| }, |
| { |
| "epoch": 0.2475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.0666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5307, |
| "loss/crossentropy": 2.0920628547668456, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19224398005753757, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.24775, |
| "grad_norm": 29.75, |
| "grad_norm_var": 28.910416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4845, |
| "loss/crossentropy": 2.13549183011055, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2024377616122365, |
| "step": 9910 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.2122395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5567, |
| "loss/crossentropy": 2.077779620885849, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1914847193285823, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.24825, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.3416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6034, |
| "loss/crossentropy": 2.24170383810997, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21483709737658502, |
| "step": 9930 |
| }, |
| { |
| "epoch": 0.2485, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.9525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5277, |
| "loss/crossentropy": 2.0574432730674745, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19156391881406307, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.24875, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.6181640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4432, |
| "loss/crossentropy": 2.231716367602348, |
| "loss/hidden": 3.271484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19992320109158754, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.496809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5233, |
| "loss/crossentropy": 2.2805121034383773, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21222201287746428, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.24925, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.6143229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5022, |
| "loss/crossentropy": 2.1643686085939406, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2017618851736188, |
| "step": 9970 |
| }, |
| { |
| "epoch": 0.2495, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.9947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5351, |
| "loss/crossentropy": 2.052297804504633, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19462402295321227, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.24975, |
| "grad_norm": 28.625, |
| "grad_norm_var": 1.8958333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4501, |
| "loss/crossentropy": 2.1721100926399233, |
| "loss/hidden": 3.3234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20133505929261447, |
| "step": 9990 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 28.0, |
| "grad_norm_var": 2.2645833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5168, |
| "loss/crossentropy": 2.0236786626279355, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18019386120140551, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8575100320088064e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|