| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 2000, |
| "global_step": 4000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0005, |
| "grad_norm": 30.875, |
| "learning_rate": 0.0001, |
| "loss": 7.1506, |
| "loss/crossentropy": 1.9750229328870774, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18868114035576583, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.266, |
| "loss/crossentropy": 1.915299428999424, |
| "loss/hidden": 3.368359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19173294119536877, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 31.625, |
| "grad_norm_var": 35.572330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.1477, |
| "loss/crossentropy": 1.845322072505951, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1835887383669615, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 30.25, |
| "grad_norm_var": 5.803580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.125, |
| "loss/crossentropy": 1.8556978717446326, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22780380193144084, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 39.5, |
| "grad_norm_var": 6.737239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2665, |
| "loss/crossentropy": 2.051687541604042, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21537381634116173, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 36.5, |
| "grad_norm_var": 11.058333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.2095, |
| "loss/crossentropy": 1.9898784533143044, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19060547631233932, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 27.0, |
| "grad_norm_var": 6.45390625, |
| "learning_rate": 0.0001, |
| "loss": 7.2606, |
| "loss/crossentropy": 1.8448080085217953, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18068002099171282, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 38.75, |
| "grad_norm_var": 1.3401023445121106e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4871, |
| "loss/crossentropy": 2.0318232350051404, |
| "loss/hidden": 3.733984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.337183965742588, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 35.25, |
| "grad_norm_var": 1.3401023442516444e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1923, |
| "loss/crossentropy": 1.7826939225196838, |
| "loss/hidden": 3.587890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2118432404473424, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.7309895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2487, |
| "loss/crossentropy": 1.88408655077219, |
| "loss/hidden": 3.48671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1903762748464942, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.268489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.1643, |
| "loss/crossentropy": 1.83259879052639, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19554968569427728, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.548958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.1535, |
| "loss/crossentropy": 1.8173740945756436, |
| "loss/hidden": 3.34609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17036083210259675, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.220572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.2113, |
| "loss/crossentropy": 1.8991591855883598, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20231554415076972, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 120.0, |
| "grad_norm_var": 494.52890625, |
| "learning_rate": 0.0001, |
| "loss": 7.1589, |
| "loss/crossentropy": 1.9234379842877387, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19592595770955085, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 30.375, |
| "grad_norm_var": 496.27265625, |
| "learning_rate": 0.0001, |
| "loss": 7.1392, |
| "loss/crossentropy": 1.7669467806816102, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1691664818674326, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 35.25, |
| "grad_norm_var": 202.11354166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2551, |
| "loss/crossentropy": 1.979496531933546, |
| "loss/hidden": 3.51484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2397671105340123, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 29.75, |
| "grad_norm_var": 41.73118489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.0709, |
| "loss/crossentropy": 1.6596970088779925, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1801933947019279, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.1510416666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.1329, |
| "loss/crossentropy": 1.8317318260669708, |
| "loss/hidden": 3.470703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027322521433234, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.034375, |
| "learning_rate": 0.0001, |
| "loss": 7.2704, |
| "loss/crossentropy": 1.7871993221342564, |
| "loss/hidden": 3.3296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17167234625667332, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.4218098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.2074, |
| "loss/crossentropy": 1.9208836354315282, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18774686167016624, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 29.75, |
| "grad_norm_var": 5.548958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2446, |
| "loss/crossentropy": 1.8792764976620675, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19080359637737274, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 32.25, |
| "grad_norm_var": 11.7619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.2031, |
| "loss/crossentropy": 1.926865078508854, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19636590238660573, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.170247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.0576, |
| "loss/crossentropy": 1.8266212515532971, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18201391287148, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.81015625, |
| "learning_rate": 0.0001, |
| "loss": 7.1432, |
| "loss/crossentropy": 1.8445213377475738, |
| "loss/hidden": 3.34140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18868241235613822, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.9625138843884541e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0655, |
| "loss/crossentropy": 1.8239912115037442, |
| "loss/hidden": 3.298828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17756748497486113, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.56640625, |
| "learning_rate": 0.0001, |
| "loss": 7.1575, |
| "loss/crossentropy": 1.7626003332436084, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18398213125765323, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.1129557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.1441, |
| "loss/crossentropy": 1.7845010846853255, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18147525601089, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.9822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.1286, |
| "loss/crossentropy": 1.8358447797596456, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17241306640207768, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 33.0, |
| "grad_norm_var": 10.982291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.1123, |
| "loss/crossentropy": 1.843992917239666, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19916406068950893, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.6176432291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9761, |
| "loss/crossentropy": 1.710184234380722, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904242929071188, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.4795028269701094e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1128, |
| "loss/crossentropy": 1.783938717842102, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19371993821114303, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 27.375, |
| "grad_norm_var": 9.558072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.1587, |
| "loss/crossentropy": 1.799688772857189, |
| "loss/hidden": 3.35078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18227657950483261, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 30.75, |
| "grad_norm_var": 5.827235584899985e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.1719, |
| "loss/crossentropy": 1.8475290067493915, |
| "loss/hidden": 3.490234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20651640743017197, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.0473683707078467e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.2024, |
| "loss/crossentropy": 1.7877734430134296, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17529369578696788, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 1.0473683706481477e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0127, |
| "loss/crossentropy": 1.8476789727807046, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18340907394886016, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 31.5, |
| "grad_norm_var": 4.201822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0837, |
| "loss/crossentropy": 1.9127952009439468, |
| "loss/hidden": 3.274609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18515819907188416, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.4580729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1494, |
| "loss/crossentropy": 1.7446002267301082, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18972037807106973, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.0712890625, |
| "learning_rate": 0.0001, |
| "loss": 6.9798, |
| "loss/crossentropy": 1.6596938122063876, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16941323587670923, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.8014398298089062e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1659, |
| "loss/crossentropy": 1.8092470526695252, |
| "loss/hidden": 3.278515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16989028006792067, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.801439829596395e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1246, |
| "loss/crossentropy": 1.803744176030159, |
| "loss/hidden": 3.365625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19061805782839655, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.1895833333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.8644, |
| "loss/crossentropy": 1.711807917803526, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17410435527563095, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 28.75, |
| "grad_norm_var": 1.0518229166666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9733, |
| "loss/crossentropy": 1.9412737876176833, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1845760691910982, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.36875, |
| "learning_rate": 0.0001, |
| "loss": 7.0425, |
| "loss/crossentropy": 1.6975354842841626, |
| "loss/hidden": 3.30703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17426773644983767, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 28.875, |
| "grad_norm_var": 4.533072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0644, |
| "loss/crossentropy": 1.8431582309305667, |
| "loss/hidden": 3.309765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19988675275817513, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 28.5, |
| "grad_norm_var": 4.65, |
| "learning_rate": 0.0001, |
| "loss": 7.1091, |
| "loss/crossentropy": 1.845390348136425, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18364266194403173, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.459375, |
| "learning_rate": 0.0001, |
| "loss": 7.0581, |
| "loss/crossentropy": 1.7513741821050643, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.186102606728673, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 27.375, |
| "grad_norm_var": 4.786458333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9763, |
| "loss/crossentropy": 1.779174941033125, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17763521214947103, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.1, |
| "learning_rate": 0.0001, |
| "loss": 6.9638, |
| "loss/crossentropy": 1.7178381219506265, |
| "loss/hidden": 3.36484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17294319327920676, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.40625, |
| "learning_rate": 0.0001, |
| "loss": 6.9397, |
| "loss/crossentropy": 1.8609587274491788, |
| "loss/hidden": 3.309765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1921778223477304, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 30.125, |
| "grad_norm_var": 7.0625, |
| "learning_rate": 0.0001, |
| "loss": 7.1176, |
| "loss/crossentropy": 1.8291713461279868, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18730791788548232, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 30.375, |
| "grad_norm_var": 6.520572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.097, |
| "loss/crossentropy": 1.6978721603751183, |
| "loss/hidden": 3.354296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16910959454253316, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 31.5, |
| "grad_norm_var": 5.492708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.1184, |
| "loss/crossentropy": 1.7646001767367125, |
| "loss/hidden": 3.49609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18606224549002945, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.2478515625, |
| "learning_rate": 0.0001, |
| "loss": 6.9289, |
| "loss/crossentropy": 1.7254683546721936, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19350956091657281, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 28.5, |
| "grad_norm_var": 3.2426432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0072, |
| "loss/crossentropy": 1.8291743457317353, |
| "loss/hidden": 3.2703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17015220914036036, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 29.375, |
| "grad_norm_var": 6.1978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.0714, |
| "loss/crossentropy": 1.7038650900125503, |
| "loss/hidden": 3.35546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17573642041534185, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 28.875, |
| "grad_norm_var": 5.530143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.0376, |
| "loss/crossentropy": 2.000048974901438, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20670556500554085, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 30.125, |
| "grad_norm_var": 37.509830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.0782, |
| "loss/crossentropy": 1.7484589993953705, |
| "loss/hidden": 3.4453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20398099757730961, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 30.75, |
| "grad_norm_var": 37.80930989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.1094, |
| "loss/crossentropy": 1.747946521639824, |
| "loss/hidden": 3.325, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1723929913714528, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9410807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0532, |
| "loss/crossentropy": 1.714518916606903, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17450172062963248, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 31.375, |
| "grad_norm_var": 6.620995009586922e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.2589, |
| "loss/crossentropy": 1.7456246592104434, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18539317091926932, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 31.625, |
| "grad_norm_var": 6.620995011655063e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.1014, |
| "loss/crossentropy": 1.6763587422668933, |
| "loss/hidden": 3.4015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1931827544234693, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 31.5, |
| "grad_norm_var": 4.528125, |
| "learning_rate": 0.0001, |
| "loss": 7.115, |
| "loss/crossentropy": 1.849663856625557, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21164124589413405, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.027083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.1975, |
| "loss/crossentropy": 1.765239630639553, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18264974560588598, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.428580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.1206, |
| "loss/crossentropy": 1.8783695727586747, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18768006665632128, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.9385416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1671, |
| "loss/crossentropy": 1.8120282679796218, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21209220625460148, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.77265625, |
| "learning_rate": 0.0001, |
| "loss": 7.0683, |
| "loss/crossentropy": 1.6486516989767552, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17768741883337497, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 28.5, |
| "grad_norm_var": 1.9622395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0341, |
| "loss/crossentropy": 1.5188174404203891, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17400255370885134, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.075, |
| "learning_rate": 0.0001, |
| "loss": 7.0187, |
| "loss/crossentropy": 1.7111039966344834, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20188356712460517, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.5458333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.1392, |
| "loss/crossentropy": 1.7463210627436638, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18064118530601264, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.6020833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0488, |
| "loss/crossentropy": 1.913002396374941, |
| "loss/hidden": 3.248046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17795131383463741, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 3674210304.0, |
| "grad_norm_var": 2.2729279965717071e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1836, |
| "loss/crossentropy": 1.7232265777885913, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19430895978584886, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 29.125, |
| "grad_norm_var": 8.437388195823355e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9841, |
| "loss/crossentropy": 1.8030119113624097, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18302876157686115, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.85, |
| "learning_rate": 0.0001, |
| "loss": 6.9804, |
| "loss/crossentropy": 1.9009442821145057, |
| "loss/hidden": 3.266796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16866004383191466, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 30.0, |
| "grad_norm_var": 9.339322916666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9876, |
| "loss/crossentropy": 1.6418433368206025, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.191958365496248, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 30.875, |
| "grad_norm_var": 7.639322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0538, |
| "loss/crossentropy": 1.853764034062624, |
| "loss/hidden": 3.32578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17473467853851615, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.0613932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1458, |
| "loss/crossentropy": 1.8514880582690239, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19726306498050689, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 28.875, |
| "grad_norm_var": 1.7997395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.0766, |
| "loss/crossentropy": 1.8405121728777885, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19442977402359246, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.3802083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.0214, |
| "loss/crossentropy": 1.9466332450509072, |
| "loss/hidden": 3.289453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.170109105668962, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.6124348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.1306, |
| "loss/crossentropy": 1.8399325378239155, |
| "loss/hidden": 3.46015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20626397961750625, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.6559895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.1375, |
| "loss/crossentropy": 1.9278223380446433, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2024382423609495, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 27.5, |
| "grad_norm_var": 16.089322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0363, |
| "loss/crossentropy": 1.859210267663002, |
| "loss/hidden": 3.345703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18585832975804806, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 28.25, |
| "grad_norm_var": 38.77265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9378, |
| "loss/crossentropy": 1.8994540706276895, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018324811011553, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 32.0, |
| "grad_norm_var": 38.8375, |
| "learning_rate": 0.0001, |
| "loss": 7.002, |
| "loss/crossentropy": 1.8244094364345074, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20930232629179954, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.0634765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9688, |
| "loss/crossentropy": 1.8976417139172554, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1871755332686007, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 50.75, |
| "grad_norm_var": 28.351497395833334, |
| "learning_rate": 0.0001, |
| "loss": 6.992, |
| "loss/crossentropy": 1.899886740744114, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18904313631355762, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 29.0, |
| "grad_norm_var": 27.3056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.0939, |
| "loss/crossentropy": 1.8286892741918563, |
| "loss/hidden": 3.362109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18909739144146442, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 28.375, |
| "grad_norm_var": 1.3247395833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9381, |
| "loss/crossentropy": 1.9782623961567878, |
| "loss/hidden": 3.305859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1766037069261074, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.1988932291666665, |
| "learning_rate": 0.0001, |
| "loss": 6.8414, |
| "loss/crossentropy": 1.8968854755163194, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20138736004009844, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.92890625, |
| "learning_rate": 0.0001, |
| "loss": 7.1271, |
| "loss/crossentropy": 1.8630956932902336, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21029497124254704, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.037239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0435, |
| "loss/crossentropy": 1.8676601111888886, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19789310321211814, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.2265225949129395e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.1233, |
| "loss/crossentropy": 1.8434145867824554, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18832013495266436, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 29.375, |
| "grad_norm_var": 4.2265225969445555e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.8733, |
| "loss/crossentropy": 1.81582195982337, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18773540575057268, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.476822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0752, |
| "loss/crossentropy": 1.8667447365820409, |
| "loss/hidden": 3.336328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18054497512057424, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.144205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.0032, |
| "loss/crossentropy": 1.8144822165369987, |
| "loss/hidden": 3.271484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1632128401659429, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.01875, |
| "learning_rate": 0.0001, |
| "loss": 6.8626, |
| "loss/crossentropy": 1.8152224607765675, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18933067489415406, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 37.0, |
| "grad_norm_var": 7.297916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0437, |
| "loss/crossentropy": 1.6399064034223556, |
| "loss/hidden": 3.39140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18825935963541268, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.739583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0331, |
| "loss/crossentropy": 1.6737658925354482, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17548465421423315, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 30.0, |
| "grad_norm_var": 18.1541015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9385, |
| "loss/crossentropy": 1.8608146458864212, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19196428768336773, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.003125, |
| "learning_rate": 0.0001, |
| "loss": 7.0686, |
| "loss/crossentropy": 1.8301926247775555, |
| "loss/hidden": 3.347265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18049606634303927, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.0473683721235639e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0193, |
| "loss/crossentropy": 1.7465273767709732, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17173261381685734, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 29.75, |
| "grad_norm_var": 22.408268229166666, |
| "learning_rate": 0.0001, |
| "loss": 6.9709, |
| "loss/crossentropy": 1.7683202728629113, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.210743809863925, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 28.625, |
| "grad_norm_var": 2.371875, |
| "learning_rate": 0.0001, |
| "loss": 7.0597, |
| "loss/crossentropy": 2.046058624982834, |
| "loss/hidden": 3.3375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18963768277317286, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.3184895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.0245, |
| "loss/crossentropy": 1.745854178071022, |
| "loss/hidden": 3.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17351055853068828, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 34.75, |
| "grad_norm_var": 2.8108723958333335, |
| "learning_rate": 0.0001, |
| "loss": 6.9474, |
| "loss/crossentropy": 1.8277953140437604, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16915141120553018, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.39765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9366, |
| "loss/crossentropy": 1.9404960587620734, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18970660548657178, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 35.75, |
| "grad_norm_var": 1.1892317588406927e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0954, |
| "loss/crossentropy": 1.8612810902297496, |
| "loss/hidden": 3.31171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17269262354820966, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.1892317588497805e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0259, |
| "loss/crossentropy": 1.743497943878174, |
| "loss/hidden": 3.2609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1666251303628087, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.903059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.0055, |
| "loss/crossentropy": 1.9657445706427097, |
| "loss/hidden": 3.32734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18259168425574898, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 30.25, |
| "grad_norm_var": 51.16015625, |
| "learning_rate": 0.0001, |
| "loss": 7.1126, |
| "loss/crossentropy": 2.0204195216298104, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20481194872409106, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.90390625, |
| "learning_rate": 0.0001, |
| "loss": 7.0413, |
| "loss/crossentropy": 1.589720468968153, |
| "loss/hidden": 3.275, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18000307623296977, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.2613932291666665, |
| "learning_rate": 0.0001, |
| "loss": 6.9722, |
| "loss/crossentropy": 1.7191244810819626, |
| "loss/hidden": 3.45390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18164545409381389, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 28.875, |
| "grad_norm_var": 1.7520833333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9492, |
| "loss/crossentropy": 1.8928776159882545, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985262140631676, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.2447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.1367, |
| "loss/crossentropy": 1.7702923499047756, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17983693201094866, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.3080729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0322, |
| "loss/crossentropy": 1.8519952863454818, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20197003111243247, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 31.125, |
| "grad_norm_var": 3.1962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.0557, |
| "loss/crossentropy": 1.8624355979263783, |
| "loss/hidden": 3.526953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20604186709970235, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 28.5, |
| "grad_norm_var": 22.8462890625, |
| "learning_rate": 0.0001, |
| "loss": 6.9562, |
| "loss/crossentropy": 1.8102556586265564, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20240887869149446, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 32.25, |
| "grad_norm_var": 23.950455729166666, |
| "learning_rate": 0.0001, |
| "loss": 6.9857, |
| "loss/crossentropy": 1.8860370084643363, |
| "loss/hidden": 3.339453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18206186592578888, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.6518229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.056, |
| "loss/crossentropy": 1.9338740326464177, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22607974465936423, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 29.5, |
| "grad_norm_var": 11.267708333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.931, |
| "loss/crossentropy": 1.9357615426182746, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852928228676319, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 39.25, |
| "grad_norm_var": 1.2635411532464435e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0138, |
| "loss/crossentropy": 1.669256182014942, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1792891369201243, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.2555340145024479e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.003, |
| "loss/crossentropy": 1.8537344850599766, |
| "loss/hidden": 3.6078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18713028654456138, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.1529214881025404e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9982, |
| "loss/crossentropy": 1.8868144243955611, |
| "loss/hidden": 3.259375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16826356202363968, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 38.0, |
| "grad_norm_var": 11.041080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.1145, |
| "loss/crossentropy": 1.7373395457863807, |
| "loss/hidden": 3.26328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16631986051797867, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.718489583333334, |
| "learning_rate": 0.0001, |
| "loss": 6.8881, |
| "loss/crossentropy": 1.610298927500844, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1909397032111883, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.344205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.0797, |
| "loss/crossentropy": 1.7361410059034825, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18541559688746928, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 27.875, |
| "grad_norm_var": 3.3889973958333335, |
| "learning_rate": 0.0001, |
| "loss": 6.9329, |
| "loss/crossentropy": 1.7078735738992692, |
| "loss/hidden": 3.4015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18024133574217557, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 35.0, |
| "grad_norm_var": 6.6166015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9738, |
| "loss/crossentropy": 1.8044774197041988, |
| "loss/hidden": 3.276171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1794836211949587, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 29.375, |
| "grad_norm_var": 13.601822916666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9062, |
| "loss/crossentropy": 1.8313415050506592, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18087668968364595, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.6020182291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9407, |
| "loss/crossentropy": 1.6438103877007961, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1820345466956496, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 30.25, |
| "grad_norm_var": 1.2379557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0302, |
| "loss/crossentropy": 1.7621051207184792, |
| "loss/hidden": 3.41171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19308385904878378, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 29.375, |
| "grad_norm_var": 3.46640625, |
| "learning_rate": 0.0001, |
| "loss": 7.1178, |
| "loss/crossentropy": 1.871315811574459, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19272034596651794, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.609375, |
| "learning_rate": 0.0001, |
| "loss": 7.0298, |
| "loss/crossentropy": 1.8252998240292073, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21978344805538655, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 33.5, |
| "grad_norm_var": 1.3990009840566536e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.068, |
| "loss/crossentropy": 1.639507355540991, |
| "loss/hidden": 3.60703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18024437148123978, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 28.75, |
| "grad_norm_var": 1.3990009842291443e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9556, |
| "loss/crossentropy": 1.8158223167061807, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18003626042045653, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.21640625, |
| "learning_rate": 0.0001, |
| "loss": 6.7859, |
| "loss/crossentropy": 1.6335266396403312, |
| "loss/hidden": 3.38046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1845483684912324, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.5497395833333334, |
| "learning_rate": 0.0001, |
| "loss": 6.8607, |
| "loss/crossentropy": 1.7433619983494282, |
| "loss/hidden": 3.3484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17121702507138253, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 28.0, |
| "grad_norm_var": 4.353580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.1422, |
| "loss/crossentropy": 1.8455571182072164, |
| "loss/hidden": 3.333203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2054300512187183, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.388541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0213, |
| "loss/crossentropy": 1.8241696588695049, |
| "loss/hidden": 3.321875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985041994601487, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 31.25, |
| "grad_norm_var": 8.0431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.0, |
| "loss/crossentropy": 1.7940153643488883, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18176266234368085, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 33.0, |
| "grad_norm_var": 14.3041015625, |
| "learning_rate": 0.0001, |
| "loss": 6.898, |
| "loss/crossentropy": 1.8607503667473793, |
| "loss/hidden": 3.326953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17468307819217443, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 28.125, |
| "grad_norm_var": 13.432291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.031, |
| "loss/crossentropy": 1.6316836021840573, |
| "loss/hidden": 3.240234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15119749261066318, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 28.25, |
| "grad_norm_var": 45.9634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.1507, |
| "loss/crossentropy": 1.8821631267666816, |
| "loss/hidden": 3.465625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19027305245399476, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 28.375, |
| "grad_norm_var": 46.1884765625, |
| "learning_rate": 0.0001, |
| "loss": 7.063, |
| "loss/crossentropy": 1.6992614693939685, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2002884623594582, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 29.625, |
| "grad_norm_var": 6.732291666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9439, |
| "loss/crossentropy": 1.7733798533678056, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17554995641112328, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 30.625, |
| "grad_norm_var": 24.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.0264, |
| "loss/crossentropy": 1.8444553710520268, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1976129287853837, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 41.5, |
| "grad_norm_var": 18.2275390625, |
| "learning_rate": 0.0001, |
| "loss": 7.0056, |
| "loss/crossentropy": 1.778428715467453, |
| "loss/hidden": 3.3046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17879956895485521, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 40.75, |
| "grad_norm_var": 14.88515625, |
| "learning_rate": 0.0001, |
| "loss": 6.8647, |
| "loss/crossentropy": 1.8260969623923302, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18223165888339282, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 30.75, |
| "grad_norm_var": 12.42265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9814, |
| "loss/crossentropy": 1.852180902659893, |
| "loss/hidden": 3.188671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15915404492989182, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 32.0, |
| "grad_norm_var": 17.264518229166665, |
| "learning_rate": 0.0001, |
| "loss": 6.9467, |
| "loss/crossentropy": 1.8016018435359, |
| "loss/hidden": 3.30234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17374343778938056, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 27.75, |
| "grad_norm_var": 16.795572916666668, |
| "learning_rate": 0.0001, |
| "loss": 6.9688, |
| "loss/crossentropy": 1.7803546212613583, |
| "loss/hidden": 3.230078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1623454326763749, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 27.125, |
| "grad_norm_var": 11.0072265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9148, |
| "loss/crossentropy": 1.7990518882870674, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1776049867272377, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 28.875, |
| "grad_norm_var": 9.0009765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9834, |
| "loss/crossentropy": 1.7659361466765404, |
| "loss/hidden": 3.229296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17018448635935784, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 28.75, |
| "grad_norm_var": 5.566666666666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9513, |
| "loss/crossentropy": 1.948898734152317, |
| "loss/hidden": 3.368359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20332392100244762, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 37.0, |
| "grad_norm_var": 12.0337890625, |
| "learning_rate": 0.0001, |
| "loss": 6.9845, |
| "loss/crossentropy": 1.897236557304859, |
| "loss/hidden": 3.305078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1786106862127781, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 30.75, |
| "grad_norm_var": 10.74140625, |
| "learning_rate": 0.0001, |
| "loss": 6.9651, |
| "loss/crossentropy": 1.668473443388939, |
| "loss/hidden": 3.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18010491924360394, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 35.0, |
| "grad_norm_var": 11.645768229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.0873, |
| "loss/crossentropy": 1.8844516187906266, |
| "loss/hidden": 3.323828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19164156243205072, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 36.5, |
| "grad_norm_var": 9.326497395833334, |
| "learning_rate": 0.0001, |
| "loss": 6.9175, |
| "loss/crossentropy": 1.7603260070085525, |
| "loss/hidden": 3.276953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17738686297088863, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 28.25, |
| "grad_norm_var": 11.4259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.0352, |
| "loss/crossentropy": 1.8728493131697177, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19688725294545292, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 29.25, |
| "grad_norm_var": 8.5375, |
| "learning_rate": 0.0001, |
| "loss": 6.955, |
| "loss/crossentropy": 1.8099886417388915, |
| "loss/hidden": 3.29375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18610341083258392, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 36.0, |
| "grad_norm_var": 19.722330729166668, |
| "learning_rate": 0.0001, |
| "loss": 6.9313, |
| "loss/crossentropy": 1.7017989411950112, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17710780492052436, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 32.25, |
| "grad_norm_var": 21.603125, |
| "learning_rate": 0.0001, |
| "loss": 7.069, |
| "loss/crossentropy": 1.7873531341552735, |
| "loss/hidden": 3.333203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1812642457894981, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.2207682291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0405, |
| "loss/crossentropy": 1.7903928458690643, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19645511778071523, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.874739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0022, |
| "loss/crossentropy": 1.6019535034894943, |
| "loss/hidden": 3.271484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1628541074693203, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 31.375, |
| "grad_norm_var": 6.37265625, |
| "learning_rate": 0.0001, |
| "loss": 6.7734, |
| "loss/crossentropy": 1.7893570616841317, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2000499103218317, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 30.5, |
| "grad_norm_var": 6.910416666666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9578, |
| "loss/crossentropy": 1.6443258710205555, |
| "loss/hidden": 3.259765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16416865289211274, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 30.5, |
| "grad_norm_var": 35.25182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0861, |
| "loss/crossentropy": 1.8358689159154893, |
| "loss/hidden": 3.28359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1853348884731531, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 30.0, |
| "grad_norm_var": 15.6587890625, |
| "learning_rate": 0.0001, |
| "loss": 6.9008, |
| "loss/crossentropy": 1.9014468491077423, |
| "loss/hidden": 3.34140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19975380562245845, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 28.25, |
| "grad_norm_var": 4.9666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.0062, |
| "loss/crossentropy": 1.7637556672096253, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19306765552610158, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 44.0, |
| "grad_norm_var": 14.08125, |
| "learning_rate": 0.0001, |
| "loss": 6.9184, |
| "loss/crossentropy": 1.7980270460247993, |
| "loss/hidden": 3.336328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17251317510381342, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 30.0, |
| "grad_norm_var": 16.656184895833334, |
| "learning_rate": 0.0001, |
| "loss": 6.8985, |
| "loss/crossentropy": 1.9003560155630113, |
| "loss/hidden": 3.336328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19372209012508393, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 28.375, |
| "grad_norm_var": 4.02265625, |
| "learning_rate": 0.0001, |
| "loss": 6.8638, |
| "loss/crossentropy": 1.7488896727561951, |
| "loss/hidden": 3.31484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16111841816455125, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 4362076160.0, |
| "grad_norm_var": 1.1892317599584748e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.061, |
| "loss/crossentropy": 1.7708093903958797, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19512660000473261, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.1892317591996554e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.8861, |
| "loss/crossentropy": 1.6944726780056953, |
| "loss/hidden": 3.333203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16455791369080544, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 29.375, |
| "grad_norm_var": 3.2905598958333333, |
| "learning_rate": 0.0001, |
| "loss": 6.8425, |
| "loss/crossentropy": 1.7352489478886128, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16651339596137404, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.81015625, |
| "learning_rate": 0.0001, |
| "loss": 6.886, |
| "loss/crossentropy": 1.775932352244854, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18791395220905543, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.9848307291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8755, |
| "loss/crossentropy": 1.700956543534994, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17034402694553136, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.0660807291666665, |
| "learning_rate": 0.0001, |
| "loss": 6.9996, |
| "loss/crossentropy": 1.6696124613285064, |
| "loss/hidden": 3.317578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17471891567111014, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.7729166666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8325, |
| "loss/crossentropy": 1.6660587199032306, |
| "loss/hidden": 3.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1662266943603754, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.6900390625, |
| "learning_rate": 0.0001, |
| "loss": 6.947, |
| "loss/crossentropy": 1.8900059774518012, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18781680446118115, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 30.0, |
| "grad_norm_var": 4.231705729166666, |
| "learning_rate": 0.0001, |
| "loss": 6.9437, |
| "loss/crossentropy": 1.8869778975844382, |
| "loss/hidden": 3.269921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17426692880690098, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.8309895833333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9652, |
| "loss/crossentropy": 1.8232818126678467, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16745625659823418, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.40390625, |
| "learning_rate": 0.0001, |
| "loss": 7.0219, |
| "loss/crossentropy": 1.8258642494678496, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19198300442658364, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 32.25, |
| "grad_norm_var": 8.268684895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8434, |
| "loss/crossentropy": 1.7024194486439228, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18930096151307224, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 31.625, |
| "grad_norm_var": 6.74765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9231, |
| "loss/crossentropy": 1.7479817308485508, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1829341644886881, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.48515625, |
| "learning_rate": 0.0001, |
| "loss": 7.0635, |
| "loss/crossentropy": 2.0127600729465485, |
| "loss/hidden": 3.2953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18128359764814378, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 31.75, |
| "grad_norm_var": 11.642708333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9505, |
| "loss/crossentropy": 1.7567149683833123, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1842447452247143, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 34.5, |
| "grad_norm_var": 1.5832967231255347e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1294, |
| "loss/crossentropy": 1.8183075070381165, |
| "loss/hidden": 3.25, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17170923966914414, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 36.0, |
| "grad_norm_var": 14.670833333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.7269, |
| "loss/crossentropy": 1.6782560005784035, |
| "loss/hidden": 3.329296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16191824562847615, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 29.5, |
| "grad_norm_var": 8.283984344848707e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9423, |
| "loss/crossentropy": 1.7822233349084855, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15704208929091693, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 27.25, |
| "grad_norm_var": 12.049739583333333, |
| "learning_rate": 0.0001, |
| "loss": 6.8598, |
| "loss/crossentropy": 1.8880347676575184, |
| "loss/hidden": 3.30546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18590961638838052, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.827351348981094e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.0671, |
| "loss/crossentropy": 1.6947499185800552, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17880834415555, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 30.875, |
| "grad_norm_var": 7.036874278235887e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.8978, |
| "loss/crossentropy": 1.6141892828047275, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18202604549005627, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 29.625, |
| "grad_norm_var": 12.239583333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9659, |
| "loss/crossentropy": 1.7211613908410073, |
| "loss/hidden": 3.29453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19102244451642036, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 28.375, |
| "grad_norm_var": 15.983268229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.8912, |
| "loss/crossentropy": 1.7675188466906548, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19818378714844584, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 32.75, |
| "grad_norm_var": 9.306266259729068e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9645, |
| "loss/crossentropy": 1.7558425486087799, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1911760584451258, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 27.625, |
| "grad_norm_var": 1.5205981735288307e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.8635, |
| "loss/crossentropy": 1.7457415886223315, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852768061682582, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 32.75, |
| "grad_norm_var": 14.7125, |
| "learning_rate": 0.0001, |
| "loss": 6.8508, |
| "loss/crossentropy": 1.683419554680586, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1731728465296328, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.0302687666727377e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0005, |
| "loss/crossentropy": 1.727415306866169, |
| "loss/hidden": 3.297265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18517111875116826, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 32.25, |
| "grad_norm_var": 22.14375, |
| "learning_rate": 0.0001, |
| "loss": 6.9138, |
| "loss/crossentropy": 1.8120180189609527, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20129222217947246, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 35.5, |
| "grad_norm_var": 8.491080729166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9525, |
| "loss/crossentropy": 1.8299045406281949, |
| "loss/hidden": 3.251171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17095453599467875, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.586458333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.7871, |
| "loss/crossentropy": 1.7243870817124844, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16602067481726407, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 29.625, |
| "grad_norm_var": 9.378125, |
| "learning_rate": 0.0001, |
| "loss": 6.855, |
| "loss/crossentropy": 1.6784847162663936, |
| "loss/hidden": 3.225, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16919725136831404, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 41.0, |
| "grad_norm_var": 112.83170572916667, |
| "learning_rate": 0.0001, |
| "loss": 6.9616, |
| "loss/crossentropy": 1.8477609053254127, |
| "loss/hidden": 3.259375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16309508439153433, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 30.0, |
| "grad_norm_var": 111.6259765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9517, |
| "loss/crossentropy": 1.7308252967894078, |
| "loss/hidden": 3.202734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1722710312344134, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 30.625, |
| "grad_norm_var": 4.073893229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9088, |
| "loss/crossentropy": 1.7544417701661588, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19881883040070533, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 38.0, |
| "grad_norm_var": 13.948958333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9474, |
| "loss/crossentropy": 1.9995075345039368, |
| "loss/hidden": 3.271875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17399701047688723, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 31.75, |
| "grad_norm_var": 21.0744140625, |
| "learning_rate": 0.0001, |
| "loss": 6.8732, |
| "loss/crossentropy": 1.8493791602551937, |
| "loss/hidden": 3.2421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16063635479658842, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 32.25, |
| "grad_norm_var": 17.897916666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9556, |
| "loss/crossentropy": 1.737601400911808, |
| "loss/hidden": 3.333984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18038861453533173, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.38515625, |
| "learning_rate": 0.0001, |
| "loss": 6.979, |
| "loss/crossentropy": 1.7256839543581008, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19299248773604633, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.4853515625, |
| "learning_rate": 0.0001, |
| "loss": 6.8191, |
| "loss/crossentropy": 1.7587849080562592, |
| "loss/hidden": 3.25, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16214433256536723, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.112239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0774, |
| "loss/crossentropy": 2.092029668390751, |
| "loss/hidden": 3.332421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19293731367215514, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.1072265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9724, |
| "loss/crossentropy": 1.7829479269683361, |
| "loss/hidden": 3.349609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19456620067358016, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 29.375, |
| "grad_norm_var": 20.4525390625, |
| "learning_rate": 0.0001, |
| "loss": 6.9908, |
| "loss/crossentropy": 1.7853210166096687, |
| "loss/hidden": 3.32265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18279874734580517, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 36.5, |
| "grad_norm_var": 20.847330729166668, |
| "learning_rate": 0.0001, |
| "loss": 6.9787, |
| "loss/crossentropy": 1.8366479635238648, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17941316729411483, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 29.25, |
| "grad_norm_var": 5.1384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.0703, |
| "loss/crossentropy": 1.8491265431046486, |
| "loss/hidden": 3.253515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1788581835106015, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 28.5, |
| "grad_norm_var": 3.8082682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.0117, |
| "loss/crossentropy": 1.8718080654740334, |
| "loss/hidden": 3.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18362828250974417, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.0541015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9147, |
| "loss/crossentropy": 1.823565386980772, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17529825307428837, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 29.875, |
| "grad_norm_var": 3.1510416666666665, |
| "learning_rate": 0.0001, |
| "loss": 6.8799, |
| "loss/crossentropy": 1.8646746143698691, |
| "loss/hidden": 3.3625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18420496406033635, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 28.0, |
| "grad_norm_var": 1.6061848958333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9741, |
| "loss/crossentropy": 1.8418309345841408, |
| "loss/hidden": 3.289453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17159662526100875, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.4184895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0042, |
| "loss/crossentropy": 1.8776386469602584, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.187642621435225, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 29.125, |
| "grad_norm_var": 8.20781018083492e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9378, |
| "loss/crossentropy": 1.655004223436117, |
| "loss/hidden": 3.273046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1580679954495281, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.468489583333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9831, |
| "loss/crossentropy": 1.792271687835455, |
| "loss/hidden": 3.277734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17089223572984338, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 34.75, |
| "grad_norm_var": 4.209375, |
| "learning_rate": 0.0001, |
| "loss": 6.8364, |
| "loss/crossentropy": 1.734425350278616, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18262410946190358, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 27.0, |
| "grad_norm_var": 4.629166666666666, |
| "learning_rate": 0.0001, |
| "loss": 6.8305, |
| "loss/crossentropy": 1.772131036967039, |
| "loss/hidden": 3.274609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1691578391008079, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 29.125, |
| "grad_norm_var": 6.303580729166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9967, |
| "loss/crossentropy": 1.9334307715296746, |
| "loss/hidden": 3.383203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19251629430800676, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 36.5, |
| "grad_norm_var": 6.4791015625, |
| "learning_rate": 0.0001, |
| "loss": 6.981, |
| "loss/crossentropy": 1.887280984222889, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21319616939872504, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 28.75, |
| "grad_norm_var": 4.7009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.0286, |
| "loss/crossentropy": 1.8285806521773338, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18080311622470618, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.09375, |
| "learning_rate": 0.0001, |
| "loss": 6.863, |
| "loss/crossentropy": 1.6441345304250716, |
| "loss/hidden": 3.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18446694109588863, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 30.125, |
| "grad_norm_var": 9.029166666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8549, |
| "loss/crossentropy": 1.5048397369682789, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16425186553969978, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.9400390625, |
| "learning_rate": 0.0001, |
| "loss": 6.9448, |
| "loss/crossentropy": 1.7213742382824422, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17190376687794923, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 29.125, |
| "grad_norm_var": 51.71608072916667, |
| "learning_rate": 0.0001, |
| "loss": 7.0456, |
| "loss/crossentropy": 1.8745042860507966, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922046933323145, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.101822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0037, |
| "loss/crossentropy": 1.835337746143341, |
| "loss/hidden": 3.291015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.172516768053174, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.792122395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8605, |
| "loss/crossentropy": 1.7886844381690026, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17270518001168966, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 30.875, |
| "grad_norm_var": 24.301041666666666, |
| "learning_rate": 0.0001, |
| "loss": 6.8857, |
| "loss/crossentropy": 1.8270663298666476, |
| "loss/hidden": 3.316796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17341279415413738, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 28.25, |
| "grad_norm_var": 23.795247395833332, |
| "learning_rate": 0.0001, |
| "loss": 6.981, |
| "loss/crossentropy": 1.7389558240771295, |
| "loss/hidden": 3.33671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20616078823804856, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.3712890625, |
| "learning_rate": 0.0001, |
| "loss": 6.9706, |
| "loss/crossentropy": 1.7505015313625336, |
| "loss/hidden": 3.2375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1691287737339735, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 29.875, |
| "grad_norm_var": 3.7864583333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.0493, |
| "loss/crossentropy": 1.8290210530161857, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17870840784162284, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 28.75, |
| "grad_norm_var": 3.06640625, |
| "learning_rate": 0.0001, |
| "loss": 6.8945, |
| "loss/crossentropy": 1.7312066838145257, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16353450021706523, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 38.75, |
| "grad_norm_var": 8.985384797395922e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.1346, |
| "loss/crossentropy": 1.8643671602010727, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18776546316221357, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 33.25, |
| "grad_norm_var": 8.985384795065637e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.0339, |
| "loss/crossentropy": 1.7668686166405678, |
| "loss/hidden": 3.4140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19443758334964514, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 30.25, |
| "grad_norm_var": 1.8852243670131978e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9626, |
| "loss/crossentropy": 1.8465783804655076, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1860800025984645, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.8852243674568678e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.8455, |
| "loss/crossentropy": 1.7212153851985932, |
| "loss/hidden": 3.36171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18209199868142606, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 28.0, |
| "grad_norm_var": 3.81015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9986, |
| "loss/crossentropy": 1.898094529658556, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.194298998080194, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 27.5, |
| "grad_norm_var": 3.332291666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8924, |
| "loss/crossentropy": 1.7420293487608434, |
| "loss/hidden": 3.2546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.161607267241925, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.7622395833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.8686, |
| "loss/crossentropy": 1.6050585605204106, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17848586086183788, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.4400390625, |
| "learning_rate": 0.0001, |
| "loss": 6.9804, |
| "loss/crossentropy": 1.9553805246949196, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19847506172955037, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.0791015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9913, |
| "loss/crossentropy": 1.4568642482161522, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15850053485482932, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 31.875, |
| "grad_norm_var": 4.3775390625, |
| "learning_rate": 0.0001, |
| "loss": 6.9326, |
| "loss/crossentropy": 1.6532236352562903, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18165745195001365, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 28.5, |
| "grad_norm_var": 4.522330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.005, |
| "loss/crossentropy": 1.6793559297919274, |
| "loss/hidden": 3.339453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17017313856631516, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.3353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.0956, |
| "loss/crossentropy": 1.8292289204895495, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18509325329214335, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1255, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.78125, |
| "learning_rate": 0.0001, |
| "loss": 6.9137, |
| "loss/crossentropy": 1.7439368188381195, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19252277240157128, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.1349774577470627e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.051, |
| "loss/crossentropy": 2.0631623208522796, |
| "loss/hidden": 3.4265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22505897115916013, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.1265, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.1349774575828206e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.1194, |
| "loss/crossentropy": 1.8867668241262436, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20316522121429442, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 28.25, |
| "grad_norm_var": 20.151822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0832, |
| "loss/crossentropy": 1.8491319343447685, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19702840279787778, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 29.0, |
| "grad_norm_var": 11.6681640625, |
| "learning_rate": 0.0001, |
| "loss": 6.9728, |
| "loss/crossentropy": 1.8162995487451554, |
| "loss/hidden": 3.366015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18736656550318004, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 34.5, |
| "grad_norm_var": 13.088997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.1137, |
| "loss/crossentropy": 2.031092081964016, |
| "loss/hidden": 3.447265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21819815230555833, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.1285, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.7945788315993818e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.0175, |
| "loss/crossentropy": 1.731457906216383, |
| "loss/hidden": 3.4015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18550403621047734, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.794578832870256e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.8552, |
| "loss/crossentropy": 1.8714622184634209, |
| "loss/hidden": 3.2890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1802680429071188, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.1295, |
| "grad_norm": 38.75, |
| "grad_norm_var": 11.655143229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9513, |
| "loss/crossentropy": 1.6536960810422898, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19141803495585918, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 30.25, |
| "grad_norm_var": 10.824934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.0451, |
| "loss/crossentropy": 1.7446824312210083, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21996904909610748, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1305, |
| "grad_norm": 32.0, |
| "grad_norm_var": 0.9895182291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9912, |
| "loss/crossentropy": 1.8711062870919704, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18015410769730805, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 29.0, |
| "grad_norm_var": 1.9697265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9974, |
| "loss/crossentropy": 1.7273207187652588, |
| "loss/hidden": 3.3171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17100013056769967, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.1315, |
| "grad_norm": 33.0, |
| "grad_norm_var": 0.9681640625, |
| "learning_rate": 0.0001, |
| "loss": 6.864, |
| "loss/crossentropy": 1.772182758897543, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18076814245432615, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 29.25, |
| "grad_norm_var": 5.707291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1259, |
| "loss/crossentropy": 1.7641409367322922, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18833348713815212, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 40.75, |
| "grad_norm_var": 10.91015625, |
| "learning_rate": 0.0001, |
| "loss": 7.0193, |
| "loss/crossentropy": 1.859598373621702, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18878742419183253, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 31.375, |
| "grad_norm_var": 18.1822265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9707, |
| "loss/crossentropy": 1.7797490507364273, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20212376527488232, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.1335, |
| "grad_norm": 29.875, |
| "grad_norm_var": 11.162239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0002, |
| "loss/crossentropy": 1.7839721478521824, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.173302289377898, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 27.125, |
| "grad_norm_var": 4.2009765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9156, |
| "loss/crossentropy": 1.7781757101416589, |
| "loss/hidden": 3.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19893121821805834, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.1345, |
| "grad_norm": 30.75, |
| "grad_norm_var": 36.837239583333336, |
| "learning_rate": 0.0001, |
| "loss": 7.0997, |
| "loss/crossentropy": 1.8467799574136734, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20002066995948553, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 28.5, |
| "grad_norm_var": 37.431705729166666, |
| "learning_rate": 0.0001, |
| "loss": 6.9236, |
| "loss/crossentropy": 1.6248198747634888, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1642201030626893, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.1355, |
| "grad_norm": 34.0, |
| "grad_norm_var": 4.030989583333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9361, |
| "loss/crossentropy": 1.7102701038122177, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16836816985160113, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 26.0, |
| "grad_norm_var": 1.0907331108694131e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9167, |
| "loss/crossentropy": 1.8059025250375271, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16677290350198745, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.1365, |
| "grad_norm": 29.5, |
| "grad_norm_var": 6.2728515625, |
| "learning_rate": 0.0001, |
| "loss": 6.8796, |
| "loss/crossentropy": 1.776158544421196, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929216692224145, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 28.75, |
| "grad_norm_var": 7.5431640625, |
| "learning_rate": 0.0001, |
| "loss": 6.8288, |
| "loss/crossentropy": 1.8780412912368774, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18471294036135077, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 33.0, |
| "grad_norm_var": 15.1947265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9741, |
| "loss/crossentropy": 1.7919296585023403, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19309423677623272, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 31.375, |
| "grad_norm_var": 16.696809895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.9971, |
| "loss/crossentropy": 1.8414636544883252, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1888686059974134, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.1385, |
| "grad_norm": 28.5, |
| "grad_norm_var": 7.121875, |
| "learning_rate": 0.0001, |
| "loss": 6.9869, |
| "loss/crossentropy": 1.8438507467508316, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19185615349560975, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 33.25, |
| "grad_norm_var": 10.338541666666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9528, |
| "loss/crossentropy": 1.8890479058027267, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19027914050966502, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.1395, |
| "grad_norm": 33.5, |
| "grad_norm_var": 12.343684895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.9585, |
| "loss/crossentropy": 1.6378353632986546, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18243511486798525, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.9384765625, |
| "learning_rate": 0.0001, |
| "loss": 6.885, |
| "loss/crossentropy": 1.6422518469393252, |
| "loss/hidden": 3.26875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15738149764947593, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.1405, |
| "grad_norm": 35.0, |
| "grad_norm_var": 7.362239583333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9251, |
| "loss/crossentropy": 1.818039534240961, |
| "loss/hidden": 3.222265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17553653065115213, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 28.875, |
| "grad_norm_var": 8.7134765625, |
| "learning_rate": 0.0001, |
| "loss": 6.9659, |
| "loss/crossentropy": 1.8913455709815026, |
| "loss/hidden": 3.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18545334562659263, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.1415, |
| "grad_norm": 27.5, |
| "grad_norm_var": 7.718684895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8653, |
| "loss/crossentropy": 1.9232856243848802, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19609272833913566, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 31.75, |
| "grad_norm_var": 18.7166015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9271, |
| "loss/crossentropy": 1.7873032443225383, |
| "loss/hidden": 3.2796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16436451440677047, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.561393229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.859, |
| "loss/crossentropy": 1.764283910393715, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18506875950843096, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.339322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1328, |
| "loss/crossentropy": 1.746024763584137, |
| "loss/hidden": 3.333984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19091468937695028, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.1435, |
| "grad_norm": 31.125, |
| "grad_norm_var": 7.5875, |
| "learning_rate": 0.0001, |
| "loss": 6.8931, |
| "loss/crossentropy": 1.8621096529066563, |
| "loss/hidden": 3.2265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.167528663482517, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.123372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.0369, |
| "loss/crossentropy": 1.9750339597463609, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20070471633225678, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.1445, |
| "grad_norm": 33.5, |
| "grad_norm_var": 14.7275390625, |
| "learning_rate": 0.0001, |
| "loss": 6.8862, |
| "loss/crossentropy": 1.74088372066617, |
| "loss/hidden": 3.320703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17231013607233764, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 28.375, |
| "grad_norm_var": 19.409830729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.035, |
| "loss/crossentropy": 1.7799094915390015, |
| "loss/hidden": 3.32265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18373552113771438, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.1455, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.517708333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9546, |
| "loss/crossentropy": 1.7803256064653397, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1977113801985979, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.627018229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9317, |
| "loss/crossentropy": 1.8050019271671771, |
| "loss/hidden": 3.257421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16629343312233685, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.1465, |
| "grad_norm": 34.5, |
| "grad_norm_var": 7.16640625, |
| "learning_rate": 0.0001, |
| "loss": 6.9453, |
| "loss/crossentropy": 1.8659825779497623, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1742606306448579, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 35.5, |
| "grad_norm_var": 8.9306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.0142, |
| "loss/crossentropy": 1.913654712587595, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20132352095097303, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 30.25, |
| "grad_norm_var": 6.614518229166666, |
| "learning_rate": 0.0001, |
| "loss": 6.9147, |
| "loss/crossentropy": 1.645759216696024, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16875347392633558, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 29.0, |
| "grad_norm_var": 6.8322265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9988, |
| "loss/crossentropy": 1.8556548431515694, |
| "loss/hidden": 3.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17874295320361852, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.1485, |
| "grad_norm": 28.75, |
| "grad_norm_var": 3.5791666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0313, |
| "loss/crossentropy": 1.688177353143692, |
| "loss/hidden": 3.28515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16950420523062348, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.246875, |
| "learning_rate": 0.0001, |
| "loss": 7.0247, |
| "loss/crossentropy": 2.071097436547279, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20375496093183756, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.1495, |
| "grad_norm": 27.5, |
| "grad_norm_var": 2.6014973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.0495, |
| "loss/crossentropy": 1.852598314732313, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16860631257295608, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 27.0, |
| "grad_norm_var": 3.5122395833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.7966, |
| "loss/crossentropy": 1.7948169738054276, |
| "loss/hidden": 3.325, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17319696098566056, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1505, |
| "grad_norm": 28.875, |
| "grad_norm_var": 4.367122395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.9423, |
| "loss/crossentropy": 1.6970888696610928, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16700221002101898, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 3674210304.0, |
| "grad_norm_var": 2.0173451962123377e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9283, |
| "loss/crossentropy": 1.713117253035307, |
| "loss/hidden": 3.34375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1704209728166461, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.1515, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.710129338897767e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0097, |
| "loss/crossentropy": 1.9506682097911834, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19250028654932977, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.1416666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0202, |
| "loss/crossentropy": 1.831156849861145, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18563526798970997, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.6768229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0529, |
| "loss/crossentropy": 1.8806451916694642, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19239903232082725, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.8375, |
| "learning_rate": 0.0001, |
| "loss": 6.9243, |
| "loss/crossentropy": 1.8184577412903309, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.173899077065289, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.1535, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.6489583333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.901, |
| "loss/crossentropy": 1.782475320994854, |
| "loss/hidden": 3.303125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17683281004428864, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.4770833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0536, |
| "loss/crossentropy": 1.7542385324835776, |
| "loss/hidden": 3.31015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1734863522462547, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.1545, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.5077473958333334, |
| "learning_rate": 0.0001, |
| "loss": 6.7429, |
| "loss/crossentropy": 1.721788990870118, |
| "loss/hidden": 3.336328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1703654458746314, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 40.25, |
| "grad_norm_var": 9.09140625, |
| "learning_rate": 0.0001, |
| "loss": 6.9729, |
| "loss/crossentropy": 1.6206283092498779, |
| "loss/hidden": 3.35859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1712807172909379, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.1555, |
| "grad_norm": 32.0, |
| "grad_norm_var": 8.16640625, |
| "learning_rate": 0.0001, |
| "loss": 6.8604, |
| "loss/crossentropy": 1.7044736705720425, |
| "loss/hidden": 3.247265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16109976628795267, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 28.875, |
| "grad_norm_var": 61.90305989583333, |
| "learning_rate": 0.0001, |
| "loss": 6.8603, |
| "loss/crossentropy": 1.7201604932546615, |
| "loss/hidden": 3.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1717333897948265, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.1565, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.2666015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9316, |
| "loss/crossentropy": 1.611024511605501, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17799030421301723, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 29.25, |
| "grad_norm_var": 6.059830729166666, |
| "learning_rate": 0.0001, |
| "loss": 6.8749, |
| "loss/crossentropy": 1.542306227236986, |
| "loss/hidden": 3.2828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17464940482750535, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.820572916666666, |
| "learning_rate": 0.0001, |
| "loss": 6.8917, |
| "loss/crossentropy": 1.7465024203062058, |
| "loss/hidden": 3.3546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18054623370990158, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.787239583333333, |
| "learning_rate": 0.0001, |
| "loss": 6.969, |
| "loss/crossentropy": 2.0858161732554437, |
| "loss/hidden": 3.28125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18169568832963706, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.1585, |
| "grad_norm": 29.5, |
| "grad_norm_var": 4.023372395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8406, |
| "loss/crossentropy": 1.9426328182220458, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17592350710183383, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 29.125, |
| "grad_norm_var": 1.5832967238438093e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9453, |
| "loss/crossentropy": 1.8308497540652753, |
| "loss/hidden": 3.603125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19216080345213413, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.1595, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.5832967237861376e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9032, |
| "loss/crossentropy": 1.705291760712862, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1843032216653228, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 40.25, |
| "grad_norm_var": 19.5056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.0209, |
| "loss/crossentropy": 1.7651132240891456, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1844408256933093, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.1605, |
| "grad_norm": 38.0, |
| "grad_norm_var": 6.217782109866559e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.7736, |
| "loss/crossentropy": 1.8051001697778701, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17440476845949887, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.428059895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.9109, |
| "loss/crossentropy": 1.8851144686341286, |
| "loss/hidden": 3.2359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17897074315696954, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.1615, |
| "grad_norm": 30.125, |
| "grad_norm_var": 17.601041666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9799, |
| "loss/crossentropy": 1.6312229566276073, |
| "loss/hidden": 3.2609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16838383311405777, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 31.5, |
| "grad_norm_var": 20.835872395833334, |
| "learning_rate": 0.0001, |
| "loss": 6.932, |
| "loss/crossentropy": 2.011029013991356, |
| "loss/hidden": 3.310546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1832389457151294, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 28.375, |
| "grad_norm_var": 7.161458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.0405, |
| "loss/crossentropy": 1.8453179642558097, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19180234288796782, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 36.5, |
| "grad_norm_var": 10.517708333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.823, |
| "loss/crossentropy": 1.9555616907775402, |
| "loss/hidden": 3.318359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17895318511873484, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.1635, |
| "grad_norm": 29.125, |
| "grad_norm_var": 8.909830729166666, |
| "learning_rate": 0.0001, |
| "loss": 6.892, |
| "loss/crossentropy": 1.843096625804901, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18395393253304065, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 27.875, |
| "grad_norm_var": 7.260416666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9288, |
| "loss/crossentropy": 1.688144066929817, |
| "loss/hidden": 3.277734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.172001248691231, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.1645, |
| "grad_norm": 37.5, |
| "grad_norm_var": 12.014518229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9012, |
| "loss/crossentropy": 1.6900858603417874, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850940717384219, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 30.5, |
| "grad_norm_var": 11.887955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.0327, |
| "loss/crossentropy": 1.8690055832266808, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2061467545107007, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1655, |
| "grad_norm": 33.25, |
| "grad_norm_var": 44.0900390625, |
| "learning_rate": 0.0001, |
| "loss": 6.9398, |
| "loss/crossentropy": 1.864616620540619, |
| "loss/hidden": 3.21875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15337421298027037, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 37.25, |
| "grad_norm_var": 45.87473958333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9275, |
| "loss/crossentropy": 1.8501743324100972, |
| "loss/hidden": 3.266796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17113643269985915, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.1665, |
| "grad_norm": 29.625, |
| "grad_norm_var": 1.1349774579334994e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0081, |
| "loss/crossentropy": 1.779020744562149, |
| "loss/hidden": 3.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1846176441758871, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 35.75, |
| "grad_norm_var": 1.0819897936507308e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9779, |
| "loss/crossentropy": 1.7754384666681289, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19158907625824212, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 34.25, |
| "grad_norm_var": 1.081989793663733e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.0539, |
| "loss/crossentropy": 1.759375052154064, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18603504877537488, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 29.875, |
| "grad_norm_var": 5.620833333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.8589, |
| "loss/crossentropy": 1.845319252461195, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18480119155719876, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.1685, |
| "grad_norm": 34.5, |
| "grad_norm_var": 19.439583333333335, |
| "learning_rate": 0.0001, |
| "loss": 6.9772, |
| "loss/crossentropy": 1.6411745361983776, |
| "loss/hidden": 3.321484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.15529545303434134, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 28.5, |
| "grad_norm_var": 36.9103515625, |
| "learning_rate": 0.0001, |
| "loss": 6.8489, |
| "loss/crossentropy": 1.7360669024288655, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17661824598908424, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.1695, |
| "grad_norm": 29.375, |
| "grad_norm_var": 35.46848958333333, |
| "learning_rate": 0.0001, |
| "loss": 6.7757, |
| "loss/crossentropy": 1.7902205429971219, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17157120602205395, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.6395833333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.8708, |
| "loss/crossentropy": 1.842449489980936, |
| "loss/hidden": 3.29921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16762932492420077, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.1705, |
| "grad_norm": 37.0, |
| "grad_norm_var": 6.513997395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8956, |
| "loss/crossentropy": 1.7051387749612332, |
| "loss/hidden": 3.3078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16933946274220943, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 30.75, |
| "grad_norm_var": 9.762239583333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9733, |
| "loss/crossentropy": 1.7448437750339507, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20119084492325784, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.1715, |
| "grad_norm": 78.0, |
| "grad_norm_var": 144.5125, |
| "learning_rate": 0.0001, |
| "loss": 7.0287, |
| "loss/crossentropy": 1.824779784679413, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17832597270607947, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 28.125, |
| "grad_norm_var": 145.68430989583334, |
| "learning_rate": 0.0001, |
| "loss": 6.7435, |
| "loss/crossentropy": 1.6466563902795315, |
| "loss/hidden": 3.328515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16660706931725144, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 29.875, |
| "grad_norm_var": 8.811393229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.934, |
| "loss/crossentropy": 1.8493422105908395, |
| "loss/hidden": 3.333984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2010068495757878, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 29.0, |
| "grad_norm_var": 6.62890625, |
| "learning_rate": 0.0001, |
| "loss": 6.8496, |
| "loss/crossentropy": 1.6380462288856505, |
| "loss/hidden": 3.26015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18449038956314326, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.1735, |
| "grad_norm": 32.75, |
| "grad_norm_var": 32.5875, |
| "learning_rate": 0.0001, |
| "loss": 6.9309, |
| "loss/crossentropy": 1.6813900470733643, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18437479846179486, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 31.625, |
| "grad_norm_var": 7.465419918819722e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.1677, |
| "loss/crossentropy": 1.789808637648821, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17758243400603532, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.1745, |
| "grad_norm": 29.75, |
| "grad_norm_var": 58.84557291666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8709, |
| "loss/crossentropy": 1.8069385841488839, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19292932376265526, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 28.25, |
| "grad_norm_var": 13.452018229166667, |
| "learning_rate": 0.0001, |
| "loss": 6.9084, |
| "loss/crossentropy": 1.6264689728617667, |
| "loss/hidden": 3.269140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16363061694428324, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.1755, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.459166261163747e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9837, |
| "loss/crossentropy": 1.7061957284808158, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18923843959346415, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.459166260217512e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.9459, |
| "loss/crossentropy": 1.6986562974750996, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18663678420707583, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.1765, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.8478515625, |
| "learning_rate": 0.0001, |
| "loss": 6.9793, |
| "loss/crossentropy": 1.7609238177537918, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19038589783012866, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.1770833333333335, |
| "learning_rate": 0.0001, |
| "loss": 6.9966, |
| "loss/crossentropy": 1.9084905117750168, |
| "loss/hidden": 3.33984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1776235220953822, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.051497395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.9292, |
| "loss/crossentropy": 1.6809238217771054, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19617705075070263, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.9955729166666667, |
| "learning_rate": 0.0001, |
| "loss": 6.972, |
| "loss/crossentropy": 1.6389021024107933, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18174178060144186, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.1785, |
| "grad_norm": 36.5, |
| "grad_norm_var": 7.553059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.0848, |
| "loss/crossentropy": 1.7566796734929084, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1923373954370618, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.9603515625, |
| "learning_rate": 0.0001, |
| "loss": 6.956, |
| "loss/crossentropy": 1.7154954925179482, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17990761240944267, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.1795, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.399934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.0142, |
| "loss/crossentropy": 1.8327077120542525, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1800425429828465, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.3247395833333333, |
| "learning_rate": 0.0001, |
| "loss": 6.9351, |
| "loss/crossentropy": 1.8267195105552674, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19433746309950947, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1805, |
| "grad_norm": 32.25, |
| "grad_norm_var": 24.673372395833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8892, |
| "loss/crossentropy": 1.737992748618126, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20098126940429212, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 30.25, |
| "grad_norm_var": 33.395572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0103, |
| "loss/crossentropy": 1.8915371976792812, |
| "loss/hidden": 3.305078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1876732436940074, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.1815, |
| "grad_norm": 26.5, |
| "grad_norm_var": 38.799739583333334, |
| "learning_rate": 0.0001, |
| "loss": 6.981, |
| "loss/crossentropy": 1.7780213125050068, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18925584964454173, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.0995116106143062e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.039, |
| "loss/crossentropy": 1.7201772332191467, |
| "loss/hidden": 3.292578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1817839713767171, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 29.125, |
| "grad_norm_var": 1.0995116110905345e+18, |
| "learning_rate": 0.0001, |
| "loss": 6.7937, |
| "loss/crossentropy": 1.824095284193754, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16389566464349628, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 28.625, |
| "grad_norm_var": 14.382291666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.918, |
| "loss/crossentropy": 1.7039800986647606, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17251853737980127, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.1835, |
| "grad_norm": 29.375, |
| "grad_norm_var": 0.82265625, |
| "learning_rate": 0.0001, |
| "loss": 6.8614, |
| "loss/crossentropy": 1.670785766094923, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1893833376467228, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 28.375, |
| "grad_norm_var": 8.297916666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8747, |
| "loss/crossentropy": 1.7371518418192864, |
| "loss/hidden": 3.329296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17423492725938558, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.1845, |
| "grad_norm": 30.5, |
| "grad_norm_var": 11.51640625, |
| "learning_rate": 0.0001, |
| "loss": 7.1482, |
| "loss/crossentropy": 2.011937528848648, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19120746664702892, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 29.75, |
| "grad_norm_var": 114.8119140625, |
| "learning_rate": 0.0001, |
| "loss": 6.9318, |
| "loss/crossentropy": 1.9779032841324806, |
| "loss/hidden": 3.508203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19792085662484168, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.1855, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.1666015625, |
| "learning_rate": 0.0001, |
| "loss": 6.9801, |
| "loss/crossentropy": 1.8196966513991355, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17692473586648702, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 31.625, |
| "grad_norm_var": 7.036874289840129e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9754, |
| "loss/crossentropy": 1.7481721505522727, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1970391605515033, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.1865, |
| "grad_norm": 28.75, |
| "grad_norm_var": 7.036874289385746e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.8514, |
| "loss/crossentropy": 1.609993650764227, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19023605762049556, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.0978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.0947, |
| "loss/crossentropy": 2.0539694875478745, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20270574633032085, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 29.0, |
| "grad_norm_var": 1.9103515625, |
| "learning_rate": 0.0001, |
| "loss": 6.9278, |
| "loss/crossentropy": 1.8215243116021156, |
| "loss/hidden": 3.2640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16428390927612782, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.0541666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0503, |
| "loss/crossentropy": 1.8183038413524628, |
| "loss/hidden": 3.3234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19697826653718947, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.1885, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.0989583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.1034, |
| "loss/crossentropy": 1.7321583658456803, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19197138799354435, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 6.8113, |
| "loss/crossentropy": 1.8538015499711036, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18398043606430292, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.1895, |
| "grad_norm": 29.625, |
| "grad_norm_var": 5.827018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.0817, |
| "loss/crossentropy": 1.8768661253154277, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20710380356758834, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.195572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.0374, |
| "loss/crossentropy": 1.7977422267198562, |
| "loss/hidden": 3.28828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1821097361855209, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.1905, |
| "grad_norm": 36.0, |
| "grad_norm_var": 5.763997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.0815, |
| "loss/crossentropy": 1.743187139183283, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18471882613375784, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 30.25, |
| "grad_norm_var": 6.658333333333333, |
| "learning_rate": 0.0001, |
| "loss": 6.8304, |
| "loss/crossentropy": 1.7315315805375575, |
| "loss/hidden": 3.3015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16926794005557894, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.1915, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.5380756628017e+17, |
| "learning_rate": 0.0001, |
| "loss": 7.091, |
| "loss/crossentropy": 1.8176006272435188, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1842843361198902, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 29.375, |
| "grad_norm_var": 7.563541666666667, |
| "learning_rate": 0.0001, |
| "loss": 6.9694, |
| "loss/crossentropy": 1.777810937166214, |
| "loss/hidden": 3.32578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1739983822219074, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 28.375, |
| "grad_norm_var": 5.01875, |
| "learning_rate": 0.0001, |
| "loss": 6.8715, |
| "loss/crossentropy": 1.9018649347126484, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18006115844473242, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.2455729166666667, |
| "learning_rate": 0.0001, |
| "loss": 6.881, |
| "loss/crossentropy": 1.8844246573746204, |
| "loss/hidden": 3.3390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19470994817093015, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.1935, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.6809895833333335, |
| "learning_rate": 0.0001, |
| "loss": 6.9021, |
| "loss/crossentropy": 1.7199362799525262, |
| "loss/hidden": 3.36328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18913396131247281, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.5197265625, |
| "learning_rate": 0.0001, |
| "loss": 6.9324, |
| "loss/crossentropy": 1.755439005047083, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1858789509162307, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.1945, |
| "grad_norm": 31.125, |
| "grad_norm_var": 3.384375, |
| "learning_rate": 0.0001, |
| "loss": 6.9477, |
| "loss/crossentropy": 1.7906312070786954, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19127205722033977, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.1504557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1196, |
| "loss/crossentropy": 1.9957764573395251, |
| "loss/hidden": 3.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18469135276973248, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.1955, |
| "grad_norm": 31.875, |
| "grad_norm_var": 9.387366238726391e+17, |
| "learning_rate": 0.0001, |
| "loss": 6.9857, |
| "loss/crossentropy": 1.7901725992560387, |
| "loss/hidden": 3.300390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18710751123726369, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 30.625, |
| "grad_norm_var": 27.239322916666666, |
| "learning_rate": 0.0001, |
| "loss": 6.9815, |
| "loss/crossentropy": 1.7652528271079064, |
| "loss/hidden": 3.234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16305868746712804, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.1965, |
| "grad_norm": 29.75, |
| "grad_norm_var": 21.822916666666668, |
| "learning_rate": 0.0001, |
| "loss": 6.873, |
| "loss/crossentropy": 1.7368287414312362, |
| "loss/hidden": 3.433984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19844600670039653, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.035416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.0347, |
| "loss/crossentropy": 1.9710937917232514, |
| "loss/hidden": 3.265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17006599269807338, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 27.875, |
| "grad_norm_var": 55.904622395833336, |
| "learning_rate": 0.0001, |
| "loss": 6.9427, |
| "loss/crossentropy": 1.6834511645138264, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18321871096268297, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 28.25, |
| "grad_norm_var": 4.515559895833333, |
| "learning_rate": 0.0001, |
| "loss": 6.8793, |
| "loss/crossentropy": 1.8481898710131646, |
| "loss/hidden": 3.219921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16018803734332324, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.1985, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.105208333333334, |
| "learning_rate": 0.0001, |
| "loss": 6.9158, |
| "loss/crossentropy": 1.7632210277020932, |
| "loss/hidden": 3.35390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17569016199558973, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.6056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.0644, |
| "loss/crossentropy": 1.8658879399299622, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2075220150873065, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.1995, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.3301432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.1057, |
| "loss/crossentropy": 1.915429985523224, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412722568958998, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 29.75, |
| "grad_norm_var": 139.13170572916667, |
| "learning_rate": 0.0001, |
| "loss": 6.9355, |
| "loss/crossentropy": 1.8257215216755867, |
| "loss/hidden": 3.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18498760322108865, |
| "step": 4000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 4000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1430040128035226e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |