diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4833 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 2000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005, + "grad_norm": 30.875, + "learning_rate": 0.0001, + "loss": 7.1506, + "loss/crossentropy": 1.9750229328870774, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.18868114035576583, + "step": 10 + }, + { + "epoch": 0.001, + "grad_norm": 30.75, + "grad_norm_var": 2.09765625, + "learning_rate": 0.0001, + "loss": 7.266, + "loss/crossentropy": 1.915299428999424, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.19173294119536877, + "step": 20 + }, + { + "epoch": 0.0015, + "grad_norm": 31.625, + "grad_norm_var": 35.572330729166666, + "learning_rate": 0.0001, + "loss": 7.1477, + "loss/crossentropy": 1.845322072505951, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.1835887383669615, + "step": 30 + }, + { + "epoch": 0.002, + "grad_norm": 30.25, + "grad_norm_var": 5.803580729166667, + "learning_rate": 0.0001, + "loss": 7.125, + "loss/crossentropy": 1.8556978717446326, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.22780380193144084, + "step": 40 + }, + { + "epoch": 0.0025, + "grad_norm": 39.5, + "grad_norm_var": 6.737239583333333, + "learning_rate": 0.0001, + "loss": 7.2665, + "loss/crossentropy": 2.051687541604042, + "loss/hidden": 3.45078125, + "loss/jsd": 0.0, + "loss/logits": 0.21537381634116173, + "step": 50 + }, + { + "epoch": 0.003, + "grad_norm": 36.5, + "grad_norm_var": 11.058333333333334, + "learning_rate": 0.0001, + "loss": 7.2095, + "loss/crossentropy": 1.9898784533143044, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19060547631233932, + "step": 60 + }, + { + "epoch": 0.0035, + "grad_norm": 27.0, + "grad_norm_var": 6.45390625, + "learning_rate": 0.0001, + "loss": 7.2606, + "loss/crossentropy": 1.8448080085217953, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18068002099171282, + "step": 70 + }, + { + "epoch": 0.004, + "grad_norm": 38.75, + "grad_norm_var": 1.3401023445121106e+18, + "learning_rate": 0.0001, + "loss": 7.4871, + "loss/crossentropy": 2.0318232350051404, + "loss/hidden": 3.733984375, + "loss/jsd": 0.0, + "loss/logits": 0.337183965742588, + "step": 80 + }, + { + "epoch": 0.0045, + "grad_norm": 35.25, + "grad_norm_var": 1.3401023442516444e+18, + "learning_rate": 0.0001, + "loss": 7.1923, + "loss/crossentropy": 1.7826939225196838, + "loss/hidden": 3.587890625, + "loss/jsd": 0.0, + "loss/logits": 0.2118432404473424, + "step": 90 + }, + { + "epoch": 0.005, + "grad_norm": 32.75, + "grad_norm_var": 2.7309895833333333, + "learning_rate": 0.0001, + "loss": 7.2487, + "loss/crossentropy": 1.88408655077219, + "loss/hidden": 3.48671875, + "loss/jsd": 0.0, + "loss/logits": 0.1903762748464942, + "step": 100 + }, + { + "epoch": 0.0055, + "grad_norm": 34.25, + "grad_norm_var": 4.268489583333333, + "learning_rate": 0.0001, + "loss": 7.1643, + "loss/crossentropy": 1.83259879052639, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.19554968569427728, + "step": 110 + }, + { + "epoch": 0.006, + "grad_norm": 33.0, + "grad_norm_var": 6.548958333333333, + "learning_rate": 0.0001, + "loss": 7.1535, + "loss/crossentropy": 1.8173740945756436, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.17036083210259675, + "step": 120 + }, + { + "epoch": 0.0065, + "grad_norm": 32.25, + "grad_norm_var": 3.220572916666667, + "learning_rate": 0.0001, + "loss": 7.2113, + "loss/crossentropy": 1.8991591855883598, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.20231554415076972, + "step": 130 + }, + { + "epoch": 0.007, + "grad_norm": 120.0, + "grad_norm_var": 494.52890625, + "learning_rate": 0.0001, + "loss": 7.1589, + "loss/crossentropy": 1.9234379842877387, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.19592595770955085, + "step": 140 + }, + { + "epoch": 0.0075, + "grad_norm": 30.375, + "grad_norm_var": 496.27265625, + "learning_rate": 0.0001, + "loss": 7.1392, + "loss/crossentropy": 1.7669467806816102, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1691664818674326, + "step": 150 + }, + { + "epoch": 0.008, + "grad_norm": 35.25, + "grad_norm_var": 202.11354166666666, + "learning_rate": 0.0001, + "loss": 7.2551, + "loss/crossentropy": 1.979496531933546, + "loss/hidden": 3.51484375, + "loss/jsd": 0.0, + "loss/logits": 0.2397671105340123, + "step": 160 + }, + { + "epoch": 0.0085, + "grad_norm": 29.75, + "grad_norm_var": 41.73118489583333, + "learning_rate": 0.0001, + "loss": 7.0709, + "loss/crossentropy": 1.6596970088779925, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.1801933947019279, + "step": 170 + }, + { + "epoch": 0.009, + "grad_norm": 31.375, + "grad_norm_var": 3.1510416666666665, + "learning_rate": 0.0001, + "loss": 7.1329, + "loss/crossentropy": 1.8317318260669708, + "loss/hidden": 3.470703125, + "loss/jsd": 0.0, + "loss/logits": 0.2027322521433234, + "step": 180 + }, + { + "epoch": 0.0095, + "grad_norm": 31.25, + "grad_norm_var": 1.034375, + "learning_rate": 0.0001, + "loss": 7.2704, + "loss/crossentropy": 1.7871993221342564, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.17167234625667332, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 29.375, + "grad_norm_var": 1.4218098958333334, + "learning_rate": 0.0001, + "loss": 7.2074, + "loss/crossentropy": 1.9208836354315282, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.18774686167016624, + "step": 200 + }, + { + "epoch": 0.0105, + "grad_norm": 29.75, + "grad_norm_var": 5.548958333333333, + "learning_rate": 0.0001, + "loss": 7.2446, + "loss/crossentropy": 1.8792764976620675, + "loss/hidden": 3.430859375, + "loss/jsd": 0.0, + "loss/logits": 0.19080359637737274, + "step": 210 + }, + { + "epoch": 0.011, + "grad_norm": 32.25, + "grad_norm_var": 11.7619140625, + "learning_rate": 0.0001, + "loss": 7.2031, + "loss/crossentropy": 1.926865078508854, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.19636590238660573, + "step": 220 + }, + { + "epoch": 0.0115, + "grad_norm": 29.25, + "grad_norm_var": 4.170247395833333, + "learning_rate": 0.0001, + "loss": 7.0576, + "loss/crossentropy": 1.8266212515532971, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.18201391287148, + "step": 230 + }, + { + "epoch": 0.012, + "grad_norm": 31.5, + "grad_norm_var": 1.81015625, + "learning_rate": 0.0001, + "loss": 7.1432, + "loss/crossentropy": 1.8445213377475738, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.18868241235613822, + "step": 240 + }, + { + "epoch": 0.0125, + "grad_norm": 33.75, + "grad_norm_var": 1.9625138843884541e+18, + "learning_rate": 0.0001, + "loss": 7.0655, + "loss/crossentropy": 1.8239912115037442, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.17756748497486113, + "step": 250 + }, + { + "epoch": 0.013, + "grad_norm": 31.875, + "grad_norm_var": 1.56640625, + "learning_rate": 0.0001, + "loss": 7.1575, + "loss/crossentropy": 1.7626003332436084, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.18398213125765323, + "step": 260 + }, + { + "epoch": 0.0135, + "grad_norm": 32.25, + "grad_norm_var": 1.1129557291666667, + "learning_rate": 0.0001, + "loss": 7.1441, + "loss/crossentropy": 1.7845010846853255, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.18147525601089, + "step": 270 + }, + { + "epoch": 0.014, + "grad_norm": 30.25, + "grad_norm_var": 2.9822265625, + "learning_rate": 0.0001, + "loss": 7.1286, + "loss/crossentropy": 1.8358447797596456, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.17241306640207768, + "step": 280 + }, + { + "epoch": 0.0145, + "grad_norm": 33.0, + "grad_norm_var": 10.982291666666667, + "learning_rate": 0.0001, + "loss": 7.1123, + "loss/crossentropy": 1.843992917239666, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.19916406068950893, + "step": 290 + }, + { + "epoch": 0.015, + "grad_norm": 31.5, + "grad_norm_var": 3.6176432291666667, + "learning_rate": 0.0001, + "loss": 6.9761, + "loss/crossentropy": 1.710184234380722, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.1904242929071188, + "step": 300 + }, + { + "epoch": 0.0155, + "grad_norm": 30.625, + "grad_norm_var": 1.4795028269701094e+18, + "learning_rate": 0.0001, + "loss": 7.1128, + "loss/crossentropy": 1.783938717842102, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.19371993821114303, + "step": 310 + }, + { + "epoch": 0.016, + "grad_norm": 27.375, + "grad_norm_var": 9.558072916666667, + "learning_rate": 0.0001, + "loss": 7.1587, + "loss/crossentropy": 1.799688772857189, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.18227657950483261, + "step": 320 + }, + { + "epoch": 0.0165, + "grad_norm": 30.75, + "grad_norm_var": 5.827235584899985e+17, + "learning_rate": 0.0001, + "loss": 7.1719, + "loss/crossentropy": 1.8475290067493915, + "loss/hidden": 3.490234375, + "loss/jsd": 0.0, + "loss/logits": 0.20651640743017197, + "step": 330 + }, + { + "epoch": 0.017, + "grad_norm": 31.875, + "grad_norm_var": 1.0473683707078467e+18, + "learning_rate": 0.0001, + "loss": 7.2024, + "loss/crossentropy": 1.7877734430134296, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.17529369578696788, + "step": 340 + }, + { + "epoch": 0.0175, + "grad_norm": 29.625, + "grad_norm_var": 1.0473683706481477e+18, + "learning_rate": 0.0001, + "loss": 7.0127, + "loss/crossentropy": 1.8476789727807046, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.18340907394886016, + "step": 350 + }, + { + "epoch": 0.018, + "grad_norm": 31.5, + "grad_norm_var": 4.201822916666667, + "learning_rate": 0.0001, + "loss": 7.0837, + "loss/crossentropy": 1.9127952009439468, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.18515819907188416, + "step": 360 + }, + { + "epoch": 0.0185, + "grad_norm": 33.25, + "grad_norm_var": 3.4580729166666666, + "learning_rate": 0.0001, + "loss": 7.1494, + "loss/crossentropy": 1.7446002267301082, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.18972037807106973, + "step": 370 + }, + { + "epoch": 0.019, + "grad_norm": 32.25, + "grad_norm_var": 4.0712890625, + "learning_rate": 0.0001, + "loss": 6.9798, + "loss/crossentropy": 1.6596938122063876, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.16941323587670923, + "step": 380 + }, + { + "epoch": 0.0195, + "grad_norm": 31.5, + "grad_norm_var": 1.8014398298089062e+18, + "learning_rate": 0.0001, + "loss": 7.1659, + "loss/crossentropy": 1.8092470526695252, + "loss/hidden": 3.278515625, + "loss/jsd": 0.0, + "loss/logits": 0.16989028006792067, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 29.25, + "grad_norm_var": 1.801439829596395e+18, + "learning_rate": 0.0001, + "loss": 7.1246, + "loss/crossentropy": 1.803744176030159, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.19061805782839655, + "step": 400 + }, + { + "epoch": 0.0205, + "grad_norm": 30.75, + "grad_norm_var": 1.1895833333333334, + "learning_rate": 0.0001, + "loss": 6.8644, + "loss/crossentropy": 1.711807917803526, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.17410435527563095, + "step": 410 + }, + { + "epoch": 0.021, + "grad_norm": 28.75, + "grad_norm_var": 1.0518229166666666, + "learning_rate": 0.0001, + "loss": 6.9733, + "loss/crossentropy": 1.9412737876176833, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.1845760691910982, + "step": 420 + }, + { + "epoch": 0.0215, + "grad_norm": 33.75, + "grad_norm_var": 3.36875, + "learning_rate": 0.0001, + "loss": 7.0425, + "loss/crossentropy": 1.6975354842841626, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.17426773644983767, + "step": 430 + }, + { + "epoch": 0.022, + "grad_norm": 28.875, + "grad_norm_var": 4.533072916666667, + "learning_rate": 0.0001, + "loss": 7.0644, + "loss/crossentropy": 1.8431582309305667, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.19988675275817513, + "step": 440 + }, + { + "epoch": 0.0225, + "grad_norm": 28.5, + "grad_norm_var": 4.65, + "learning_rate": 0.0001, + "loss": 7.1091, + "loss/crossentropy": 1.845390348136425, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.18364266194403173, + "step": 450 + }, + { + "epoch": 0.023, + "grad_norm": 30.75, + "grad_norm_var": 4.459375, + "learning_rate": 0.0001, + "loss": 7.0581, + "loss/crossentropy": 1.7513741821050643, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.186102606728673, + "step": 460 + }, + { + "epoch": 0.0235, + "grad_norm": 27.375, + "grad_norm_var": 4.786458333333333, + "learning_rate": 0.0001, + "loss": 6.9763, + "loss/crossentropy": 1.779174941033125, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.17763521214947103, + "step": 470 + }, + { + "epoch": 0.024, + "grad_norm": 32.75, + "grad_norm_var": 4.1, + "learning_rate": 0.0001, + "loss": 6.9638, + "loss/crossentropy": 1.7178381219506265, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.17294319327920676, + "step": 480 + }, + { + "epoch": 0.0245, + "grad_norm": 33.75, + "grad_norm_var": 3.40625, + "learning_rate": 0.0001, + "loss": 6.9397, + "loss/crossentropy": 1.8609587274491788, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.1921778223477304, + "step": 490 + }, + { + "epoch": 0.025, + "grad_norm": 30.125, + "grad_norm_var": 7.0625, + "learning_rate": 0.0001, + "loss": 7.1176, + "loss/crossentropy": 1.8291713461279868, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.18730791788548232, + "step": 500 + }, + { + "epoch": 0.0255, + "grad_norm": 30.375, + "grad_norm_var": 6.520572916666667, + "learning_rate": 0.0001, + "loss": 7.097, + "loss/crossentropy": 1.6978721603751183, + "loss/hidden": 3.354296875, + "loss/jsd": 0.0, + "loss/logits": 0.16910959454253316, + "step": 510 + }, + { + "epoch": 0.026, + "grad_norm": 31.5, + "grad_norm_var": 5.492708333333334, + "learning_rate": 0.0001, + "loss": 7.1184, + "loss/crossentropy": 1.7646001767367125, + "loss/hidden": 3.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.18606224549002945, + "step": 520 + }, + { + "epoch": 0.0265, + "grad_norm": 33.25, + "grad_norm_var": 3.2478515625, + "learning_rate": 0.0001, + "loss": 6.9289, + "loss/crossentropy": 1.7254683546721936, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.19350956091657281, + "step": 530 + }, + { + "epoch": 0.027, + "grad_norm": 28.5, + "grad_norm_var": 3.2426432291666667, + "learning_rate": 0.0001, + "loss": 7.0072, + "loss/crossentropy": 1.8291743457317353, + "loss/hidden": 3.2703125, + "loss/jsd": 0.0, + "loss/logits": 0.17015220914036036, + "step": 540 + }, + { + "epoch": 0.0275, + "grad_norm": 29.375, + "grad_norm_var": 6.1978515625, + "learning_rate": 0.0001, + "loss": 7.0714, + "loss/crossentropy": 1.7038650900125503, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.17573642041534185, + "step": 550 + }, + { + "epoch": 0.028, + "grad_norm": 28.875, + "grad_norm_var": 5.530143229166667, + "learning_rate": 0.0001, + "loss": 7.0376, + "loss/crossentropy": 2.000048974901438, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.20670556500554085, + "step": 560 + }, + { + "epoch": 0.0285, + "grad_norm": 30.125, + "grad_norm_var": 37.509830729166666, + "learning_rate": 0.0001, + "loss": 7.0782, + "loss/crossentropy": 1.7484589993953705, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.20398099757730961, + "step": 570 + }, + { + "epoch": 0.029, + "grad_norm": 30.75, + "grad_norm_var": 37.80930989583333, + "learning_rate": 0.0001, + "loss": 7.1094, + "loss/crossentropy": 1.747946521639824, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.1723929913714528, + "step": 580 + }, + { + "epoch": 0.0295, + "grad_norm": 31.5, + "grad_norm_var": 1.9410807291666667, + "learning_rate": 0.0001, + "loss": 7.0532, + "loss/crossentropy": 1.714518916606903, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.17450172062963248, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 31.375, + "grad_norm_var": 6.620995009586922e+17, + "learning_rate": 0.0001, + "loss": 7.2589, + "loss/crossentropy": 1.7456246592104434, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18539317091926932, + "step": 600 + }, + { + "epoch": 0.0305, + "grad_norm": 31.625, + "grad_norm_var": 6.620995011655063e+17, + "learning_rate": 0.0001, + "loss": 7.1014, + "loss/crossentropy": 1.6763587422668933, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.1931827544234693, + "step": 610 + }, + { + "epoch": 0.031, + "grad_norm": 31.5, + "grad_norm_var": 4.528125, + "learning_rate": 0.0001, + "loss": 7.115, + "loss/crossentropy": 1.849663856625557, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.21164124589413405, + "step": 620 + }, + { + "epoch": 0.0315, + "grad_norm": 31.25, + "grad_norm_var": 3.027083333333333, + "learning_rate": 0.0001, + "loss": 7.1975, + "loss/crossentropy": 1.765239630639553, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.18264974560588598, + "step": 630 + }, + { + "epoch": 0.032, + "grad_norm": 29.25, + "grad_norm_var": 3.428580729166667, + "learning_rate": 0.0001, + "loss": 7.1206, + "loss/crossentropy": 1.8783695727586747, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18768006665632128, + "step": 640 + }, + { + "epoch": 0.0325, + "grad_norm": 30.75, + "grad_norm_var": 3.9385416666666666, + "learning_rate": 0.0001, + "loss": 7.1671, + "loss/crossentropy": 1.8120282679796218, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.21209220625460148, + "step": 650 + }, + { + "epoch": 0.033, + "grad_norm": 31.75, + "grad_norm_var": 1.77265625, + "learning_rate": 0.0001, + "loss": 7.0683, + "loss/crossentropy": 1.6486516989767552, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.17768741883337497, + "step": 660 + }, + { + "epoch": 0.0335, + "grad_norm": 28.5, + "grad_norm_var": 1.9622395833333333, + "learning_rate": 0.0001, + "loss": 7.0341, + "loss/crossentropy": 1.5188174404203891, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.17400255370885134, + "step": 670 + }, + { + "epoch": 0.034, + "grad_norm": 29.25, + "grad_norm_var": 3.075, + "learning_rate": 0.0001, + "loss": 7.0187, + "loss/crossentropy": 1.7111039966344834, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.20188356712460517, + "step": 680 + }, + { + "epoch": 0.0345, + "grad_norm": 30.5, + "grad_norm_var": 1.5458333333333334, + "learning_rate": 0.0001, + "loss": 7.1392, + "loss/crossentropy": 1.7463210627436638, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.18064118530601264, + "step": 690 + }, + { + "epoch": 0.035, + "grad_norm": 30.0, + "grad_norm_var": 1.6020833333333333, + "learning_rate": 0.0001, + "loss": 7.0488, + "loss/crossentropy": 1.913002396374941, + "loss/hidden": 3.248046875, + "loss/jsd": 0.0, + "loss/logits": 0.17795131383463741, + "step": 700 + }, + { + "epoch": 0.0355, + "grad_norm": 3674210304.0, + "grad_norm_var": 2.2729279965717071e+18, + "learning_rate": 0.0001, + "loss": 7.1836, + "loss/crossentropy": 1.7232265777885913, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.19430895978584886, + "step": 710 + }, + { + "epoch": 0.036, + "grad_norm": 29.125, + "grad_norm_var": 8.437388195823355e+17, + "learning_rate": 0.0001, + "loss": 6.9841, + "loss/crossentropy": 1.8030119113624097, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.18302876157686115, + "step": 720 + }, + { + "epoch": 0.0365, + "grad_norm": 30.375, + "grad_norm_var": 2.85, + "learning_rate": 0.0001, + "loss": 6.9804, + "loss/crossentropy": 1.9009442821145057, + "loss/hidden": 3.266796875, + "loss/jsd": 0.0, + "loss/logits": 0.16866004383191466, + "step": 730 + }, + { + "epoch": 0.037, + "grad_norm": 30.0, + "grad_norm_var": 9.339322916666667, + "learning_rate": 0.0001, + "loss": 6.9876, + "loss/crossentropy": 1.6418433368206025, + "loss/hidden": 3.438671875, + "loss/jsd": 0.0, + "loss/logits": 0.191958365496248, + "step": 740 + }, + { + "epoch": 0.0375, + "grad_norm": 30.875, + "grad_norm_var": 7.639322916666667, + "learning_rate": 0.0001, + "loss": 7.0538, + "loss/crossentropy": 1.853764034062624, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.17473467853851615, + "step": 750 + }, + { + "epoch": 0.038, + "grad_norm": 31.125, + "grad_norm_var": 1.0613932291666666, + "learning_rate": 0.0001, + "loss": 7.1458, + "loss/crossentropy": 1.8514880582690239, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.19726306498050689, + "step": 760 + }, + { + "epoch": 0.0385, + "grad_norm": 28.875, + "grad_norm_var": 1.7997395833333334, + "learning_rate": 0.0001, + "loss": 7.0766, + "loss/crossentropy": 1.8405121728777885, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.19442977402359246, + "step": 770 + }, + { + "epoch": 0.039, + "grad_norm": 29.0, + "grad_norm_var": 2.3802083333333335, + "learning_rate": 0.0001, + "loss": 7.0214, + "loss/crossentropy": 1.9466332450509072, + "loss/hidden": 3.289453125, + "loss/jsd": 0.0, + "loss/logits": 0.170109105668962, + "step": 780 + }, + { + "epoch": 0.0395, + "grad_norm": 30.0, + "grad_norm_var": 1.6124348958333334, + "learning_rate": 0.0001, + "loss": 7.1306, + "loss/crossentropy": 1.8399325378239155, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.20626397961750625, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 31.75, + "grad_norm_var": 1.6559895833333333, + "learning_rate": 0.0001, + "loss": 7.1375, + "loss/crossentropy": 1.9278223380446433, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.2024382423609495, + "step": 800 + }, + { + "epoch": 0.0405, + "grad_norm": 27.5, + "grad_norm_var": 16.089322916666667, + "learning_rate": 0.0001, + "loss": 7.0363, + "loss/crossentropy": 1.859210267663002, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18585832975804806, + "step": 810 + }, + { + "epoch": 0.041, + "grad_norm": 28.25, + "grad_norm_var": 38.77265625, + "learning_rate": 0.0001, + "loss": 6.9378, + "loss/crossentropy": 1.8994540706276895, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.2018324811011553, + "step": 820 + }, + { + "epoch": 0.0415, + "grad_norm": 32.0, + "grad_norm_var": 38.8375, + "learning_rate": 0.0001, + "loss": 7.002, + "loss/crossentropy": 1.8244094364345074, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.20930232629179954, + "step": 830 + }, + { + "epoch": 0.042, + "grad_norm": 30.25, + "grad_norm_var": 2.0634765625, + "learning_rate": 0.0001, + "loss": 6.9688, + "loss/crossentropy": 1.8976417139172554, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.1871755332686007, + "step": 840 + }, + { + "epoch": 0.0425, + "grad_norm": 50.75, + "grad_norm_var": 28.351497395833334, + "learning_rate": 0.0001, + "loss": 6.992, + "loss/crossentropy": 1.899886740744114, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.18904313631355762, + "step": 850 + }, + { + "epoch": 0.043, + "grad_norm": 29.0, + "grad_norm_var": 27.3056640625, + "learning_rate": 0.0001, + "loss": 7.0939, + "loss/crossentropy": 1.8286892741918563, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18909739144146442, + "step": 860 + }, + { + "epoch": 0.0435, + "grad_norm": 28.375, + "grad_norm_var": 1.3247395833333333, + "learning_rate": 0.0001, + "loss": 6.9381, + "loss/crossentropy": 1.9782623961567878, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.1766037069261074, + "step": 870 + }, + { + "epoch": 0.044, + "grad_norm": 29.0, + "grad_norm_var": 2.1988932291666665, + "learning_rate": 0.0001, + "loss": 6.8414, + "loss/crossentropy": 1.8968854755163194, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.20138736004009844, + "step": 880 + }, + { + "epoch": 0.0445, + "grad_norm": 32.75, + "grad_norm_var": 1.92890625, + "learning_rate": 0.0001, + "loss": 7.1271, + "loss/crossentropy": 1.8630956932902336, + "loss/hidden": 3.428125, + "loss/jsd": 0.0, + "loss/logits": 0.21029497124254704, + "step": 890 + }, + { + "epoch": 0.045, + "grad_norm": 29.25, + "grad_norm_var": 2.037239583333333, + "learning_rate": 0.0001, + "loss": 7.0435, + "loss/crossentropy": 1.8676601111888886, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.19789310321211814, + "step": 900 + }, + { + "epoch": 0.0455, + "grad_norm": 30.25, + "grad_norm_var": 4.2265225949129395e+17, + "learning_rate": 0.0001, + "loss": 7.1233, + "loss/crossentropy": 1.8434145867824554, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.18832013495266436, + "step": 910 + }, + { + "epoch": 0.046, + "grad_norm": 29.375, + "grad_norm_var": 4.2265225969445555e+17, + "learning_rate": 0.0001, + "loss": 6.8733, + "loss/crossentropy": 1.81582195982337, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.18773540575057268, + "step": 920 + }, + { + "epoch": 0.0465, + "grad_norm": 33.0, + "grad_norm_var": 4.476822916666666, + "learning_rate": 0.0001, + "loss": 7.0752, + "loss/crossentropy": 1.8667447365820409, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18054497512057424, + "step": 930 + }, + { + "epoch": 0.047, + "grad_norm": 28.625, + "grad_norm_var": 6.144205729166667, + "learning_rate": 0.0001, + "loss": 7.0032, + "loss/crossentropy": 1.8144822165369987, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.1632128401659429, + "step": 940 + }, + { + "epoch": 0.0475, + "grad_norm": 30.375, + "grad_norm_var": 5.01875, + "learning_rate": 0.0001, + "loss": 6.8626, + "loss/crossentropy": 1.8152224607765675, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18933067489415406, + "step": 950 + }, + { + "epoch": 0.048, + "grad_norm": 37.0, + "grad_norm_var": 7.297916666666667, + "learning_rate": 0.0001, + "loss": 7.0437, + "loss/crossentropy": 1.6399064034223556, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18825935963541268, + "step": 960 + }, + { + "epoch": 0.0485, + "grad_norm": 29.75, + "grad_norm_var": 4.739583333333333, + "learning_rate": 0.0001, + "loss": 7.0331, + "loss/crossentropy": 1.6737658925354482, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.17548465421423315, + "step": 970 + }, + { + "epoch": 0.049, + "grad_norm": 30.0, + "grad_norm_var": 18.1541015625, + "learning_rate": 0.0001, + "loss": 6.9385, + "loss/crossentropy": 1.8608146458864212, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.19196428768336773, + "step": 980 + }, + { + "epoch": 0.0495, + "grad_norm": 33.75, + "grad_norm_var": 4.003125, + "learning_rate": 0.0001, + "loss": 7.0686, + "loss/crossentropy": 1.8301926247775555, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.18049606634303927, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 31.75, + "grad_norm_var": 1.0473683721235639e+18, + "learning_rate": 0.0001, + "loss": 7.0193, + "loss/crossentropy": 1.7465273767709732, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.17173261381685734, + "step": 1000 + }, + { + "epoch": 0.0505, + "grad_norm": 29.75, + "grad_norm_var": 22.408268229166666, + "learning_rate": 0.0001, + "loss": 6.9709, + "loss/crossentropy": 1.7683202728629113, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.210743809863925, + "step": 1010 + }, + { + "epoch": 0.051, + "grad_norm": 28.625, + "grad_norm_var": 2.371875, + "learning_rate": 0.0001, + "loss": 7.0597, + "loss/crossentropy": 2.046058624982834, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.18963768277317286, + "step": 1020 + }, + { + "epoch": 0.0515, + "grad_norm": 30.0, + "grad_norm_var": 1.3184895833333334, + "learning_rate": 0.0001, + "loss": 7.0245, + "loss/crossentropy": 1.745854178071022, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17351055853068828, + "step": 1030 + }, + { + "epoch": 0.052, + "grad_norm": 34.75, + "grad_norm_var": 2.8108723958333335, + "learning_rate": 0.0001, + "loss": 6.9474, + "loss/crossentropy": 1.8277953140437604, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.16915141120553018, + "step": 1040 + }, + { + "epoch": 0.0525, + "grad_norm": 32.5, + "grad_norm_var": 3.39765625, + "learning_rate": 0.0001, + "loss": 6.9366, + "loss/crossentropy": 1.9404960587620734, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.18970660548657178, + "step": 1050 + }, + { + "epoch": 0.053, + "grad_norm": 35.75, + "grad_norm_var": 1.1892317588406927e+18, + "learning_rate": 0.0001, + "loss": 7.0954, + "loss/crossentropy": 1.8612810902297496, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.17269262354820966, + "step": 1060 + }, + { + "epoch": 0.0535, + "grad_norm": 29.875, + "grad_norm_var": 1.1892317588497805e+18, + "learning_rate": 0.0001, + "loss": 7.0259, + "loss/crossentropy": 1.743497943878174, + "loss/hidden": 3.2609375, + "loss/jsd": 0.0, + "loss/logits": 0.1666251303628087, + "step": 1070 + }, + { + "epoch": 0.054, + "grad_norm": 29.625, + "grad_norm_var": 2.903059895833333, + "learning_rate": 0.0001, + "loss": 7.0055, + "loss/crossentropy": 1.9657445706427097, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.18259168425574898, + "step": 1080 + }, + { + "epoch": 0.0545, + "grad_norm": 30.25, + "grad_norm_var": 51.16015625, + "learning_rate": 0.0001, + "loss": 7.1126, + "loss/crossentropy": 2.0204195216298104, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.20481194872409106, + "step": 1090 + }, + { + "epoch": 0.055, + "grad_norm": 29.625, + "grad_norm_var": 2.90390625, + "learning_rate": 0.0001, + "loss": 7.0413, + "loss/crossentropy": 1.589720468968153, + "loss/hidden": 3.275, + "loss/jsd": 0.0, + "loss/logits": 0.18000307623296977, + "step": 1100 + }, + { + "epoch": 0.0555, + "grad_norm": 29.375, + "grad_norm_var": 2.2613932291666665, + "learning_rate": 0.0001, + "loss": 6.9722, + "loss/crossentropy": 1.7191244810819626, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.18164545409381389, + "step": 1110 + }, + { + "epoch": 0.056, + "grad_norm": 28.875, + "grad_norm_var": 1.7520833333333334, + "learning_rate": 0.0001, + "loss": 6.9492, + "loss/crossentropy": 1.8928776159882545, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.18985262140631676, + "step": 1120 + }, + { + "epoch": 0.0565, + "grad_norm": 30.0, + "grad_norm_var": 1.2447265625, + "learning_rate": 0.0001, + "loss": 7.1367, + "loss/crossentropy": 1.7702923499047756, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.17983693201094866, + "step": 1130 + }, + { + "epoch": 0.057, + "grad_norm": 30.25, + "grad_norm_var": 3.3080729166666667, + "learning_rate": 0.0001, + "loss": 7.0322, + "loss/crossentropy": 1.8519952863454818, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.20197003111243247, + "step": 1140 + }, + { + "epoch": 0.0575, + "grad_norm": 31.125, + "grad_norm_var": 3.1962890625, + "learning_rate": 0.0001, + "loss": 7.0557, + "loss/crossentropy": 1.8624355979263783, + "loss/hidden": 3.526953125, + "loss/jsd": 0.0, + "loss/logits": 0.20604186709970235, + "step": 1150 + }, + { + "epoch": 0.058, + "grad_norm": 28.5, + "grad_norm_var": 22.8462890625, + "learning_rate": 0.0001, + "loss": 6.9562, + "loss/crossentropy": 1.8102556586265564, + "loss/hidden": 3.44609375, + "loss/jsd": 0.0, + "loss/logits": 0.20240887869149446, + "step": 1160 + }, + { + "epoch": 0.0585, + "grad_norm": 32.25, + "grad_norm_var": 23.950455729166666, + "learning_rate": 0.0001, + "loss": 6.9857, + "loss/crossentropy": 1.8860370084643363, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.18206186592578888, + "step": 1170 + }, + { + "epoch": 0.059, + "grad_norm": 30.125, + "grad_norm_var": 1.6518229166666667, + "learning_rate": 0.0001, + "loss": 7.056, + "loss/crossentropy": 1.9338740326464177, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.22607974465936423, + "step": 1180 + }, + { + "epoch": 0.0595, + "grad_norm": 29.5, + "grad_norm_var": 11.267708333333333, + "learning_rate": 0.0001, + "loss": 6.931, + "loss/crossentropy": 1.9357615426182746, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.1852928228676319, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 39.25, + "grad_norm_var": 1.2635411532464435e+18, + "learning_rate": 0.0001, + "loss": 7.0138, + "loss/crossentropy": 1.669256182014942, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1792891369201243, + "step": 1200 + }, + { + "epoch": 0.0605, + "grad_norm": 30.125, + "grad_norm_var": 2.2555340145024479e+18, + "learning_rate": 0.0001, + "loss": 7.003, + "loss/crossentropy": 1.8537344850599766, + "loss/hidden": 3.6078125, + "loss/jsd": 0.0, + "loss/logits": 0.18713028654456138, + "step": 1210 + }, + { + "epoch": 0.061, + "grad_norm": 30.75, + "grad_norm_var": 1.1529214881025404e+18, + "learning_rate": 0.0001, + "loss": 6.9982, + "loss/crossentropy": 1.8868144243955611, + "loss/hidden": 3.259375, + "loss/jsd": 0.0, + "loss/logits": 0.16826356202363968, + "step": 1220 + }, + { + "epoch": 0.0615, + "grad_norm": 38.0, + "grad_norm_var": 11.041080729166667, + "learning_rate": 0.0001, + "loss": 7.1145, + "loss/crossentropy": 1.7373395457863807, + "loss/hidden": 3.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.16631986051797867, + "step": 1230 + }, + { + "epoch": 0.062, + "grad_norm": 28.625, + "grad_norm_var": 6.718489583333334, + "learning_rate": 0.0001, + "loss": 6.8881, + "loss/crossentropy": 1.610298927500844, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.1909397032111883, + "step": 1240 + }, + { + "epoch": 0.0625, + "grad_norm": 29.625, + "grad_norm_var": 4.344205729166666, + "learning_rate": 0.0001, + "loss": 7.0797, + "loss/crossentropy": 1.7361410059034825, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.18541559688746928, + "step": 1250 + }, + { + "epoch": 0.063, + "grad_norm": 27.875, + "grad_norm_var": 3.3889973958333335, + "learning_rate": 0.0001, + "loss": 6.9329, + "loss/crossentropy": 1.7078735738992692, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.18024133574217557, + "step": 1260 + }, + { + "epoch": 0.0635, + "grad_norm": 35.0, + "grad_norm_var": 6.6166015625, + "learning_rate": 0.0001, + "loss": 6.9738, + "loss/crossentropy": 1.8044774197041988, + "loss/hidden": 3.276171875, + "loss/jsd": 0.0, + "loss/logits": 0.1794836211949587, + "step": 1270 + }, + { + "epoch": 0.064, + "grad_norm": 29.375, + "grad_norm_var": 13.601822916666666, + "learning_rate": 0.0001, + "loss": 6.9062, + "loss/crossentropy": 1.8313415050506592, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18087668968364595, + "step": 1280 + }, + { + "epoch": 0.0645, + "grad_norm": 29.75, + "grad_norm_var": 3.6020182291666667, + "learning_rate": 0.0001, + "loss": 6.9407, + "loss/crossentropy": 1.6438103877007961, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.1820345466956496, + "step": 1290 + }, + { + "epoch": 0.065, + "grad_norm": 30.25, + "grad_norm_var": 1.2379557291666667, + "learning_rate": 0.0001, + "loss": 7.0302, + "loss/crossentropy": 1.7621051207184792, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.19308385904878378, + "step": 1300 + }, + { + "epoch": 0.0655, + "grad_norm": 29.375, + "grad_norm_var": 3.46640625, + "learning_rate": 0.0001, + "loss": 7.1178, + "loss/crossentropy": 1.871315811574459, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.19272034596651794, + "step": 1310 + }, + { + "epoch": 0.066, + "grad_norm": 31.625, + "grad_norm_var": 3.609375, + "learning_rate": 0.0001, + "loss": 7.0298, + "loss/crossentropy": 1.8252998240292073, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.21978344805538655, + "step": 1320 + }, + { + "epoch": 0.0665, + "grad_norm": 33.5, + "grad_norm_var": 1.3990009840566536e+18, + "learning_rate": 0.0001, + "loss": 7.068, + "loss/crossentropy": 1.639507355540991, + "loss/hidden": 3.60703125, + "loss/jsd": 0.0, + "loss/logits": 0.18024437148123978, + "step": 1330 + }, + { + "epoch": 0.067, + "grad_norm": 28.75, + "grad_norm_var": 1.3990009842291443e+18, + "learning_rate": 0.0001, + "loss": 6.9556, + "loss/crossentropy": 1.8158223167061807, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.18003626042045653, + "step": 1340 + }, + { + "epoch": 0.0675, + "grad_norm": 29.75, + "grad_norm_var": 3.21640625, + "learning_rate": 0.0001, + "loss": 6.7859, + "loss/crossentropy": 1.6335266396403312, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.1845483684912324, + "step": 1350 + }, + { + "epoch": 0.068, + "grad_norm": 30.75, + "grad_norm_var": 2.5497395833333334, + "learning_rate": 0.0001, + "loss": 6.8607, + "loss/crossentropy": 1.7433619983494282, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.17121702507138253, + "step": 1360 + }, + { + "epoch": 0.0685, + "grad_norm": 28.0, + "grad_norm_var": 4.353580729166667, + "learning_rate": 0.0001, + "loss": 7.1422, + "loss/crossentropy": 1.8455571182072164, + "loss/hidden": 3.333203125, + "loss/jsd": 0.0, + "loss/logits": 0.2054300512187183, + "step": 1370 + }, + { + "epoch": 0.069, + "grad_norm": 29.625, + "grad_norm_var": 3.388541666666667, + "learning_rate": 0.0001, + "loss": 7.0213, + "loss/crossentropy": 1.8241696588695049, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18985041994601487, + "step": 1380 + }, + { + "epoch": 0.0695, + "grad_norm": 31.25, + "grad_norm_var": 8.0431640625, + "learning_rate": 0.0001, + "loss": 7.0, + "loss/crossentropy": 1.7940153643488883, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.18176266234368085, + "step": 1390 + }, + { + "epoch": 0.07, + "grad_norm": 33.0, + "grad_norm_var": 14.3041015625, + "learning_rate": 0.0001, + "loss": 6.898, + "loss/crossentropy": 1.8607503667473793, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.17468307819217443, + "step": 1400 + }, + { + "epoch": 0.0705, + "grad_norm": 28.125, + "grad_norm_var": 13.432291666666666, + "learning_rate": 0.0001, + "loss": 7.031, + "loss/crossentropy": 1.6316836021840573, + "loss/hidden": 3.240234375, + "loss/jsd": 0.0, + "loss/logits": 0.15119749261066318, + "step": 1410 + }, + { + "epoch": 0.071, + "grad_norm": 28.25, + "grad_norm_var": 45.9634765625, + "learning_rate": 0.0001, + "loss": 7.1507, + "loss/crossentropy": 1.8821631267666816, + "loss/hidden": 3.465625, + "loss/jsd": 0.0, + "loss/logits": 0.19027305245399476, + "step": 1420 + }, + { + "epoch": 0.0715, + "grad_norm": 28.375, + "grad_norm_var": 46.1884765625, + "learning_rate": 0.0001, + "loss": 7.063, + "loss/crossentropy": 1.6992614693939685, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.2002884623594582, + "step": 1430 + }, + { + "epoch": 0.072, + "grad_norm": 29.625, + "grad_norm_var": 6.732291666666667, + "learning_rate": 0.0001, + "loss": 6.9439, + "loss/crossentropy": 1.7733798533678056, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.17554995641112328, + "step": 1440 + }, + { + "epoch": 0.0725, + "grad_norm": 30.625, + "grad_norm_var": 24.97265625, + "learning_rate": 0.0001, + "loss": 7.0264, + "loss/crossentropy": 1.8444553710520268, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.1976129287853837, + "step": 1450 + }, + { + "epoch": 0.073, + "grad_norm": 41.5, + "grad_norm_var": 18.2275390625, + "learning_rate": 0.0001, + "loss": 7.0056, + "loss/crossentropy": 1.778428715467453, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.17879956895485521, + "step": 1460 + }, + { + "epoch": 0.0735, + "grad_norm": 40.75, + "grad_norm_var": 14.88515625, + "learning_rate": 0.0001, + "loss": 6.8647, + "loss/crossentropy": 1.8260969623923302, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.18223165888339282, + "step": 1470 + }, + { + "epoch": 0.074, + "grad_norm": 30.75, + "grad_norm_var": 12.42265625, + "learning_rate": 0.0001, + "loss": 6.9814, + "loss/crossentropy": 1.852180902659893, + "loss/hidden": 3.188671875, + "loss/jsd": 0.0, + "loss/logits": 0.15915404492989182, + "step": 1480 + }, + { + "epoch": 0.0745, + "grad_norm": 32.0, + "grad_norm_var": 17.264518229166665, + "learning_rate": 0.0001, + "loss": 6.9467, + "loss/crossentropy": 1.8016018435359, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.17374343778938056, + "step": 1490 + }, + { + "epoch": 0.075, + "grad_norm": 27.75, + "grad_norm_var": 16.795572916666668, + "learning_rate": 0.0001, + "loss": 6.9688, + "loss/crossentropy": 1.7803546212613583, + "loss/hidden": 3.230078125, + "loss/jsd": 0.0, + "loss/logits": 0.1623454326763749, + "step": 1500 + }, + { + "epoch": 0.0755, + "grad_norm": 27.125, + "grad_norm_var": 11.0072265625, + "learning_rate": 0.0001, + "loss": 6.9148, + "loss/crossentropy": 1.7990518882870674, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.1776049867272377, + "step": 1510 + }, + { + "epoch": 0.076, + "grad_norm": 28.875, + "grad_norm_var": 9.0009765625, + "learning_rate": 0.0001, + "loss": 6.9834, + "loss/crossentropy": 1.7659361466765404, + "loss/hidden": 3.229296875, + "loss/jsd": 0.0, + "loss/logits": 0.17018448635935784, + "step": 1520 + }, + { + "epoch": 0.0765, + "grad_norm": 28.75, + "grad_norm_var": 5.566666666666666, + "learning_rate": 0.0001, + "loss": 6.9513, + "loss/crossentropy": 1.948898734152317, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.20332392100244762, + "step": 1530 + }, + { + "epoch": 0.077, + "grad_norm": 37.0, + "grad_norm_var": 12.0337890625, + "learning_rate": 0.0001, + "loss": 6.9845, + "loss/crossentropy": 1.897236557304859, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.1786106862127781, + "step": 1540 + }, + { + "epoch": 0.0775, + "grad_norm": 30.75, + "grad_norm_var": 10.74140625, + "learning_rate": 0.0001, + "loss": 6.9651, + "loss/crossentropy": 1.668473443388939, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.18010491924360394, + "step": 1550 + }, + { + "epoch": 0.078, + "grad_norm": 35.0, + "grad_norm_var": 11.645768229166666, + "learning_rate": 0.0001, + "loss": 7.0873, + "loss/crossentropy": 1.8844516187906266, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.19164156243205072, + "step": 1560 + }, + { + "epoch": 0.0785, + "grad_norm": 36.5, + "grad_norm_var": 9.326497395833334, + "learning_rate": 0.0001, + "loss": 6.9175, + "loss/crossentropy": 1.7603260070085525, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17738686297088863, + "step": 1570 + }, + { + "epoch": 0.079, + "grad_norm": 28.25, + "grad_norm_var": 11.4259765625, + "learning_rate": 0.0001, + "loss": 7.0352, + "loss/crossentropy": 1.8728493131697177, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.19688725294545292, + "step": 1580 + }, + { + "epoch": 0.0795, + "grad_norm": 29.25, + "grad_norm_var": 8.5375, + "learning_rate": 0.0001, + "loss": 6.955, + "loss/crossentropy": 1.8099886417388915, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.18610341083258392, + "step": 1590 + }, + { + "epoch": 0.08, + "grad_norm": 36.0, + "grad_norm_var": 19.722330729166668, + "learning_rate": 0.0001, + "loss": 6.9313, + "loss/crossentropy": 1.7017989411950112, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.17710780492052436, + "step": 1600 + }, + { + "epoch": 0.0805, + "grad_norm": 32.25, + "grad_norm_var": 21.603125, + "learning_rate": 0.0001, + "loss": 7.069, + "loss/crossentropy": 1.7873531341552735, + "loss/hidden": 3.333203125, + "loss/jsd": 0.0, + "loss/logits": 0.1812642457894981, + "step": 1610 + }, + { + "epoch": 0.081, + "grad_norm": 28.875, + "grad_norm_var": 3.2207682291666666, + "learning_rate": 0.0001, + "loss": 7.0405, + "loss/crossentropy": 1.7903928458690643, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.19645511778071523, + "step": 1620 + }, + { + "epoch": 0.0815, + "grad_norm": 29.75, + "grad_norm_var": 2.874739583333333, + "learning_rate": 0.0001, + "loss": 7.0022, + "loss/crossentropy": 1.6019535034894943, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.1628541074693203, + "step": 1630 + }, + { + "epoch": 0.082, + "grad_norm": 31.375, + "grad_norm_var": 6.37265625, + "learning_rate": 0.0001, + "loss": 6.7734, + "loss/crossentropy": 1.7893570616841317, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.2000499103218317, + "step": 1640 + }, + { + "epoch": 0.0825, + "grad_norm": 30.5, + "grad_norm_var": 6.910416666666666, + "learning_rate": 0.0001, + "loss": 6.9578, + "loss/crossentropy": 1.6443258710205555, + "loss/hidden": 3.259765625, + "loss/jsd": 0.0, + "loss/logits": 0.16416865289211274, + "step": 1650 + }, + { + "epoch": 0.083, + "grad_norm": 30.5, + "grad_norm_var": 35.25182291666667, + "learning_rate": 0.0001, + "loss": 7.0861, + "loss/crossentropy": 1.8358689159154893, + "loss/hidden": 3.28359375, + "loss/jsd": 0.0, + "loss/logits": 0.1853348884731531, + "step": 1660 + }, + { + "epoch": 0.0835, + "grad_norm": 30.0, + "grad_norm_var": 15.6587890625, + "learning_rate": 0.0001, + "loss": 6.9008, + "loss/crossentropy": 1.9014468491077423, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.19975380562245845, + "step": 1670 + }, + { + "epoch": 0.084, + "grad_norm": 28.25, + "grad_norm_var": 4.9666015625, + "learning_rate": 0.0001, + "loss": 7.0062, + "loss/crossentropy": 1.7637556672096253, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.19306765552610158, + "step": 1680 + }, + { + "epoch": 0.0845, + "grad_norm": 44.0, + "grad_norm_var": 14.08125, + "learning_rate": 0.0001, + "loss": 6.9184, + "loss/crossentropy": 1.7980270460247993, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.17251317510381342, + "step": 1690 + }, + { + "epoch": 0.085, + "grad_norm": 30.0, + "grad_norm_var": 16.656184895833334, + "learning_rate": 0.0001, + "loss": 6.8985, + "loss/crossentropy": 1.9003560155630113, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.19372209012508393, + "step": 1700 + }, + { + "epoch": 0.0855, + "grad_norm": 28.375, + "grad_norm_var": 4.02265625, + "learning_rate": 0.0001, + "loss": 6.8638, + "loss/crossentropy": 1.7488896727561951, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.16111841816455125, + "step": 1710 + }, + { + "epoch": 0.086, + "grad_norm": 4362076160.0, + "grad_norm_var": 1.1892317599584748e+18, + "learning_rate": 0.0001, + "loss": 7.061, + "loss/crossentropy": 1.7708093903958797, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.19512660000473261, + "step": 1720 + }, + { + "epoch": 0.0865, + "grad_norm": 30.375, + "grad_norm_var": 1.1892317591996554e+18, + "learning_rate": 0.0001, + "loss": 6.8861, + "loss/crossentropy": 1.6944726780056953, + "loss/hidden": 3.333203125, + "loss/jsd": 0.0, + "loss/logits": 0.16455791369080544, + "step": 1730 + }, + { + "epoch": 0.087, + "grad_norm": 29.375, + "grad_norm_var": 3.2905598958333333, + "learning_rate": 0.0001, + "loss": 6.8425, + "loss/crossentropy": 1.7352489478886128, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.16651339596137404, + "step": 1740 + }, + { + "epoch": 0.0875, + "grad_norm": 29.875, + "grad_norm_var": 1.81015625, + "learning_rate": 0.0001, + "loss": 6.886, + "loss/crossentropy": 1.775932352244854, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.18791395220905543, + "step": 1750 + }, + { + "epoch": 0.088, + "grad_norm": 29.25, + "grad_norm_var": 2.9848307291666667, + "learning_rate": 0.0001, + "loss": 6.8755, + "loss/crossentropy": 1.700956543534994, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.17034402694553136, + "step": 1760 + }, + { + "epoch": 0.0885, + "grad_norm": 30.375, + "grad_norm_var": 2.0660807291666665, + "learning_rate": 0.0001, + "loss": 6.9996, + "loss/crossentropy": 1.6696124613285064, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.17471891567111014, + "step": 1770 + }, + { + "epoch": 0.089, + "grad_norm": 29.0, + "grad_norm_var": 2.7729166666666667, + "learning_rate": 0.0001, + "loss": 6.8325, + "loss/crossentropy": 1.6660587199032306, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.1662266943603754, + "step": 1780 + }, + { + "epoch": 0.0895, + "grad_norm": 32.5, + "grad_norm_var": 4.6900390625, + "learning_rate": 0.0001, + "loss": 6.947, + "loss/crossentropy": 1.8900059774518012, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18781680446118115, + "step": 1790 + }, + { + "epoch": 0.09, + "grad_norm": 30.0, + "grad_norm_var": 4.231705729166666, + "learning_rate": 0.0001, + "loss": 6.9437, + "loss/crossentropy": 1.8869778975844382, + "loss/hidden": 3.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.17426692880690098, + "step": 1800 + }, + { + "epoch": 0.0905, + "grad_norm": 33.0, + "grad_norm_var": 2.8309895833333334, + "learning_rate": 0.0001, + "loss": 6.9652, + "loss/crossentropy": 1.8232818126678467, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.16745625659823418, + "step": 1810 + }, + { + "epoch": 0.091, + "grad_norm": 34.25, + "grad_norm_var": 4.40390625, + "learning_rate": 0.0001, + "loss": 7.0219, + "loss/crossentropy": 1.8258642494678496, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.19198300442658364, + "step": 1820 + }, + { + "epoch": 0.0915, + "grad_norm": 32.25, + "grad_norm_var": 8.268684895833333, + "learning_rate": 0.0001, + "loss": 6.8434, + "loss/crossentropy": 1.7024194486439228, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.18930096151307224, + "step": 1830 + }, + { + "epoch": 0.092, + "grad_norm": 31.625, + "grad_norm_var": 6.74765625, + "learning_rate": 0.0001, + "loss": 6.9231, + "loss/crossentropy": 1.7479817308485508, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.1829341644886881, + "step": 1840 + }, + { + "epoch": 0.0925, + "grad_norm": 33.75, + "grad_norm_var": 4.48515625, + "learning_rate": 0.0001, + "loss": 7.0635, + "loss/crossentropy": 2.0127600729465485, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.18128359764814378, + "step": 1850 + }, + { + "epoch": 0.093, + "grad_norm": 31.75, + "grad_norm_var": 11.642708333333333, + "learning_rate": 0.0001, + "loss": 6.9505, + "loss/crossentropy": 1.7567149683833123, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.1842447452247143, + "step": 1860 + }, + { + "epoch": 0.0935, + "grad_norm": 34.5, + "grad_norm_var": 1.5832967231255347e+18, + "learning_rate": 0.0001, + "loss": 7.1294, + "loss/crossentropy": 1.8183075070381165, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.17170923966914414, + "step": 1870 + }, + { + "epoch": 0.094, + "grad_norm": 36.0, + "grad_norm_var": 14.670833333333333, + "learning_rate": 0.0001, + "loss": 6.7269, + "loss/crossentropy": 1.6782560005784035, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.16191824562847615, + "step": 1880 + }, + { + "epoch": 0.0945, + "grad_norm": 29.5, + "grad_norm_var": 8.283984344848707e+17, + "learning_rate": 0.0001, + "loss": 6.9423, + "loss/crossentropy": 1.7822233349084855, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.15704208929091693, + "step": 1890 + }, + { + "epoch": 0.095, + "grad_norm": 27.25, + "grad_norm_var": 12.049739583333333, + "learning_rate": 0.0001, + "loss": 6.8598, + "loss/crossentropy": 1.8880347676575184, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.18590961638838052, + "step": 1900 + }, + { + "epoch": 0.0955, + "grad_norm": 32.75, + "grad_norm_var": 6.827351348981094e+17, + "learning_rate": 0.0001, + "loss": 7.0671, + "loss/crossentropy": 1.6947499185800552, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.17880834415555, + "step": 1910 + }, + { + "epoch": 0.096, + "grad_norm": 30.875, + "grad_norm_var": 7.036874278235887e+17, + "learning_rate": 0.0001, + "loss": 6.8978, + "loss/crossentropy": 1.6141892828047275, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.18202604549005627, + "step": 1920 + }, + { + "epoch": 0.0965, + "grad_norm": 29.625, + "grad_norm_var": 12.239583333333334, + "learning_rate": 0.0001, + "loss": 6.9659, + "loss/crossentropy": 1.7211613908410073, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.19102244451642036, + "step": 1930 + }, + { + "epoch": 0.097, + "grad_norm": 28.375, + "grad_norm_var": 15.983268229166667, + "learning_rate": 0.0001, + "loss": 6.8912, + "loss/crossentropy": 1.7675188466906548, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.19818378714844584, + "step": 1940 + }, + { + "epoch": 0.0975, + "grad_norm": 32.75, + "grad_norm_var": 9.306266259729068e+17, + "learning_rate": 0.0001, + "loss": 6.9645, + "loss/crossentropy": 1.7558425486087799, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.1911760584451258, + "step": 1950 + }, + { + "epoch": 0.098, + "grad_norm": 27.625, + "grad_norm_var": 1.5205981735288307e+18, + "learning_rate": 0.0001, + "loss": 6.8635, + "loss/crossentropy": 1.7457415886223315, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.1852768061682582, + "step": 1960 + }, + { + "epoch": 0.0985, + "grad_norm": 32.75, + "grad_norm_var": 14.7125, + "learning_rate": 0.0001, + "loss": 6.8508, + "loss/crossentropy": 1.683419554680586, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.1731728465296328, + "step": 1970 + }, + { + "epoch": 0.099, + "grad_norm": 30.625, + "grad_norm_var": 1.0302687666727377e+18, + "learning_rate": 0.0001, + "loss": 7.0005, + "loss/crossentropy": 1.727415306866169, + "loss/hidden": 3.297265625, + "loss/jsd": 0.0, + "loss/logits": 0.18517111875116826, + "step": 1980 + }, + { + "epoch": 0.0995, + "grad_norm": 32.25, + "grad_norm_var": 22.14375, + "learning_rate": 0.0001, + "loss": 6.9138, + "loss/crossentropy": 1.8120180189609527, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.20129222217947246, + "step": 1990 + }, + { + "epoch": 0.1, + "grad_norm": 35.5, + "grad_norm_var": 8.491080729166667, + "learning_rate": 0.0001, + "loss": 6.9525, + "loss/crossentropy": 1.8299045406281949, + "loss/hidden": 3.251171875, + "loss/jsd": 0.0, + "loss/logits": 0.17095453599467875, + "step": 2000 + }, + { + "epoch": 0.1005, + "grad_norm": 32.75, + "grad_norm_var": 8.586458333333333, + "learning_rate": 0.0001, + "loss": 6.7871, + "loss/crossentropy": 1.7243870817124844, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.16602067481726407, + "step": 2010 + }, + { + "epoch": 0.101, + "grad_norm": 29.625, + "grad_norm_var": 9.378125, + "learning_rate": 0.0001, + "loss": 6.855, + "loss/crossentropy": 1.6784847162663936, + "loss/hidden": 3.225, + "loss/jsd": 0.0, + "loss/logits": 0.16919725136831404, + "step": 2020 + }, + { + "epoch": 0.1015, + "grad_norm": 41.0, + "grad_norm_var": 112.83170572916667, + "learning_rate": 0.0001, + "loss": 6.9616, + "loss/crossentropy": 1.8477609053254127, + "loss/hidden": 3.259375, + "loss/jsd": 0.0, + "loss/logits": 0.16309508439153433, + "step": 2030 + }, + { + "epoch": 0.102, + "grad_norm": 30.0, + "grad_norm_var": 111.6259765625, + "learning_rate": 0.0001, + "loss": 6.9517, + "loss/crossentropy": 1.7308252967894078, + "loss/hidden": 3.202734375, + "loss/jsd": 0.0, + "loss/logits": 0.1722710312344134, + "step": 2040 + }, + { + "epoch": 0.1025, + "grad_norm": 30.625, + "grad_norm_var": 4.073893229166667, + "learning_rate": 0.0001, + "loss": 6.9088, + "loss/crossentropy": 1.7544417701661588, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19881883040070533, + "step": 2050 + }, + { + "epoch": 0.103, + "grad_norm": 38.0, + "grad_norm_var": 13.948958333333334, + "learning_rate": 0.0001, + "loss": 6.9474, + "loss/crossentropy": 1.9995075345039368, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.17399701047688723, + "step": 2060 + }, + { + "epoch": 0.1035, + "grad_norm": 31.75, + "grad_norm_var": 21.0744140625, + "learning_rate": 0.0001, + "loss": 6.8732, + "loss/crossentropy": 1.8493791602551937, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.16063635479658842, + "step": 2070 + }, + { + "epoch": 0.104, + "grad_norm": 32.25, + "grad_norm_var": 17.897916666666667, + "learning_rate": 0.0001, + "loss": 6.9556, + "loss/crossentropy": 1.737601400911808, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.18038861453533173, + "step": 2080 + }, + { + "epoch": 0.1045, + "grad_norm": 32.25, + "grad_norm_var": 3.38515625, + "learning_rate": 0.0001, + "loss": 6.979, + "loss/crossentropy": 1.7256839543581008, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.19299248773604633, + "step": 2090 + }, + { + "epoch": 0.105, + "grad_norm": 31.0, + "grad_norm_var": 3.4853515625, + "learning_rate": 0.0001, + "loss": 6.8191, + "loss/crossentropy": 1.7587849080562592, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.16214433256536723, + "step": 2100 + }, + { + "epoch": 0.1055, + "grad_norm": 33.5, + "grad_norm_var": 4.112239583333333, + "learning_rate": 0.0001, + "loss": 7.0774, + "loss/crossentropy": 2.092029668390751, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.19293731367215514, + "step": 2110 + }, + { + "epoch": 0.106, + "grad_norm": 30.375, + "grad_norm_var": 5.1072265625, + "learning_rate": 0.0001, + "loss": 6.9724, + "loss/crossentropy": 1.7829479269683361, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.19456620067358016, + "step": 2120 + }, + { + "epoch": 0.1065, + "grad_norm": 29.375, + "grad_norm_var": 20.4525390625, + "learning_rate": 0.0001, + "loss": 6.9908, + "loss/crossentropy": 1.7853210166096687, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.18279874734580517, + "step": 2130 + }, + { + "epoch": 0.107, + "grad_norm": 36.5, + "grad_norm_var": 20.847330729166668, + "learning_rate": 0.0001, + "loss": 6.9787, + "loss/crossentropy": 1.8366479635238648, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.17941316729411483, + "step": 2140 + }, + { + "epoch": 0.1075, + "grad_norm": 29.25, + "grad_norm_var": 5.1384765625, + "learning_rate": 0.0001, + "loss": 7.0703, + "loss/crossentropy": 1.8491265431046486, + "loss/hidden": 3.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.1788581835106015, + "step": 2150 + }, + { + "epoch": 0.108, + "grad_norm": 28.5, + "grad_norm_var": 3.8082682291666665, + "learning_rate": 0.0001, + "loss": 7.0117, + "loss/crossentropy": 1.8718080654740334, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.18362828250974417, + "step": 2160 + }, + { + "epoch": 0.1085, + "grad_norm": 31.375, + "grad_norm_var": 4.0541015625, + "learning_rate": 0.0001, + "loss": 6.9147, + "loss/crossentropy": 1.823565386980772, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.17529825307428837, + "step": 2170 + }, + { + "epoch": 0.109, + "grad_norm": 29.875, + "grad_norm_var": 3.1510416666666665, + "learning_rate": 0.0001, + "loss": 6.8799, + "loss/crossentropy": 1.8646746143698691, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.18420496406033635, + "step": 2180 + }, + { + "epoch": 0.1095, + "grad_norm": 28.0, + "grad_norm_var": 1.6061848958333333, + "learning_rate": 0.0001, + "loss": 6.9741, + "loss/crossentropy": 1.8418309345841408, + "loss/hidden": 3.289453125, + "loss/jsd": 0.0, + "loss/logits": 0.17159662526100875, + "step": 2190 + }, + { + "epoch": 0.11, + "grad_norm": 31.375, + "grad_norm_var": 2.4184895833333333, + "learning_rate": 0.0001, + "loss": 7.0042, + "loss/crossentropy": 1.8776386469602584, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.187642621435225, + "step": 2200 + }, + { + "epoch": 0.1105, + "grad_norm": 29.125, + "grad_norm_var": 8.20781018083492e+17, + "learning_rate": 0.0001, + "loss": 6.9378, + "loss/crossentropy": 1.655004223436117, + "loss/hidden": 3.273046875, + "loss/jsd": 0.0, + "loss/logits": 0.1580679954495281, + "step": 2210 + }, + { + "epoch": 0.111, + "grad_norm": 30.125, + "grad_norm_var": 3.468489583333333, + "learning_rate": 0.0001, + "loss": 6.9831, + "loss/crossentropy": 1.792271687835455, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.17089223572984338, + "step": 2220 + }, + { + "epoch": 0.1115, + "grad_norm": 34.75, + "grad_norm_var": 4.209375, + "learning_rate": 0.0001, + "loss": 6.8364, + "loss/crossentropy": 1.734425350278616, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18262410946190358, + "step": 2230 + }, + { + "epoch": 0.112, + "grad_norm": 27.0, + "grad_norm_var": 4.629166666666666, + "learning_rate": 0.0001, + "loss": 6.8305, + "loss/crossentropy": 1.772131036967039, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.1691578391008079, + "step": 2240 + }, + { + "epoch": 0.1125, + "grad_norm": 29.125, + "grad_norm_var": 6.303580729166667, + "learning_rate": 0.0001, + "loss": 6.9967, + "loss/crossentropy": 1.9334307715296746, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.19251629430800676, + "step": 2250 + }, + { + "epoch": 0.113, + "grad_norm": 36.5, + "grad_norm_var": 6.4791015625, + "learning_rate": 0.0001, + "loss": 6.981, + "loss/crossentropy": 1.887280984222889, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.21319616939872504, + "step": 2260 + }, + { + "epoch": 0.1135, + "grad_norm": 28.75, + "grad_norm_var": 4.7009765625, + "learning_rate": 0.0001, + "loss": 7.0286, + "loss/crossentropy": 1.8285806521773338, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.18080311622470618, + "step": 2270 + }, + { + "epoch": 0.114, + "grad_norm": 31.0, + "grad_norm_var": 7.09375, + "learning_rate": 0.0001, + "loss": 6.863, + "loss/crossentropy": 1.6441345304250716, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.18446694109588863, + "step": 2280 + }, + { + "epoch": 0.1145, + "grad_norm": 30.125, + "grad_norm_var": 9.029166666666667, + "learning_rate": 0.0001, + "loss": 6.8549, + "loss/crossentropy": 1.5048397369682789, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.16425186553969978, + "step": 2290 + }, + { + "epoch": 0.115, + "grad_norm": 28.625, + "grad_norm_var": 3.9400390625, + "learning_rate": 0.0001, + "loss": 6.9448, + "loss/crossentropy": 1.7213742382824422, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.17190376687794923, + "step": 2300 + }, + { + "epoch": 0.1155, + "grad_norm": 29.125, + "grad_norm_var": 51.71608072916667, + "learning_rate": 0.0001, + "loss": 7.0456, + "loss/crossentropy": 1.8745042860507966, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.1922046933323145, + "step": 2310 + }, + { + "epoch": 0.116, + "grad_norm": 31.625, + "grad_norm_var": 5.101822916666666, + "learning_rate": 0.0001, + "loss": 7.0037, + "loss/crossentropy": 1.835337746143341, + "loss/hidden": 3.291015625, + "loss/jsd": 0.0, + "loss/logits": 0.172516768053174, + "step": 2320 + }, + { + "epoch": 0.1165, + "grad_norm": 29.625, + "grad_norm_var": 4.792122395833333, + "learning_rate": 0.0001, + "loss": 6.8605, + "loss/crossentropy": 1.7886844381690026, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17270518001168966, + "step": 2330 + }, + { + "epoch": 0.117, + "grad_norm": 30.875, + "grad_norm_var": 24.301041666666666, + "learning_rate": 0.0001, + "loss": 6.8857, + "loss/crossentropy": 1.8270663298666476, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.17341279415413738, + "step": 2340 + }, + { + "epoch": 0.1175, + "grad_norm": 28.25, + "grad_norm_var": 23.795247395833332, + "learning_rate": 0.0001, + "loss": 6.981, + "loss/crossentropy": 1.7389558240771295, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.20616078823804856, + "step": 2350 + }, + { + "epoch": 0.118, + "grad_norm": 31.75, + "grad_norm_var": 3.3712890625, + "learning_rate": 0.0001, + "loss": 6.9706, + "loss/crossentropy": 1.7505015313625336, + "loss/hidden": 3.2375, + "loss/jsd": 0.0, + "loss/logits": 0.1691287737339735, + "step": 2360 + }, + { + "epoch": 0.1185, + "grad_norm": 29.875, + "grad_norm_var": 3.7864583333333335, + "learning_rate": 0.0001, + "loss": 7.0493, + "loss/crossentropy": 1.8290210530161857, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.17870840784162284, + "step": 2370 + }, + { + "epoch": 0.119, + "grad_norm": 28.75, + "grad_norm_var": 3.06640625, + "learning_rate": 0.0001, + "loss": 6.8945, + "loss/crossentropy": 1.7312066838145257, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.16353450021706523, + "step": 2380 + }, + { + "epoch": 0.1195, + "grad_norm": 38.75, + "grad_norm_var": 8.985384797395922e+17, + "learning_rate": 0.0001, + "loss": 7.1346, + "loss/crossentropy": 1.8643671602010727, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.18776546316221357, + "step": 2390 + }, + { + "epoch": 0.12, + "grad_norm": 33.25, + "grad_norm_var": 8.985384795065637e+17, + "learning_rate": 0.0001, + "loss": 7.0339, + "loss/crossentropy": 1.7668686166405678, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.19443758334964514, + "step": 2400 + }, + { + "epoch": 0.1205, + "grad_norm": 30.25, + "grad_norm_var": 1.8852243670131978e+18, + "learning_rate": 0.0001, + "loss": 6.9626, + "loss/crossentropy": 1.8465783804655076, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.1860800025984645, + "step": 2410 + }, + { + "epoch": 0.121, + "grad_norm": 33.75, + "grad_norm_var": 1.8852243674568678e+18, + "learning_rate": 0.0001, + "loss": 6.8455, + "loss/crossentropy": 1.7212153851985932, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.18209199868142606, + "step": 2420 + }, + { + "epoch": 0.1215, + "grad_norm": 28.0, + "grad_norm_var": 3.81015625, + "learning_rate": 0.0001, + "loss": 6.9986, + "loss/crossentropy": 1.898094529658556, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.194298998080194, + "step": 2430 + }, + { + "epoch": 0.122, + "grad_norm": 27.5, + "grad_norm_var": 3.332291666666667, + "learning_rate": 0.0001, + "loss": 6.8924, + "loss/crossentropy": 1.7420293487608434, + "loss/hidden": 3.2546875, + "loss/jsd": 0.0, + "loss/logits": 0.161607267241925, + "step": 2440 + }, + { + "epoch": 0.1225, + "grad_norm": 33.0, + "grad_norm_var": 2.7622395833333333, + "learning_rate": 0.0001, + "loss": 6.8686, + "loss/crossentropy": 1.6050585605204106, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.17848586086183788, + "step": 2450 + }, + { + "epoch": 0.123, + "grad_norm": 28.75, + "grad_norm_var": 2.4400390625, + "learning_rate": 0.0001, + "loss": 6.9804, + "loss/crossentropy": 1.9553805246949196, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19847506172955037, + "step": 2460 + }, + { + "epoch": 0.1235, + "grad_norm": 29.875, + "grad_norm_var": 2.0791015625, + "learning_rate": 0.0001, + "loss": 6.9913, + "loss/crossentropy": 1.4568642482161522, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.15850053485482932, + "step": 2470 + }, + { + "epoch": 0.124, + "grad_norm": 31.875, + "grad_norm_var": 4.3775390625, + "learning_rate": 0.0001, + "loss": 6.9326, + "loss/crossentropy": 1.6532236352562903, + "loss/hidden": 3.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.18165745195001365, + "step": 2480 + }, + { + "epoch": 0.1245, + "grad_norm": 28.5, + "grad_norm_var": 4.522330729166667, + "learning_rate": 0.0001, + "loss": 7.005, + "loss/crossentropy": 1.6793559297919274, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.17017313856631516, + "step": 2490 + }, + { + "epoch": 0.125, + "grad_norm": 30.75, + "grad_norm_var": 4.3353515625, + "learning_rate": 0.0001, + "loss": 7.0956, + "loss/crossentropy": 1.8292289204895495, + "loss/hidden": 3.38203125, + "loss/jsd": 0.0, + "loss/logits": 0.18509325329214335, + "step": 2500 + }, + { + "epoch": 0.1255, + "grad_norm": 30.875, + "grad_norm_var": 3.78125, + "learning_rate": 0.0001, + "loss": 6.9137, + "loss/crossentropy": 1.7439368188381195, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.19252277240157128, + "step": 2510 + }, + { + "epoch": 0.126, + "grad_norm": 31.125, + "grad_norm_var": 1.1349774577470627e+18, + "learning_rate": 0.0001, + "loss": 7.051, + "loss/crossentropy": 2.0631623208522796, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.22505897115916013, + "step": 2520 + }, + { + "epoch": 0.1265, + "grad_norm": 29.75, + "grad_norm_var": 1.1349774575828206e+18, + "learning_rate": 0.0001, + "loss": 7.1194, + "loss/crossentropy": 1.8867668241262436, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.20316522121429442, + "step": 2530 + }, + { + "epoch": 0.127, + "grad_norm": 28.25, + "grad_norm_var": 20.151822916666667, + "learning_rate": 0.0001, + "loss": 7.0832, + "loss/crossentropy": 1.8491319343447685, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.19702840279787778, + "step": 2540 + }, + { + "epoch": 0.1275, + "grad_norm": 29.0, + "grad_norm_var": 11.6681640625, + "learning_rate": 0.0001, + "loss": 6.9728, + "loss/crossentropy": 1.8162995487451554, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.18736656550318004, + "step": 2550 + }, + { + "epoch": 0.128, + "grad_norm": 34.5, + "grad_norm_var": 13.088997395833333, + "learning_rate": 0.0001, + "loss": 7.1137, + "loss/crossentropy": 2.031092081964016, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.21819815230555833, + "step": 2560 + }, + { + "epoch": 0.1285, + "grad_norm": 31.125, + "grad_norm_var": 1.7945788315993818e+17, + "learning_rate": 0.0001, + "loss": 7.0175, + "loss/crossentropy": 1.731457906216383, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.18550403621047734, + "step": 2570 + }, + { + "epoch": 0.129, + "grad_norm": 32.0, + "grad_norm_var": 1.794578832870256e+17, + "learning_rate": 0.0001, + "loss": 6.8552, + "loss/crossentropy": 1.8714622184634209, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.1802680429071188, + "step": 2580 + }, + { + "epoch": 0.1295, + "grad_norm": 38.75, + "grad_norm_var": 11.655143229166667, + "learning_rate": 0.0001, + "loss": 6.9513, + "loss/crossentropy": 1.6536960810422898, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.19141803495585918, + "step": 2590 + }, + { + "epoch": 0.13, + "grad_norm": 30.25, + "grad_norm_var": 10.824934895833334, + "learning_rate": 0.0001, + "loss": 7.0451, + "loss/crossentropy": 1.7446824312210083, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.21996904909610748, + "step": 2600 + }, + { + "epoch": 0.1305, + "grad_norm": 32.0, + "grad_norm_var": 0.9895182291666667, + "learning_rate": 0.0001, + "loss": 6.9912, + "loss/crossentropy": 1.8711062870919704, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.18015410769730805, + "step": 2610 + }, + { + "epoch": 0.131, + "grad_norm": 29.0, + "grad_norm_var": 1.9697265625, + "learning_rate": 0.0001, + "loss": 6.9974, + "loss/crossentropy": 1.7273207187652588, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.17100013056769967, + "step": 2620 + }, + { + "epoch": 0.1315, + "grad_norm": 33.0, + "grad_norm_var": 0.9681640625, + "learning_rate": 0.0001, + "loss": 6.864, + "loss/crossentropy": 1.772182758897543, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.18076814245432615, + "step": 2630 + }, + { + "epoch": 0.132, + "grad_norm": 29.25, + "grad_norm_var": 5.707291666666666, + "learning_rate": 0.0001, + "loss": 7.1259, + "loss/crossentropy": 1.7641409367322922, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.18833348713815212, + "step": 2640 + }, + { + "epoch": 0.1325, + "grad_norm": 40.75, + "grad_norm_var": 10.91015625, + "learning_rate": 0.0001, + "loss": 7.0193, + "loss/crossentropy": 1.859598373621702, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18878742419183253, + "step": 2650 + }, + { + "epoch": 0.133, + "grad_norm": 31.375, + "grad_norm_var": 18.1822265625, + "learning_rate": 0.0001, + "loss": 6.9707, + "loss/crossentropy": 1.7797490507364273, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.20212376527488232, + "step": 2660 + }, + { + "epoch": 0.1335, + "grad_norm": 29.875, + "grad_norm_var": 11.162239583333333, + "learning_rate": 0.0001, + "loss": 7.0002, + "loss/crossentropy": 1.7839721478521824, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.173302289377898, + "step": 2670 + }, + { + "epoch": 0.134, + "grad_norm": 27.125, + "grad_norm_var": 4.2009765625, + "learning_rate": 0.0001, + "loss": 6.9156, + "loss/crossentropy": 1.7781757101416589, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.19893121821805834, + "step": 2680 + }, + { + "epoch": 0.1345, + "grad_norm": 30.75, + "grad_norm_var": 36.837239583333336, + "learning_rate": 0.0001, + "loss": 7.0997, + "loss/crossentropy": 1.8467799574136734, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.20002066995948553, + "step": 2690 + }, + { + "epoch": 0.135, + "grad_norm": 28.5, + "grad_norm_var": 37.431705729166666, + "learning_rate": 0.0001, + "loss": 6.9236, + "loss/crossentropy": 1.6248198747634888, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.1642201030626893, + "step": 2700 + }, + { + "epoch": 0.1355, + "grad_norm": 34.0, + "grad_norm_var": 4.030989583333334, + "learning_rate": 0.0001, + "loss": 6.9361, + "loss/crossentropy": 1.7102701038122177, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.16836816985160113, + "step": 2710 + }, + { + "epoch": 0.136, + "grad_norm": 26.0, + "grad_norm_var": 1.0907331108694131e+18, + "learning_rate": 0.0001, + "loss": 6.9167, + "loss/crossentropy": 1.8059025250375271, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.16677290350198745, + "step": 2720 + }, + { + "epoch": 0.1365, + "grad_norm": 29.5, + "grad_norm_var": 6.2728515625, + "learning_rate": 0.0001, + "loss": 6.8796, + "loss/crossentropy": 1.776158544421196, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.1929216692224145, + "step": 2730 + }, + { + "epoch": 0.137, + "grad_norm": 28.75, + "grad_norm_var": 7.5431640625, + "learning_rate": 0.0001, + "loss": 6.8288, + "loss/crossentropy": 1.8780412912368774, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.18471294036135077, + "step": 2740 + }, + { + "epoch": 0.1375, + "grad_norm": 33.0, + "grad_norm_var": 15.1947265625, + "learning_rate": 0.0001, + "loss": 6.9741, + "loss/crossentropy": 1.7919296585023403, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.19309423677623272, + "step": 2750 + }, + { + "epoch": 0.138, + "grad_norm": 31.375, + "grad_norm_var": 16.696809895833333, + "learning_rate": 0.0001, + "loss": 6.9971, + "loss/crossentropy": 1.8414636544883252, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.1888686059974134, + "step": 2760 + }, + { + "epoch": 0.1385, + "grad_norm": 28.5, + "grad_norm_var": 7.121875, + "learning_rate": 0.0001, + "loss": 6.9869, + "loss/crossentropy": 1.8438507467508316, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.19185615349560975, + "step": 2770 + }, + { + "epoch": 0.139, + "grad_norm": 33.25, + "grad_norm_var": 10.338541666666666, + "learning_rate": 0.0001, + "loss": 6.9528, + "loss/crossentropy": 1.8890479058027267, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.19027914050966502, + "step": 2780 + }, + { + "epoch": 0.1395, + "grad_norm": 33.5, + "grad_norm_var": 12.343684895833333, + "learning_rate": 0.0001, + "loss": 6.9585, + "loss/crossentropy": 1.6378353632986546, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.18243511486798525, + "step": 2790 + }, + { + "epoch": 0.14, + "grad_norm": 33.0, + "grad_norm_var": 7.9384765625, + "learning_rate": 0.0001, + "loss": 6.885, + "loss/crossentropy": 1.6422518469393252, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.15738149764947593, + "step": 2800 + }, + { + "epoch": 0.1405, + "grad_norm": 35.0, + "grad_norm_var": 7.362239583333333, + "learning_rate": 0.0001, + "loss": 6.9251, + "loss/crossentropy": 1.818039534240961, + "loss/hidden": 3.222265625, + "loss/jsd": 0.0, + "loss/logits": 0.17553653065115213, + "step": 2810 + }, + { + "epoch": 0.141, + "grad_norm": 28.875, + "grad_norm_var": 8.7134765625, + "learning_rate": 0.0001, + "loss": 6.9659, + "loss/crossentropy": 1.8913455709815026, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.18545334562659263, + "step": 2820 + }, + { + "epoch": 0.1415, + "grad_norm": 27.5, + "grad_norm_var": 7.718684895833333, + "learning_rate": 0.0001, + "loss": 6.8653, + "loss/crossentropy": 1.9232856243848802, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.19609272833913566, + "step": 2830 + }, + { + "epoch": 0.142, + "grad_norm": 31.75, + "grad_norm_var": 18.7166015625, + "learning_rate": 0.0001, + "loss": 6.9271, + "loss/crossentropy": 1.7873032443225383, + "loss/hidden": 3.2796875, + "loss/jsd": 0.0, + "loss/logits": 0.16436451440677047, + "step": 2840 + }, + { + "epoch": 0.1425, + "grad_norm": 31.375, + "grad_norm_var": 4.561393229166667, + "learning_rate": 0.0001, + "loss": 6.859, + "loss/crossentropy": 1.764283910393715, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18506875950843096, + "step": 2850 + }, + { + "epoch": 0.143, + "grad_norm": 30.125, + "grad_norm_var": 5.339322916666666, + "learning_rate": 0.0001, + "loss": 7.1328, + "loss/crossentropy": 1.746024763584137, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.19091468937695028, + "step": 2860 + }, + { + "epoch": 0.1435, + "grad_norm": 31.125, + "grad_norm_var": 7.5875, + "learning_rate": 0.0001, + "loss": 6.8931, + "loss/crossentropy": 1.8621096529066563, + "loss/hidden": 3.2265625, + "loss/jsd": 0.0, + "loss/logits": 0.167528663482517, + "step": 2870 + }, + { + "epoch": 0.144, + "grad_norm": 32.25, + "grad_norm_var": 7.123372395833333, + "learning_rate": 0.0001, + "loss": 7.0369, + "loss/crossentropy": 1.9750339597463609, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.20070471633225678, + "step": 2880 + }, + { + "epoch": 0.1445, + "grad_norm": 33.5, + "grad_norm_var": 14.7275390625, + "learning_rate": 0.0001, + "loss": 6.8862, + "loss/crossentropy": 1.74088372066617, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.17231013607233764, + "step": 2890 + }, + { + "epoch": 0.145, + "grad_norm": 28.375, + "grad_norm_var": 19.409830729166668, + "learning_rate": 0.0001, + "loss": 7.035, + "loss/crossentropy": 1.7799094915390015, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.18373552113771438, + "step": 2900 + }, + { + "epoch": 0.1455, + "grad_norm": 31.375, + "grad_norm_var": 5.517708333333333, + "learning_rate": 0.0001, + "loss": 6.9546, + "loss/crossentropy": 1.7803256064653397, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.1977113801985979, + "step": 2910 + }, + { + "epoch": 0.146, + "grad_norm": 28.125, + "grad_norm_var": 5.627018229166667, + "learning_rate": 0.0001, + "loss": 6.9317, + "loss/crossentropy": 1.8050019271671771, + "loss/hidden": 3.257421875, + "loss/jsd": 0.0, + "loss/logits": 0.16629343312233685, + "step": 2920 + }, + { + "epoch": 0.1465, + "grad_norm": 34.5, + "grad_norm_var": 7.16640625, + "learning_rate": 0.0001, + "loss": 6.9453, + "loss/crossentropy": 1.8659825779497623, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.1742606306448579, + "step": 2930 + }, + { + "epoch": 0.147, + "grad_norm": 35.5, + "grad_norm_var": 8.9306640625, + "learning_rate": 0.0001, + "loss": 7.0142, + "loss/crossentropy": 1.913654712587595, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.20132352095097303, + "step": 2940 + }, + { + "epoch": 0.1475, + "grad_norm": 30.25, + "grad_norm_var": 6.614518229166666, + "learning_rate": 0.0001, + "loss": 6.9147, + "loss/crossentropy": 1.645759216696024, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.16875347392633558, + "step": 2950 + }, + { + "epoch": 0.148, + "grad_norm": 29.0, + "grad_norm_var": 6.8322265625, + "learning_rate": 0.0001, + "loss": 6.9988, + "loss/crossentropy": 1.8556548431515694, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.17874295320361852, + "step": 2960 + }, + { + "epoch": 0.1485, + "grad_norm": 28.75, + "grad_norm_var": 3.5791666666666666, + "learning_rate": 0.0001, + "loss": 7.0313, + "loss/crossentropy": 1.688177353143692, + "loss/hidden": 3.28515625, + "loss/jsd": 0.0, + "loss/logits": 0.16950420523062348, + "step": 2970 + }, + { + "epoch": 0.149, + "grad_norm": 32.25, + "grad_norm_var": 2.246875, + "learning_rate": 0.0001, + "loss": 7.0247, + "loss/crossentropy": 2.071097436547279, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.20375496093183756, + "step": 2980 + }, + { + "epoch": 0.1495, + "grad_norm": 27.5, + "grad_norm_var": 2.6014973958333334, + "learning_rate": 0.0001, + "loss": 7.0495, + "loss/crossentropy": 1.852598314732313, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.16860631257295608, + "step": 2990 + }, + { + "epoch": 0.15, + "grad_norm": 27.0, + "grad_norm_var": 3.5122395833333333, + "learning_rate": 0.0001, + "loss": 6.7966, + "loss/crossentropy": 1.7948169738054276, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17319696098566056, + "step": 3000 + }, + { + "epoch": 0.1505, + "grad_norm": 28.875, + "grad_norm_var": 4.367122395833333, + "learning_rate": 0.0001, + "loss": 6.9423, + "loss/crossentropy": 1.6970888696610928, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.16700221002101898, + "step": 3010 + }, + { + "epoch": 0.151, + "grad_norm": 3674210304.0, + "grad_norm_var": 2.0173451962123377e+18, + "learning_rate": 0.0001, + "loss": 6.9283, + "loss/crossentropy": 1.713117253035307, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.1704209728166461, + "step": 3020 + }, + { + "epoch": 0.1515, + "grad_norm": 31.375, + "grad_norm_var": 1.710129338897767e+18, + "learning_rate": 0.0001, + "loss": 7.0097, + "loss/crossentropy": 1.9506682097911834, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.19250028654932977, + "step": 3030 + }, + { + "epoch": 0.152, + "grad_norm": 29.25, + "grad_norm_var": 2.1416666666666666, + "learning_rate": 0.0001, + "loss": 7.0202, + "loss/crossentropy": 1.831156849861145, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.18563526798970997, + "step": 3040 + }, + { + "epoch": 0.1525, + "grad_norm": 30.625, + "grad_norm_var": 2.6768229166666666, + "learning_rate": 0.0001, + "loss": 7.0529, + "loss/crossentropy": 1.8806451916694642, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.19239903232082725, + "step": 3050 + }, + { + "epoch": 0.153, + "grad_norm": 29.25, + "grad_norm_var": 2.8375, + "learning_rate": 0.0001, + "loss": 6.9243, + "loss/crossentropy": 1.8184577412903309, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.173899077065289, + "step": 3060 + }, + { + "epoch": 0.1535, + "grad_norm": 30.5, + "grad_norm_var": 1.6489583333333333, + "learning_rate": 0.0001, + "loss": 6.901, + "loss/crossentropy": 1.782475320994854, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.17683281004428864, + "step": 3070 + }, + { + "epoch": 0.154, + "grad_norm": 30.125, + "grad_norm_var": 2.4770833333333333, + "learning_rate": 0.0001, + "loss": 7.0536, + "loss/crossentropy": 1.7542385324835776, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.1734863522462547, + "step": 3080 + }, + { + "epoch": 0.1545, + "grad_norm": 31.375, + "grad_norm_var": 2.5077473958333334, + "learning_rate": 0.0001, + "loss": 6.7429, + "loss/crossentropy": 1.721788990870118, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1703654458746314, + "step": 3090 + }, + { + "epoch": 0.155, + "grad_norm": 40.25, + "grad_norm_var": 9.09140625, + "learning_rate": 0.0001, + "loss": 6.9729, + "loss/crossentropy": 1.6206283092498779, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.1712807172909379, + "step": 3100 + }, + { + "epoch": 0.1555, + "grad_norm": 32.0, + "grad_norm_var": 8.16640625, + "learning_rate": 0.0001, + "loss": 6.8604, + "loss/crossentropy": 1.7044736705720425, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.16109976628795267, + "step": 3110 + }, + { + "epoch": 0.156, + "grad_norm": 28.875, + "grad_norm_var": 61.90305989583333, + "learning_rate": 0.0001, + "loss": 6.8603, + "loss/crossentropy": 1.7201604932546615, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.1717333897948265, + "step": 3120 + }, + { + "epoch": 0.1565, + "grad_norm": 29.25, + "grad_norm_var": 3.2666015625, + "learning_rate": 0.0001, + "loss": 6.9316, + "loss/crossentropy": 1.611024511605501, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.17799030421301723, + "step": 3130 + }, + { + "epoch": 0.157, + "grad_norm": 29.25, + "grad_norm_var": 6.059830729166666, + "learning_rate": 0.0001, + "loss": 6.8749, + "loss/crossentropy": 1.542306227236986, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.17464940482750535, + "step": 3140 + }, + { + "epoch": 0.1575, + "grad_norm": 30.75, + "grad_norm_var": 4.820572916666666, + "learning_rate": 0.0001, + "loss": 6.8917, + "loss/crossentropy": 1.7465024203062058, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.18054623370990158, + "step": 3150 + }, + { + "epoch": 0.158, + "grad_norm": 31.5, + "grad_norm_var": 2.787239583333333, + "learning_rate": 0.0001, + "loss": 6.969, + "loss/crossentropy": 2.0858161732554437, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.18169568832963706, + "step": 3160 + }, + { + "epoch": 0.1585, + "grad_norm": 29.5, + "grad_norm_var": 4.023372395833333, + "learning_rate": 0.0001, + "loss": 6.8406, + "loss/crossentropy": 1.9426328182220458, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.17592350710183383, + "step": 3170 + }, + { + "epoch": 0.159, + "grad_norm": 29.125, + "grad_norm_var": 1.5832967238438093e+18, + "learning_rate": 0.0001, + "loss": 6.9453, + "loss/crossentropy": 1.8308497540652753, + "loss/hidden": 3.603125, + "loss/jsd": 0.0, + "loss/logits": 0.19216080345213413, + "step": 3180 + }, + { + "epoch": 0.1595, + "grad_norm": 29.5, + "grad_norm_var": 1.5832967237861376e+18, + "learning_rate": 0.0001, + "loss": 6.9032, + "loss/crossentropy": 1.705291760712862, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.1843032216653228, + "step": 3190 + }, + { + "epoch": 0.16, + "grad_norm": 40.25, + "grad_norm_var": 19.5056640625, + "learning_rate": 0.0001, + "loss": 7.0209, + "loss/crossentropy": 1.7651132240891456, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.1844408256933093, + "step": 3200 + }, + { + "epoch": 0.1605, + "grad_norm": 38.0, + "grad_norm_var": 6.217782109866559e+17, + "learning_rate": 0.0001, + "loss": 6.7736, + "loss/crossentropy": 1.8051001697778701, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.17440476845949887, + "step": 3210 + }, + { + "epoch": 0.161, + "grad_norm": 31.125, + "grad_norm_var": 6.428059895833333, + "learning_rate": 0.0001, + "loss": 6.9109, + "loss/crossentropy": 1.8851144686341286, + "loss/hidden": 3.2359375, + "loss/jsd": 0.0, + "loss/logits": 0.17897074315696954, + "step": 3220 + }, + { + "epoch": 0.1615, + "grad_norm": 30.125, + "grad_norm_var": 17.601041666666667, + "learning_rate": 0.0001, + "loss": 6.9799, + "loss/crossentropy": 1.6312229566276073, + "loss/hidden": 3.2609375, + "loss/jsd": 0.0, + "loss/logits": 0.16838383311405777, + "step": 3230 + }, + { + "epoch": 0.162, + "grad_norm": 31.5, + "grad_norm_var": 20.835872395833334, + "learning_rate": 0.0001, + "loss": 6.932, + "loss/crossentropy": 2.011029013991356, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.1832389457151294, + "step": 3240 + }, + { + "epoch": 0.1625, + "grad_norm": 28.375, + "grad_norm_var": 7.161458333333333, + "learning_rate": 0.0001, + "loss": 7.0405, + "loss/crossentropy": 1.8453179642558097, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.19180234288796782, + "step": 3250 + }, + { + "epoch": 0.163, + "grad_norm": 36.5, + "grad_norm_var": 10.517708333333333, + "learning_rate": 0.0001, + "loss": 6.823, + "loss/crossentropy": 1.9555616907775402, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.17895318511873484, + "step": 3260 + }, + { + "epoch": 0.1635, + "grad_norm": 29.125, + "grad_norm_var": 8.909830729166666, + "learning_rate": 0.0001, + "loss": 6.892, + "loss/crossentropy": 1.843096625804901, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.18395393253304065, + "step": 3270 + }, + { + "epoch": 0.164, + "grad_norm": 27.875, + "grad_norm_var": 7.260416666666667, + "learning_rate": 0.0001, + "loss": 6.9288, + "loss/crossentropy": 1.688144066929817, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.172001248691231, + "step": 3280 + }, + { + "epoch": 0.1645, + "grad_norm": 37.5, + "grad_norm_var": 12.014518229166667, + "learning_rate": 0.0001, + "loss": 6.9012, + "loss/crossentropy": 1.6900858603417874, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.1850940717384219, + "step": 3290 + }, + { + "epoch": 0.165, + "grad_norm": 30.5, + "grad_norm_var": 11.887955729166666, + "learning_rate": 0.0001, + "loss": 7.0327, + "loss/crossentropy": 1.8690055832266808, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.2061467545107007, + "step": 3300 + }, + { + "epoch": 0.1655, + "grad_norm": 33.25, + "grad_norm_var": 44.0900390625, + "learning_rate": 0.0001, + "loss": 6.9398, + "loss/crossentropy": 1.864616620540619, + "loss/hidden": 3.21875, + "loss/jsd": 0.0, + "loss/logits": 0.15337421298027037, + "step": 3310 + }, + { + "epoch": 0.166, + "grad_norm": 37.25, + "grad_norm_var": 45.87473958333333, + "learning_rate": 0.0001, + "loss": 6.9275, + "loss/crossentropy": 1.8501743324100972, + "loss/hidden": 3.266796875, + "loss/jsd": 0.0, + "loss/logits": 0.17113643269985915, + "step": 3320 + }, + { + "epoch": 0.1665, + "grad_norm": 29.625, + "grad_norm_var": 1.1349774579334994e+18, + "learning_rate": 0.0001, + "loss": 7.0081, + "loss/crossentropy": 1.779020744562149, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.1846176441758871, + "step": 3330 + }, + { + "epoch": 0.167, + "grad_norm": 35.75, + "grad_norm_var": 1.0819897936507308e+18, + "learning_rate": 0.0001, + "loss": 6.9779, + "loss/crossentropy": 1.7754384666681289, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.19158907625824212, + "step": 3340 + }, + { + "epoch": 0.1675, + "grad_norm": 34.25, + "grad_norm_var": 1.081989793663733e+18, + "learning_rate": 0.0001, + "loss": 7.0539, + "loss/crossentropy": 1.759375052154064, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.18603504877537488, + "step": 3350 + }, + { + "epoch": 0.168, + "grad_norm": 29.875, + "grad_norm_var": 5.620833333333334, + "learning_rate": 0.0001, + "loss": 6.8589, + "loss/crossentropy": 1.845319252461195, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18480119155719876, + "step": 3360 + }, + { + "epoch": 0.1685, + "grad_norm": 34.5, + "grad_norm_var": 19.439583333333335, + "learning_rate": 0.0001, + "loss": 6.9772, + "loss/crossentropy": 1.6411745361983776, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.15529545303434134, + "step": 3370 + }, + { + "epoch": 0.169, + "grad_norm": 28.5, + "grad_norm_var": 36.9103515625, + "learning_rate": 0.0001, + "loss": 6.8489, + "loss/crossentropy": 1.7360669024288655, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.17661824598908424, + "step": 3380 + }, + { + "epoch": 0.1695, + "grad_norm": 29.375, + "grad_norm_var": 35.46848958333333, + "learning_rate": 0.0001, + "loss": 6.7757, + "loss/crossentropy": 1.7902205429971219, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.17157120602205395, + "step": 3390 + }, + { + "epoch": 0.17, + "grad_norm": 28.875, + "grad_norm_var": 3.6395833333333334, + "learning_rate": 0.0001, + "loss": 6.8708, + "loss/crossentropy": 1.842449489980936, + "loss/hidden": 3.29921875, + "loss/jsd": 0.0, + "loss/logits": 0.16762932492420077, + "step": 3400 + }, + { + "epoch": 0.1705, + "grad_norm": 37.0, + "grad_norm_var": 6.513997395833333, + "learning_rate": 0.0001, + "loss": 6.8956, + "loss/crossentropy": 1.7051387749612332, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.16933946274220943, + "step": 3410 + }, + { + "epoch": 0.171, + "grad_norm": 30.75, + "grad_norm_var": 9.762239583333333, + "learning_rate": 0.0001, + "loss": 6.9733, + "loss/crossentropy": 1.7448437750339507, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.20119084492325784, + "step": 3420 + }, + { + "epoch": 0.1715, + "grad_norm": 78.0, + "grad_norm_var": 144.5125, + "learning_rate": 0.0001, + "loss": 7.0287, + "loss/crossentropy": 1.824779784679413, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.17832597270607947, + "step": 3430 + }, + { + "epoch": 0.172, + "grad_norm": 28.125, + "grad_norm_var": 145.68430989583334, + "learning_rate": 0.0001, + "loss": 6.7435, + "loss/crossentropy": 1.6466563902795315, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.16660706931725144, + "step": 3440 + }, + { + "epoch": 0.1725, + "grad_norm": 29.875, + "grad_norm_var": 8.811393229166667, + "learning_rate": 0.0001, + "loss": 6.934, + "loss/crossentropy": 1.8493422105908395, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.2010068495757878, + "step": 3450 + }, + { + "epoch": 0.173, + "grad_norm": 29.0, + "grad_norm_var": 6.62890625, + "learning_rate": 0.0001, + "loss": 6.8496, + "loss/crossentropy": 1.6380462288856505, + "loss/hidden": 3.26015625, + "loss/jsd": 0.0, + "loss/logits": 0.18449038956314326, + "step": 3460 + }, + { + "epoch": 0.1735, + "grad_norm": 32.75, + "grad_norm_var": 32.5875, + "learning_rate": 0.0001, + "loss": 6.9309, + "loss/crossentropy": 1.6813900470733643, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.18437479846179486, + "step": 3470 + }, + { + "epoch": 0.174, + "grad_norm": 31.625, + "grad_norm_var": 7.465419918819722e+17, + "learning_rate": 0.0001, + "loss": 7.1677, + "loss/crossentropy": 1.789808637648821, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17758243400603532, + "step": 3480 + }, + { + "epoch": 0.1745, + "grad_norm": 29.75, + "grad_norm_var": 58.84557291666667, + "learning_rate": 0.0001, + "loss": 6.8709, + "loss/crossentropy": 1.8069385841488839, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.19292932376265526, + "step": 3490 + }, + { + "epoch": 0.175, + "grad_norm": 28.25, + "grad_norm_var": 13.452018229166667, + "learning_rate": 0.0001, + "loss": 6.9084, + "loss/crossentropy": 1.6264689728617667, + "loss/hidden": 3.269140625, + "loss/jsd": 0.0, + "loss/logits": 0.16363061694428324, + "step": 3500 + }, + { + "epoch": 0.1755, + "grad_norm": 32.75, + "grad_norm_var": 1.459166261163747e+18, + "learning_rate": 0.0001, + "loss": 6.9837, + "loss/crossentropy": 1.7061957284808158, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.18923843959346415, + "step": 3510 + }, + { + "epoch": 0.176, + "grad_norm": 29.75, + "grad_norm_var": 1.459166260217512e+18, + "learning_rate": 0.0001, + "loss": 6.9459, + "loss/crossentropy": 1.6986562974750996, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.18663678420707583, + "step": 3520 + }, + { + "epoch": 0.1765, + "grad_norm": 31.0, + "grad_norm_var": 1.8478515625, + "learning_rate": 0.0001, + "loss": 6.9793, + "loss/crossentropy": 1.7609238177537918, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.19038589783012866, + "step": 3530 + }, + { + "epoch": 0.177, + "grad_norm": 31.25, + "grad_norm_var": 3.1770833333333335, + "learning_rate": 0.0001, + "loss": 6.9966, + "loss/crossentropy": 1.9084905117750168, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.1776235220953822, + "step": 3540 + }, + { + "epoch": 0.1775, + "grad_norm": 30.25, + "grad_norm_var": 2.051497395833333, + "learning_rate": 0.0001, + "loss": 6.9292, + "loss/crossentropy": 1.6809238217771054, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.19617705075070263, + "step": 3550 + }, + { + "epoch": 0.178, + "grad_norm": 33.75, + "grad_norm_var": 1.9955729166666667, + "learning_rate": 0.0001, + "loss": 6.972, + "loss/crossentropy": 1.6389021024107933, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.18174178060144186, + "step": 3560 + }, + { + "epoch": 0.1785, + "grad_norm": 36.5, + "grad_norm_var": 7.553059895833333, + "learning_rate": 0.0001, + "loss": 7.0848, + "loss/crossentropy": 1.7566796734929084, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.1923373954370618, + "step": 3570 + }, + { + "epoch": 0.179, + "grad_norm": 28.125, + "grad_norm_var": 5.9603515625, + "learning_rate": 0.0001, + "loss": 6.956, + "loss/crossentropy": 1.7154954925179482, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.17990761240944267, + "step": 3580 + }, + { + "epoch": 0.1795, + "grad_norm": 29.875, + "grad_norm_var": 4.399934895833334, + "learning_rate": 0.0001, + "loss": 7.0142, + "loss/crossentropy": 1.8327077120542525, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.1800425429828465, + "step": 3590 + }, + { + "epoch": 0.18, + "grad_norm": 28.875, + "grad_norm_var": 3.3247395833333333, + "learning_rate": 0.0001, + "loss": 6.9351, + "loss/crossentropy": 1.8267195105552674, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.19433746309950947, + "step": 3600 + }, + { + "epoch": 0.1805, + "grad_norm": 32.25, + "grad_norm_var": 24.673372395833333, + "learning_rate": 0.0001, + "loss": 6.8892, + "loss/crossentropy": 1.737992748618126, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.20098126940429212, + "step": 3610 + }, + { + "epoch": 0.181, + "grad_norm": 30.25, + "grad_norm_var": 33.395572916666666, + "learning_rate": 0.0001, + "loss": 7.0103, + "loss/crossentropy": 1.8915371976792812, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.1876732436940074, + "step": 3620 + }, + { + "epoch": 0.1815, + "grad_norm": 26.5, + "grad_norm_var": 38.799739583333334, + "learning_rate": 0.0001, + "loss": 6.981, + "loss/crossentropy": 1.7780213125050068, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.18925584964454173, + "step": 3630 + }, + { + "epoch": 0.182, + "grad_norm": 32.0, + "grad_norm_var": 1.0995116106143062e+18, + "learning_rate": 0.0001, + "loss": 7.039, + "loss/crossentropy": 1.7201772332191467, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.1817839713767171, + "step": 3640 + }, + { + "epoch": 0.1825, + "grad_norm": 29.125, + "grad_norm_var": 1.0995116110905345e+18, + "learning_rate": 0.0001, + "loss": 6.7937, + "loss/crossentropy": 1.824095284193754, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.16389566464349628, + "step": 3650 + }, + { + "epoch": 0.183, + "grad_norm": 28.625, + "grad_norm_var": 14.382291666666667, + "learning_rate": 0.0001, + "loss": 6.918, + "loss/crossentropy": 1.7039800986647606, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.17251853737980127, + "step": 3660 + }, + { + "epoch": 0.1835, + "grad_norm": 29.375, + "grad_norm_var": 0.82265625, + "learning_rate": 0.0001, + "loss": 6.8614, + "loss/crossentropy": 1.670785766094923, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.1893833376467228, + "step": 3670 + }, + { + "epoch": 0.184, + "grad_norm": 28.375, + "grad_norm_var": 8.297916666666667, + "learning_rate": 0.0001, + "loss": 6.8747, + "loss/crossentropy": 1.7371518418192864, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.17423492725938558, + "step": 3680 + }, + { + "epoch": 0.1845, + "grad_norm": 30.5, + "grad_norm_var": 11.51640625, + "learning_rate": 0.0001, + "loss": 7.1482, + "loss/crossentropy": 2.011937528848648, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.19120746664702892, + "step": 3690 + }, + { + "epoch": 0.185, + "grad_norm": 29.75, + "grad_norm_var": 114.8119140625, + "learning_rate": 0.0001, + "loss": 6.9318, + "loss/crossentropy": 1.9779032841324806, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.19792085662484168, + "step": 3700 + }, + { + "epoch": 0.1855, + "grad_norm": 29.5, + "grad_norm_var": 3.1666015625, + "learning_rate": 0.0001, + "loss": 6.9801, + "loss/crossentropy": 1.8196966513991355, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.17692473586648702, + "step": 3710 + }, + { + "epoch": 0.186, + "grad_norm": 31.625, + "grad_norm_var": 7.036874289840129e+17, + "learning_rate": 0.0001, + "loss": 6.9754, + "loss/crossentropy": 1.7481721505522727, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.1970391605515033, + "step": 3720 + }, + { + "epoch": 0.1865, + "grad_norm": 28.75, + "grad_norm_var": 7.036874289385746e+17, + "learning_rate": 0.0001, + "loss": 6.8514, + "loss/crossentropy": 1.609993650764227, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.19023605762049556, + "step": 3730 + }, + { + "epoch": 0.187, + "grad_norm": 32.75, + "grad_norm_var": 2.0978515625, + "learning_rate": 0.0001, + "loss": 7.0947, + "loss/crossentropy": 2.0539694875478745, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.20270574633032085, + "step": 3740 + }, + { + "epoch": 0.1875, + "grad_norm": 29.0, + "grad_norm_var": 1.9103515625, + "learning_rate": 0.0001, + "loss": 6.9278, + "loss/crossentropy": 1.8215243116021156, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.16428390927612782, + "step": 3750 + }, + { + "epoch": 0.188, + "grad_norm": 30.0, + "grad_norm_var": 2.0541666666666667, + "learning_rate": 0.0001, + "loss": 7.0503, + "loss/crossentropy": 1.8183038413524628, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.19697826653718947, + "step": 3760 + }, + { + "epoch": 0.1885, + "grad_norm": 32.75, + "grad_norm_var": 1.0989583333333333, + "learning_rate": 0.0001, + "loss": 7.1034, + "loss/crossentropy": 1.7321583658456803, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.19197138799354435, + "step": 3770 + }, + { + "epoch": 0.189, + "grad_norm": 28.25, + "grad_norm_var": 2.314322916666667, + "learning_rate": 0.0001, + "loss": 6.8113, + "loss/crossentropy": 1.8538015499711036, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.18398043606430292, + "step": 3780 + }, + { + "epoch": 0.1895, + "grad_norm": 29.625, + "grad_norm_var": 5.827018229166667, + "learning_rate": 0.0001, + "loss": 7.0817, + "loss/crossentropy": 1.8768661253154277, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.20710380356758834, + "step": 3790 + }, + { + "epoch": 0.19, + "grad_norm": 33.25, + "grad_norm_var": 4.195572916666666, + "learning_rate": 0.0001, + "loss": 7.0374, + "loss/crossentropy": 1.7977422267198562, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.1821097361855209, + "step": 3800 + }, + { + "epoch": 0.1905, + "grad_norm": 36.0, + "grad_norm_var": 5.763997395833333, + "learning_rate": 0.0001, + "loss": 7.0815, + "loss/crossentropy": 1.743187139183283, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18471882613375784, + "step": 3810 + }, + { + "epoch": 0.191, + "grad_norm": 30.25, + "grad_norm_var": 6.658333333333333, + "learning_rate": 0.0001, + "loss": 6.8304, + "loss/crossentropy": 1.7315315805375575, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.16926794005557894, + "step": 3820 + }, + { + "epoch": 0.1915, + "grad_norm": 29.875, + "grad_norm_var": 7.5380756628017e+17, + "learning_rate": 0.0001, + "loss": 7.091, + "loss/crossentropy": 1.8176006272435188, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.1842843361198902, + "step": 3830 + }, + { + "epoch": 0.192, + "grad_norm": 29.375, + "grad_norm_var": 7.563541666666667, + "learning_rate": 0.0001, + "loss": 6.9694, + "loss/crossentropy": 1.777810937166214, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.1739983822219074, + "step": 3840 + }, + { + "epoch": 0.1925, + "grad_norm": 28.375, + "grad_norm_var": 5.01875, + "learning_rate": 0.0001, + "loss": 6.8715, + "loss/crossentropy": 1.9018649347126484, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.18006115844473242, + "step": 3850 + }, + { + "epoch": 0.193, + "grad_norm": 30.0, + "grad_norm_var": 1.2455729166666667, + "learning_rate": 0.0001, + "loss": 6.881, + "loss/crossentropy": 1.8844246573746204, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.19470994817093015, + "step": 3860 + }, + { + "epoch": 0.1935, + "grad_norm": 28.75, + "grad_norm_var": 2.6809895833333335, + "learning_rate": 0.0001, + "loss": 6.9021, + "loss/crossentropy": 1.7199362799525262, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18913396131247281, + "step": 3870 + }, + { + "epoch": 0.194, + "grad_norm": 32.25, + "grad_norm_var": 2.5197265625, + "learning_rate": 0.0001, + "loss": 6.9324, + "loss/crossentropy": 1.755439005047083, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.1858789509162307, + "step": 3880 + }, + { + "epoch": 0.1945, + "grad_norm": 31.125, + "grad_norm_var": 3.384375, + "learning_rate": 0.0001, + "loss": 6.9477, + "loss/crossentropy": 1.7906312070786954, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.19127205722033977, + "step": 3890 + }, + { + "epoch": 0.195, + "grad_norm": 32.25, + "grad_norm_var": 2.1504557291666666, + "learning_rate": 0.0001, + "loss": 7.1196, + "loss/crossentropy": 1.9957764573395251, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.18469135276973248, + "step": 3900 + }, + { + "epoch": 0.1955, + "grad_norm": 31.875, + "grad_norm_var": 9.387366238726391e+17, + "learning_rate": 0.0001, + "loss": 6.9857, + "loss/crossentropy": 1.7901725992560387, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.18710751123726369, + "step": 3910 + }, + { + "epoch": 0.196, + "grad_norm": 30.625, + "grad_norm_var": 27.239322916666666, + "learning_rate": 0.0001, + "loss": 6.9815, + "loss/crossentropy": 1.7652528271079064, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.16305868746712804, + "step": 3920 + }, + { + "epoch": 0.1965, + "grad_norm": 29.75, + "grad_norm_var": 21.822916666666668, + "learning_rate": 0.0001, + "loss": 6.873, + "loss/crossentropy": 1.7368287414312362, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.19844600670039653, + "step": 3930 + }, + { + "epoch": 0.197, + "grad_norm": 29.625, + "grad_norm_var": 2.035416666666667, + "learning_rate": 0.0001, + "loss": 7.0347, + "loss/crossentropy": 1.9710937917232514, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.17006599269807338, + "step": 3940 + }, + { + "epoch": 0.1975, + "grad_norm": 27.875, + "grad_norm_var": 55.904622395833336, + "learning_rate": 0.0001, + "loss": 6.9427, + "loss/crossentropy": 1.6834511645138264, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18321871096268297, + "step": 3950 + }, + { + "epoch": 0.198, + "grad_norm": 28.25, + "grad_norm_var": 4.515559895833333, + "learning_rate": 0.0001, + "loss": 6.8793, + "loss/crossentropy": 1.8481898710131646, + "loss/hidden": 3.219921875, + "loss/jsd": 0.0, + "loss/logits": 0.16018803734332324, + "step": 3960 + }, + { + "epoch": 0.1985, + "grad_norm": 31.375, + "grad_norm_var": 4.105208333333334, + "learning_rate": 0.0001, + "loss": 6.9158, + "loss/crossentropy": 1.7632210277020932, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.17569016199558973, + "step": 3970 + }, + { + "epoch": 0.199, + "grad_norm": 31.5, + "grad_norm_var": 3.6056640625, + "learning_rate": 0.0001, + "loss": 7.0644, + "loss/crossentropy": 1.8658879399299622, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.2075220150873065, + "step": 3980 + }, + { + "epoch": 0.1995, + "grad_norm": 30.0, + "grad_norm_var": 3.3301432291666666, + "learning_rate": 0.0001, + "loss": 7.1057, + "loss/crossentropy": 1.915429985523224, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.19412722568958998, + "step": 3990 + }, + { + "epoch": 0.2, + "grad_norm": 29.75, + "grad_norm_var": 139.13170572916667, + "learning_rate": 0.0001, + "loss": 6.9355, + "loss/crossentropy": 1.8257215216755867, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.18498760322108865, + "step": 4000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1430040128035226e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}