{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 30.875, "learning_rate": 0.0001, "loss": 7.1506, "loss/crossentropy": 1.9750229328870774, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.18868114035576583, "step": 10 }, { "epoch": 0.001, "grad_norm": 30.75, "grad_norm_var": 2.09765625, "learning_rate": 0.0001, "loss": 7.266, "loss/crossentropy": 1.915299428999424, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.19173294119536877, "step": 20 }, { "epoch": 0.0015, "grad_norm": 31.625, "grad_norm_var": 35.572330729166666, "learning_rate": 0.0001, "loss": 7.1477, "loss/crossentropy": 1.845322072505951, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1835887383669615, "step": 30 }, { "epoch": 0.002, "grad_norm": 30.25, "grad_norm_var": 5.803580729166667, "learning_rate": 0.0001, "loss": 7.125, "loss/crossentropy": 1.8556978717446326, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.22780380193144084, "step": 40 }, { "epoch": 0.0025, "grad_norm": 39.5, "grad_norm_var": 6.737239583333333, "learning_rate": 0.0001, "loss": 7.2665, "loss/crossentropy": 2.051687541604042, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.21537381634116173, "step": 50 }, { "epoch": 0.003, "grad_norm": 36.5, "grad_norm_var": 11.058333333333334, "learning_rate": 0.0001, "loss": 7.2095, "loss/crossentropy": 1.9898784533143044, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19060547631233932, "step": 60 }, { "epoch": 0.0035, "grad_norm": 27.0, "grad_norm_var": 6.45390625, "learning_rate": 0.0001, "loss": 7.2606, "loss/crossentropy": 1.8448080085217953, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18068002099171282, "step": 70 }, { "epoch": 0.004, "grad_norm": 38.75, "grad_norm_var": 1.3401023445121106e+18, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.0318232350051404, "loss/hidden": 3.733984375, "loss/jsd": 0.0, "loss/logits": 0.337183965742588, "step": 80 }, { "epoch": 0.0045, "grad_norm": 35.25, "grad_norm_var": 1.3401023442516444e+18, "learning_rate": 0.0001, "loss": 7.1923, "loss/crossentropy": 1.7826939225196838, "loss/hidden": 3.587890625, "loss/jsd": 0.0, "loss/logits": 0.2118432404473424, "step": 90 }, { "epoch": 0.005, "grad_norm": 32.75, "grad_norm_var": 2.7309895833333333, "learning_rate": 0.0001, "loss": 7.2487, "loss/crossentropy": 1.88408655077219, "loss/hidden": 3.48671875, "loss/jsd": 0.0, "loss/logits": 0.1903762748464942, "step": 100 }, { "epoch": 0.0055, "grad_norm": 34.25, "grad_norm_var": 4.268489583333333, "learning_rate": 0.0001, "loss": 7.1643, "loss/crossentropy": 1.83259879052639, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.19554968569427728, "step": 110 }, { "epoch": 0.006, "grad_norm": 33.0, "grad_norm_var": 6.548958333333333, "learning_rate": 0.0001, "loss": 7.1535, "loss/crossentropy": 1.8173740945756436, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.17036083210259675, "step": 120 }, { "epoch": 0.0065, "grad_norm": 32.25, "grad_norm_var": 3.220572916666667, "learning_rate": 0.0001, "loss": 7.2113, "loss/crossentropy": 1.8991591855883598, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.20231554415076972, "step": 130 }, { "epoch": 0.007, "grad_norm": 120.0, "grad_norm_var": 494.52890625, "learning_rate": 0.0001, "loss": 7.1589, "loss/crossentropy": 1.9234379842877387, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.19592595770955085, "step": 140 }, { "epoch": 0.0075, "grad_norm": 30.375, "grad_norm_var": 496.27265625, "learning_rate": 0.0001, "loss": 7.1392, "loss/crossentropy": 1.7669467806816102, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.1691664818674326, "step": 150 }, { "epoch": 0.008, "grad_norm": 35.25, "grad_norm_var": 202.11354166666666, "learning_rate": 0.0001, "loss": 7.2551, "loss/crossentropy": 1.979496531933546, "loss/hidden": 3.51484375, "loss/jsd": 0.0, "loss/logits": 0.2397671105340123, "step": 160 }, { "epoch": 0.0085, "grad_norm": 29.75, "grad_norm_var": 41.73118489583333, "learning_rate": 0.0001, "loss": 7.0709, "loss/crossentropy": 1.6596970088779925, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.1801933947019279, "step": 170 }, { "epoch": 0.009, "grad_norm": 31.375, "grad_norm_var": 3.1510416666666665, "learning_rate": 0.0001, "loss": 7.1329, "loss/crossentropy": 1.8317318260669708, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.2027322521433234, "step": 180 }, { "epoch": 0.0095, "grad_norm": 31.25, "grad_norm_var": 1.034375, "learning_rate": 0.0001, "loss": 7.2704, "loss/crossentropy": 1.7871993221342564, "loss/hidden": 3.3296875, "loss/jsd": 0.0, "loss/logits": 0.17167234625667332, "step": 190 }, { "epoch": 0.01, "grad_norm": 29.375, "grad_norm_var": 1.4218098958333334, "learning_rate": 0.0001, "loss": 7.2074, "loss/crossentropy": 1.9208836354315282, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18774686167016624, "step": 200 }, { "epoch": 0.0105, "grad_norm": 29.75, "grad_norm_var": 5.548958333333333, "learning_rate": 0.0001, "loss": 7.2446, "loss/crossentropy": 1.8792764976620675, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.19080359637737274, "step": 210 }, { "epoch": 0.011, "grad_norm": 32.25, "grad_norm_var": 11.7619140625, "learning_rate": 0.0001, "loss": 7.2031, "loss/crossentropy": 1.926865078508854, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.19636590238660573, "step": 220 }, { "epoch": 0.0115, "grad_norm": 29.25, "grad_norm_var": 4.170247395833333, "learning_rate": 0.0001, "loss": 7.0576, "loss/crossentropy": 1.8266212515532971, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.18201391287148, "step": 230 }, { "epoch": 0.012, "grad_norm": 31.5, "grad_norm_var": 1.81015625, "learning_rate": 0.0001, "loss": 7.1432, "loss/crossentropy": 1.8445213377475738, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.18868241235613822, "step": 240 }, { "epoch": 0.0125, "grad_norm": 33.75, "grad_norm_var": 1.9625138843884541e+18, "learning_rate": 0.0001, "loss": 7.0655, "loss/crossentropy": 1.8239912115037442, "loss/hidden": 3.298828125, "loss/jsd": 0.0, "loss/logits": 0.17756748497486113, "step": 250 }, { "epoch": 0.013, "grad_norm": 31.875, "grad_norm_var": 1.56640625, "learning_rate": 0.0001, "loss": 7.1575, "loss/crossentropy": 1.7626003332436084, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18398213125765323, "step": 260 }, { "epoch": 0.0135, "grad_norm": 32.25, "grad_norm_var": 1.1129557291666667, "learning_rate": 0.0001, "loss": 7.1441, "loss/crossentropy": 1.7845010846853255, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.18147525601089, "step": 270 }, { "epoch": 0.014, "grad_norm": 30.25, "grad_norm_var": 2.9822265625, "learning_rate": 0.0001, "loss": 7.1286, "loss/crossentropy": 1.8358447797596456, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.17241306640207768, "step": 280 }, { "epoch": 0.0145, "grad_norm": 33.0, "grad_norm_var": 10.982291666666667, "learning_rate": 0.0001, "loss": 7.1123, "loss/crossentropy": 1.843992917239666, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.19916406068950893, "step": 290 }, { "epoch": 0.015, "grad_norm": 31.5, "grad_norm_var": 3.6176432291666667, "learning_rate": 0.0001, "loss": 6.9761, "loss/crossentropy": 1.710184234380722, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1904242929071188, "step": 300 }, { "epoch": 0.0155, "grad_norm": 30.625, "grad_norm_var": 1.4795028269701094e+18, "learning_rate": 0.0001, "loss": 7.1128, "loss/crossentropy": 1.783938717842102, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.19371993821114303, "step": 310 }, { "epoch": 0.016, "grad_norm": 27.375, "grad_norm_var": 9.558072916666667, "learning_rate": 0.0001, "loss": 7.1587, "loss/crossentropy": 1.799688772857189, "loss/hidden": 3.35078125, "loss/jsd": 0.0, "loss/logits": 0.18227657950483261, "step": 320 }, { "epoch": 0.0165, "grad_norm": 30.75, "grad_norm_var": 5.827235584899985e+17, "learning_rate": 0.0001, "loss": 7.1719, "loss/crossentropy": 1.8475290067493915, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.20651640743017197, "step": 330 }, { "epoch": 0.017, "grad_norm": 31.875, "grad_norm_var": 1.0473683707078467e+18, "learning_rate": 0.0001, "loss": 7.2024, "loss/crossentropy": 1.7877734430134296, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.17529369578696788, "step": 340 }, { "epoch": 0.0175, "grad_norm": 29.625, "grad_norm_var": 1.0473683706481477e+18, "learning_rate": 0.0001, "loss": 7.0127, "loss/crossentropy": 1.8476789727807046, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.18340907394886016, "step": 350 }, { "epoch": 0.018, "grad_norm": 31.5, "grad_norm_var": 4.201822916666667, "learning_rate": 0.0001, "loss": 7.0837, "loss/crossentropy": 1.9127952009439468, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.18515819907188416, "step": 360 }, { "epoch": 0.0185, "grad_norm": 33.25, "grad_norm_var": 3.4580729166666666, "learning_rate": 0.0001, "loss": 7.1494, "loss/crossentropy": 1.7446002267301082, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18972037807106973, "step": 370 }, { "epoch": 0.019, "grad_norm": 32.25, "grad_norm_var": 4.0712890625, "learning_rate": 0.0001, "loss": 6.9798, "loss/crossentropy": 1.6596938122063876, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.16941323587670923, "step": 380 }, { "epoch": 0.0195, "grad_norm": 31.5, "grad_norm_var": 1.8014398298089062e+18, "learning_rate": 0.0001, "loss": 7.1659, "loss/crossentropy": 1.8092470526695252, "loss/hidden": 3.278515625, "loss/jsd": 0.0, "loss/logits": 0.16989028006792067, "step": 390 }, { "epoch": 0.02, "grad_norm": 29.25, "grad_norm_var": 1.801439829596395e+18, "learning_rate": 0.0001, "loss": 7.1246, "loss/crossentropy": 1.803744176030159, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.19061805782839655, "step": 400 }, { "epoch": 0.0205, "grad_norm": 30.75, "grad_norm_var": 1.1895833333333334, "learning_rate": 0.0001, "loss": 6.8644, "loss/crossentropy": 1.711807917803526, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.17410435527563095, "step": 410 }, { "epoch": 0.021, "grad_norm": 28.75, "grad_norm_var": 1.0518229166666666, "learning_rate": 0.0001, "loss": 6.9733, "loss/crossentropy": 1.9412737876176833, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.1845760691910982, "step": 420 }, { "epoch": 0.0215, "grad_norm": 33.75, "grad_norm_var": 3.36875, "learning_rate": 0.0001, "loss": 7.0425, "loss/crossentropy": 1.6975354842841626, "loss/hidden": 3.30703125, "loss/jsd": 0.0, "loss/logits": 0.17426773644983767, "step": 430 }, { "epoch": 0.022, "grad_norm": 28.875, "grad_norm_var": 4.533072916666667, "learning_rate": 0.0001, "loss": 7.0644, "loss/crossentropy": 1.8431582309305667, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.19988675275817513, "step": 440 }, { "epoch": 0.0225, "grad_norm": 28.5, "grad_norm_var": 4.65, "learning_rate": 0.0001, "loss": 7.1091, "loss/crossentropy": 1.845390348136425, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.18364266194403173, "step": 450 }, { "epoch": 0.023, "grad_norm": 30.75, "grad_norm_var": 4.459375, "learning_rate": 0.0001, "loss": 7.0581, "loss/crossentropy": 1.7513741821050643, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.186102606728673, "step": 460 }, { "epoch": 0.0235, "grad_norm": 27.375, "grad_norm_var": 4.786458333333333, "learning_rate": 0.0001, "loss": 6.9763, "loss/crossentropy": 1.779174941033125, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.17763521214947103, "step": 470 }, { "epoch": 0.024, "grad_norm": 32.75, "grad_norm_var": 4.1, "learning_rate": 0.0001, "loss": 6.9638, "loss/crossentropy": 1.7178381219506265, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.17294319327920676, "step": 480 }, { "epoch": 0.0245, "grad_norm": 33.75, "grad_norm_var": 3.40625, "learning_rate": 0.0001, "loss": 6.9397, "loss/crossentropy": 1.8609587274491788, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.1921778223477304, "step": 490 }, { "epoch": 0.025, "grad_norm": 30.125, "grad_norm_var": 7.0625, "learning_rate": 0.0001, "loss": 7.1176, "loss/crossentropy": 1.8291713461279868, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.18730791788548232, "step": 500 }, { "epoch": 0.0255, "grad_norm": 30.375, "grad_norm_var": 6.520572916666667, "learning_rate": 0.0001, "loss": 7.097, "loss/crossentropy": 1.6978721603751183, "loss/hidden": 3.354296875, "loss/jsd": 0.0, "loss/logits": 0.16910959454253316, "step": 510 }, { "epoch": 0.026, "grad_norm": 31.5, "grad_norm_var": 5.492708333333334, "learning_rate": 0.0001, "loss": 7.1184, "loss/crossentropy": 1.7646001767367125, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.18606224549002945, "step": 520 }, { "epoch": 0.0265, "grad_norm": 33.25, "grad_norm_var": 3.2478515625, "learning_rate": 0.0001, "loss": 6.9289, "loss/crossentropy": 1.7254683546721936, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.19350956091657281, "step": 530 }, { "epoch": 0.027, "grad_norm": 28.5, "grad_norm_var": 3.2426432291666667, "learning_rate": 0.0001, "loss": 7.0072, "loss/crossentropy": 1.8291743457317353, "loss/hidden": 3.2703125, "loss/jsd": 0.0, "loss/logits": 0.17015220914036036, "step": 540 }, { "epoch": 0.0275, "grad_norm": 29.375, "grad_norm_var": 6.1978515625, "learning_rate": 0.0001, "loss": 7.0714, "loss/crossentropy": 1.7038650900125503, "loss/hidden": 3.35546875, "loss/jsd": 0.0, "loss/logits": 0.17573642041534185, "step": 550 }, { "epoch": 0.028, "grad_norm": 28.875, "grad_norm_var": 5.530143229166667, "learning_rate": 0.0001, "loss": 7.0376, "loss/crossentropy": 2.000048974901438, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.20670556500554085, "step": 560 }, { "epoch": 0.0285, "grad_norm": 30.125, "grad_norm_var": 37.509830729166666, "learning_rate": 0.0001, "loss": 7.0782, "loss/crossentropy": 1.7484589993953705, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.20398099757730961, "step": 570 }, { "epoch": 0.029, "grad_norm": 30.75, "grad_norm_var": 37.80930989583333, "learning_rate": 0.0001, "loss": 7.1094, "loss/crossentropy": 1.747946521639824, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.1723929913714528, "step": 580 }, { "epoch": 0.0295, "grad_norm": 31.5, "grad_norm_var": 1.9410807291666667, "learning_rate": 0.0001, "loss": 7.0532, "loss/crossentropy": 1.714518916606903, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.17450172062963248, "step": 590 }, { "epoch": 0.03, "grad_norm": 31.375, "grad_norm_var": 6.620995009586922e+17, "learning_rate": 0.0001, "loss": 7.2589, "loss/crossentropy": 1.7456246592104434, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18539317091926932, "step": 600 }, { "epoch": 0.0305, "grad_norm": 31.625, "grad_norm_var": 6.620995011655063e+17, "learning_rate": 0.0001, "loss": 7.1014, "loss/crossentropy": 1.6763587422668933, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.1931827544234693, "step": 610 }, { "epoch": 0.031, "grad_norm": 31.5, "grad_norm_var": 4.528125, "learning_rate": 0.0001, "loss": 7.115, "loss/crossentropy": 1.849663856625557, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.21164124589413405, "step": 620 }, { "epoch": 0.0315, "grad_norm": 31.25, "grad_norm_var": 3.027083333333333, "learning_rate": 0.0001, "loss": 7.1975, "loss/crossentropy": 1.765239630639553, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.18264974560588598, "step": 630 }, { "epoch": 0.032, "grad_norm": 29.25, "grad_norm_var": 3.428580729166667, "learning_rate": 0.0001, "loss": 7.1206, "loss/crossentropy": 1.8783695727586747, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.18768006665632128, "step": 640 }, { "epoch": 0.0325, "grad_norm": 30.75, "grad_norm_var": 3.9385416666666666, "learning_rate": 0.0001, "loss": 7.1671, "loss/crossentropy": 1.8120282679796218, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.21209220625460148, "step": 650 }, { "epoch": 0.033, "grad_norm": 31.75, "grad_norm_var": 1.77265625, "learning_rate": 0.0001, "loss": 7.0683, "loss/crossentropy": 1.6486516989767552, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.17768741883337497, "step": 660 }, { "epoch": 0.0335, "grad_norm": 28.5, "grad_norm_var": 1.9622395833333333, "learning_rate": 0.0001, "loss": 7.0341, "loss/crossentropy": 1.5188174404203891, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.17400255370885134, "step": 670 }, { "epoch": 0.034, "grad_norm": 29.25, "grad_norm_var": 3.075, "learning_rate": 0.0001, "loss": 7.0187, "loss/crossentropy": 1.7111039966344834, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.20188356712460517, "step": 680 }, { "epoch": 0.0345, "grad_norm": 30.5, "grad_norm_var": 1.5458333333333334, "learning_rate": 0.0001, "loss": 7.1392, "loss/crossentropy": 1.7463210627436638, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.18064118530601264, "step": 690 }, { "epoch": 0.035, "grad_norm": 30.0, "grad_norm_var": 1.6020833333333333, "learning_rate": 0.0001, "loss": 7.0488, "loss/crossentropy": 1.913002396374941, "loss/hidden": 3.248046875, "loss/jsd": 0.0, "loss/logits": 0.17795131383463741, "step": 700 }, { "epoch": 0.0355, "grad_norm": 3674210304.0, "grad_norm_var": 2.2729279965717071e+18, "learning_rate": 0.0001, "loss": 7.1836, "loss/crossentropy": 1.7232265777885913, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19430895978584886, "step": 710 }, { "epoch": 0.036, "grad_norm": 29.125, "grad_norm_var": 8.437388195823355e+17, "learning_rate": 0.0001, "loss": 6.9841, "loss/crossentropy": 1.8030119113624097, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.18302876157686115, "step": 720 }, { "epoch": 0.0365, "grad_norm": 30.375, "grad_norm_var": 2.85, "learning_rate": 0.0001, "loss": 6.9804, "loss/crossentropy": 1.9009442821145057, "loss/hidden": 3.266796875, "loss/jsd": 0.0, "loss/logits": 0.16866004383191466, "step": 730 }, { "epoch": 0.037, "grad_norm": 30.0, "grad_norm_var": 9.339322916666667, "learning_rate": 0.0001, "loss": 6.9876, "loss/crossentropy": 1.6418433368206025, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.191958365496248, "step": 740 }, { "epoch": 0.0375, "grad_norm": 30.875, "grad_norm_var": 7.639322916666667, "learning_rate": 0.0001, "loss": 7.0538, "loss/crossentropy": 1.853764034062624, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.17473467853851615, "step": 750 }, { "epoch": 0.038, "grad_norm": 31.125, "grad_norm_var": 1.0613932291666666, "learning_rate": 0.0001, "loss": 7.1458, "loss/crossentropy": 1.8514880582690239, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.19726306498050689, "step": 760 }, { "epoch": 0.0385, "grad_norm": 28.875, "grad_norm_var": 1.7997395833333334, "learning_rate": 0.0001, "loss": 7.0766, "loss/crossentropy": 1.8405121728777885, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.19442977402359246, "step": 770 }, { "epoch": 0.039, "grad_norm": 29.0, "grad_norm_var": 2.3802083333333335, "learning_rate": 0.0001, "loss": 7.0214, "loss/crossentropy": 1.9466332450509072, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.170109105668962, "step": 780 }, { "epoch": 0.0395, "grad_norm": 30.0, "grad_norm_var": 1.6124348958333334, "learning_rate": 0.0001, "loss": 7.1306, "loss/crossentropy": 1.8399325378239155, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.20626397961750625, "step": 790 }, { "epoch": 0.04, "grad_norm": 31.75, "grad_norm_var": 1.6559895833333333, "learning_rate": 0.0001, "loss": 7.1375, "loss/crossentropy": 1.9278223380446433, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.2024382423609495, "step": 800 }, { "epoch": 0.0405, "grad_norm": 27.5, "grad_norm_var": 16.089322916666667, "learning_rate": 0.0001, "loss": 7.0363, "loss/crossentropy": 1.859210267663002, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.18585832975804806, "step": 810 }, { "epoch": 0.041, "grad_norm": 28.25, "grad_norm_var": 38.77265625, "learning_rate": 0.0001, "loss": 6.9378, "loss/crossentropy": 1.8994540706276895, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.2018324811011553, "step": 820 }, { "epoch": 0.0415, "grad_norm": 32.0, "grad_norm_var": 38.8375, "learning_rate": 0.0001, "loss": 7.002, "loss/crossentropy": 1.8244094364345074, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.20930232629179954, "step": 830 }, { "epoch": 0.042, "grad_norm": 30.25, "grad_norm_var": 2.0634765625, "learning_rate": 0.0001, "loss": 6.9688, "loss/crossentropy": 1.8976417139172554, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.1871755332686007, "step": 840 }, { "epoch": 0.0425, "grad_norm": 50.75, "grad_norm_var": 28.351497395833334, "learning_rate": 0.0001, "loss": 6.992, "loss/crossentropy": 1.899886740744114, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.18904313631355762, "step": 850 }, { "epoch": 0.043, "grad_norm": 29.0, "grad_norm_var": 27.3056640625, "learning_rate": 0.0001, "loss": 7.0939, "loss/crossentropy": 1.8286892741918563, "loss/hidden": 3.362109375, "loss/jsd": 0.0, "loss/logits": 0.18909739144146442, "step": 860 }, { "epoch": 0.0435, "grad_norm": 28.375, "grad_norm_var": 1.3247395833333333, "learning_rate": 0.0001, "loss": 6.9381, "loss/crossentropy": 1.9782623961567878, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.1766037069261074, "step": 870 }, { "epoch": 0.044, "grad_norm": 29.0, "grad_norm_var": 2.1988932291666665, "learning_rate": 0.0001, "loss": 6.8414, "loss/crossentropy": 1.8968854755163194, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.20138736004009844, "step": 880 }, { "epoch": 0.0445, "grad_norm": 32.75, "grad_norm_var": 1.92890625, "learning_rate": 0.0001, "loss": 7.1271, "loss/crossentropy": 1.8630956932902336, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.21029497124254704, "step": 890 }, { "epoch": 0.045, "grad_norm": 29.25, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.0435, "loss/crossentropy": 1.8676601111888886, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19789310321211814, "step": 900 }, { "epoch": 0.0455, "grad_norm": 30.25, "grad_norm_var": 4.2265225949129395e+17, "learning_rate": 0.0001, "loss": 7.1233, "loss/crossentropy": 1.8434145867824554, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.18832013495266436, "step": 910 }, { "epoch": 0.046, "grad_norm": 29.375, "grad_norm_var": 4.2265225969445555e+17, "learning_rate": 0.0001, "loss": 6.8733, "loss/crossentropy": 1.81582195982337, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.18773540575057268, "step": 920 }, { "epoch": 0.0465, "grad_norm": 33.0, "grad_norm_var": 4.476822916666666, "learning_rate": 0.0001, "loss": 7.0752, "loss/crossentropy": 1.8667447365820409, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.18054497512057424, "step": 930 }, { "epoch": 0.047, "grad_norm": 28.625, "grad_norm_var": 6.144205729166667, "learning_rate": 0.0001, "loss": 7.0032, "loss/crossentropy": 1.8144822165369987, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.1632128401659429, "step": 940 }, { "epoch": 0.0475, "grad_norm": 30.375, "grad_norm_var": 5.01875, "learning_rate": 0.0001, "loss": 6.8626, "loss/crossentropy": 1.8152224607765675, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18933067489415406, "step": 950 }, { "epoch": 0.048, "grad_norm": 37.0, "grad_norm_var": 7.297916666666667, "learning_rate": 0.0001, "loss": 7.0437, "loss/crossentropy": 1.6399064034223556, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18825935963541268, "step": 960 }, { "epoch": 0.0485, "grad_norm": 29.75, "grad_norm_var": 4.739583333333333, "learning_rate": 0.0001, "loss": 7.0331, "loss/crossentropy": 1.6737658925354482, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.17548465421423315, "step": 970 }, { "epoch": 0.049, "grad_norm": 30.0, "grad_norm_var": 18.1541015625, "learning_rate": 0.0001, "loss": 6.9385, "loss/crossentropy": 1.8608146458864212, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.19196428768336773, "step": 980 }, { "epoch": 0.0495, "grad_norm": 33.75, "grad_norm_var": 4.003125, "learning_rate": 0.0001, "loss": 7.0686, "loss/crossentropy": 1.8301926247775555, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.18049606634303927, "step": 990 }, { "epoch": 0.05, "grad_norm": 31.75, "grad_norm_var": 1.0473683721235639e+18, "learning_rate": 0.0001, "loss": 7.0193, "loss/crossentropy": 1.7465273767709732, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.17173261381685734, "step": 1000 }, { "epoch": 0.0505, "grad_norm": 29.75, "grad_norm_var": 22.408268229166666, "learning_rate": 0.0001, "loss": 6.9709, "loss/crossentropy": 1.7683202728629113, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.210743809863925, "step": 1010 }, { "epoch": 0.051, "grad_norm": 28.625, "grad_norm_var": 2.371875, "learning_rate": 0.0001, "loss": 7.0597, "loss/crossentropy": 2.046058624982834, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.18963768277317286, "step": 1020 }, { "epoch": 0.0515, "grad_norm": 30.0, "grad_norm_var": 1.3184895833333334, "learning_rate": 0.0001, "loss": 7.0245, "loss/crossentropy": 1.745854178071022, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.17351055853068828, "step": 1030 }, { "epoch": 0.052, "grad_norm": 34.75, "grad_norm_var": 2.8108723958333335, "learning_rate": 0.0001, "loss": 6.9474, "loss/crossentropy": 1.8277953140437604, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.16915141120553018, "step": 1040 }, { "epoch": 0.0525, "grad_norm": 32.5, "grad_norm_var": 3.39765625, "learning_rate": 0.0001, "loss": 6.9366, "loss/crossentropy": 1.9404960587620734, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.18970660548657178, "step": 1050 }, { "epoch": 0.053, "grad_norm": 35.75, "grad_norm_var": 1.1892317588406927e+18, "learning_rate": 0.0001, "loss": 7.0954, "loss/crossentropy": 1.8612810902297496, "loss/hidden": 3.31171875, "loss/jsd": 0.0, "loss/logits": 0.17269262354820966, "step": 1060 }, { "epoch": 0.0535, "grad_norm": 29.875, "grad_norm_var": 1.1892317588497805e+18, "learning_rate": 0.0001, "loss": 7.0259, "loss/crossentropy": 1.743497943878174, "loss/hidden": 3.2609375, "loss/jsd": 0.0, "loss/logits": 0.1666251303628087, "step": 1070 }, { "epoch": 0.054, "grad_norm": 29.625, "grad_norm_var": 2.903059895833333, "learning_rate": 0.0001, "loss": 7.0055, "loss/crossentropy": 1.9657445706427097, "loss/hidden": 3.32734375, "loss/jsd": 0.0, "loss/logits": 0.18259168425574898, "step": 1080 }, { "epoch": 0.0545, "grad_norm": 30.25, "grad_norm_var": 51.16015625, "learning_rate": 0.0001, "loss": 7.1126, "loss/crossentropy": 2.0204195216298104, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.20481194872409106, "step": 1090 }, { "epoch": 0.055, "grad_norm": 29.625, "grad_norm_var": 2.90390625, "learning_rate": 0.0001, "loss": 7.0413, "loss/crossentropy": 1.589720468968153, "loss/hidden": 3.275, "loss/jsd": 0.0, "loss/logits": 0.18000307623296977, "step": 1100 }, { "epoch": 0.0555, "grad_norm": 29.375, "grad_norm_var": 2.2613932291666665, "learning_rate": 0.0001, "loss": 6.9722, "loss/crossentropy": 1.7191244810819626, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.18164545409381389, "step": 1110 }, { "epoch": 0.056, "grad_norm": 28.875, "grad_norm_var": 1.7520833333333334, "learning_rate": 0.0001, "loss": 6.9492, "loss/crossentropy": 1.8928776159882545, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18985262140631676, "step": 1120 }, { "epoch": 0.0565, "grad_norm": 30.0, "grad_norm_var": 1.2447265625, "learning_rate": 0.0001, "loss": 7.1367, "loss/crossentropy": 1.7702923499047756, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.17983693201094866, "step": 1130 }, { "epoch": 0.057, "grad_norm": 30.25, "grad_norm_var": 3.3080729166666667, "learning_rate": 0.0001, "loss": 7.0322, "loss/crossentropy": 1.8519952863454818, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.20197003111243247, "step": 1140 }, { "epoch": 0.0575, "grad_norm": 31.125, "grad_norm_var": 3.1962890625, "learning_rate": 0.0001, "loss": 7.0557, "loss/crossentropy": 1.8624355979263783, "loss/hidden": 3.526953125, "loss/jsd": 0.0, "loss/logits": 0.20604186709970235, "step": 1150 }, { "epoch": 0.058, "grad_norm": 28.5, "grad_norm_var": 22.8462890625, "learning_rate": 0.0001, "loss": 6.9562, "loss/crossentropy": 1.8102556586265564, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.20240887869149446, "step": 1160 }, { "epoch": 0.0585, "grad_norm": 32.25, "grad_norm_var": 23.950455729166666, "learning_rate": 0.0001, "loss": 6.9857, "loss/crossentropy": 1.8860370084643363, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.18206186592578888, "step": 1170 }, { "epoch": 0.059, "grad_norm": 30.125, "grad_norm_var": 1.6518229166666667, "learning_rate": 0.0001, "loss": 7.056, "loss/crossentropy": 1.9338740326464177, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.22607974465936423, "step": 1180 }, { "epoch": 0.0595, "grad_norm": 29.5, "grad_norm_var": 11.267708333333333, "learning_rate": 0.0001, "loss": 6.931, "loss/crossentropy": 1.9357615426182746, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.1852928228676319, "step": 1190 }, { "epoch": 0.06, "grad_norm": 39.25, "grad_norm_var": 1.2635411532464435e+18, "learning_rate": 0.0001, "loss": 7.0138, "loss/crossentropy": 1.669256182014942, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.1792891369201243, "step": 1200 }, { "epoch": 0.0605, "grad_norm": 30.125, "grad_norm_var": 2.2555340145024479e+18, "learning_rate": 0.0001, "loss": 7.003, "loss/crossentropy": 1.8537344850599766, "loss/hidden": 3.6078125, "loss/jsd": 0.0, "loss/logits": 0.18713028654456138, "step": 1210 }, { "epoch": 0.061, "grad_norm": 30.75, "grad_norm_var": 1.1529214881025404e+18, "learning_rate": 0.0001, "loss": 6.9982, "loss/crossentropy": 1.8868144243955611, "loss/hidden": 3.259375, "loss/jsd": 0.0, "loss/logits": 0.16826356202363968, "step": 1220 }, { "epoch": 0.0615, "grad_norm": 38.0, "grad_norm_var": 11.041080729166667, "learning_rate": 0.0001, "loss": 7.1145, "loss/crossentropy": 1.7373395457863807, "loss/hidden": 3.26328125, "loss/jsd": 0.0, "loss/logits": 0.16631986051797867, "step": 1230 }, { "epoch": 0.062, "grad_norm": 28.625, "grad_norm_var": 6.718489583333334, "learning_rate": 0.0001, "loss": 6.8881, "loss/crossentropy": 1.610298927500844, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.1909397032111883, "step": 1240 }, { "epoch": 0.0625, "grad_norm": 29.625, "grad_norm_var": 4.344205729166666, "learning_rate": 0.0001, "loss": 7.0797, "loss/crossentropy": 1.7361410059034825, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.18541559688746928, "step": 1250 }, { "epoch": 0.063, "grad_norm": 27.875, "grad_norm_var": 3.3889973958333335, "learning_rate": 0.0001, "loss": 6.9329, "loss/crossentropy": 1.7078735738992692, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.18024133574217557, "step": 1260 }, { "epoch": 0.0635, "grad_norm": 35.0, "grad_norm_var": 6.6166015625, "learning_rate": 0.0001, "loss": 6.9738, "loss/crossentropy": 1.8044774197041988, "loss/hidden": 3.276171875, "loss/jsd": 0.0, "loss/logits": 0.1794836211949587, "step": 1270 }, { "epoch": 0.064, "grad_norm": 29.375, "grad_norm_var": 13.601822916666666, "learning_rate": 0.0001, "loss": 6.9062, "loss/crossentropy": 1.8313415050506592, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18087668968364595, "step": 1280 }, { "epoch": 0.0645, "grad_norm": 29.75, "grad_norm_var": 3.6020182291666667, "learning_rate": 0.0001, "loss": 6.9407, "loss/crossentropy": 1.6438103877007961, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1820345466956496, "step": 1290 }, { "epoch": 0.065, "grad_norm": 30.25, "grad_norm_var": 1.2379557291666667, "learning_rate": 0.0001, "loss": 7.0302, "loss/crossentropy": 1.7621051207184792, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.19308385904878378, "step": 1300 }, { "epoch": 0.0655, "grad_norm": 29.375, "grad_norm_var": 3.46640625, "learning_rate": 0.0001, "loss": 7.1178, "loss/crossentropy": 1.871315811574459, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.19272034596651794, "step": 1310 }, { "epoch": 0.066, "grad_norm": 31.625, "grad_norm_var": 3.609375, "learning_rate": 0.0001, "loss": 7.0298, "loss/crossentropy": 1.8252998240292073, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.21978344805538655, "step": 1320 }, { "epoch": 0.0665, "grad_norm": 33.5, "grad_norm_var": 1.3990009840566536e+18, "learning_rate": 0.0001, "loss": 7.068, "loss/crossentropy": 1.639507355540991, "loss/hidden": 3.60703125, "loss/jsd": 0.0, "loss/logits": 0.18024437148123978, "step": 1330 }, { "epoch": 0.067, "grad_norm": 28.75, "grad_norm_var": 1.3990009842291443e+18, "learning_rate": 0.0001, "loss": 6.9556, "loss/crossentropy": 1.8158223167061807, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.18003626042045653, "step": 1340 }, { "epoch": 0.0675, "grad_norm": 29.75, "grad_norm_var": 3.21640625, "learning_rate": 0.0001, "loss": 6.7859, "loss/crossentropy": 1.6335266396403312, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.1845483684912324, "step": 1350 }, { "epoch": 0.068, "grad_norm": 30.75, "grad_norm_var": 2.5497395833333334, "learning_rate": 0.0001, "loss": 6.8607, "loss/crossentropy": 1.7433619983494282, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.17121702507138253, "step": 1360 }, { "epoch": 0.0685, "grad_norm": 28.0, "grad_norm_var": 4.353580729166667, "learning_rate": 0.0001, "loss": 7.1422, "loss/crossentropy": 1.8455571182072164, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.2054300512187183, "step": 1370 }, { "epoch": 0.069, "grad_norm": 29.625, "grad_norm_var": 3.388541666666667, "learning_rate": 0.0001, "loss": 7.0213, "loss/crossentropy": 1.8241696588695049, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.18985041994601487, "step": 1380 }, { "epoch": 0.0695, "grad_norm": 31.25, "grad_norm_var": 8.0431640625, "learning_rate": 0.0001, "loss": 7.0, "loss/crossentropy": 1.7940153643488883, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.18176266234368085, "step": 1390 }, { "epoch": 0.07, "grad_norm": 33.0, "grad_norm_var": 14.3041015625, "learning_rate": 0.0001, "loss": 6.898, "loss/crossentropy": 1.8607503667473793, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.17468307819217443, "step": 1400 }, { "epoch": 0.0705, "grad_norm": 28.125, "grad_norm_var": 13.432291666666666, "learning_rate": 0.0001, "loss": 7.031, "loss/crossentropy": 1.6316836021840573, "loss/hidden": 3.240234375, "loss/jsd": 0.0, "loss/logits": 0.15119749261066318, "step": 1410 }, { "epoch": 0.071, "grad_norm": 28.25, "grad_norm_var": 45.9634765625, "learning_rate": 0.0001, "loss": 7.1507, "loss/crossentropy": 1.8821631267666816, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.19027305245399476, "step": 1420 }, { "epoch": 0.0715, "grad_norm": 28.375, "grad_norm_var": 46.1884765625, "learning_rate": 0.0001, "loss": 7.063, "loss/crossentropy": 1.6992614693939685, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.2002884623594582, "step": 1430 }, { "epoch": 0.072, "grad_norm": 29.625, "grad_norm_var": 6.732291666666667, "learning_rate": 0.0001, "loss": 6.9439, "loss/crossentropy": 1.7733798533678056, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.17554995641112328, "step": 1440 }, { "epoch": 0.0725, "grad_norm": 30.625, "grad_norm_var": 24.97265625, "learning_rate": 0.0001, "loss": 7.0264, "loss/crossentropy": 1.8444553710520268, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.1976129287853837, "step": 1450 }, { "epoch": 0.073, "grad_norm": 41.5, "grad_norm_var": 18.2275390625, "learning_rate": 0.0001, "loss": 7.0056, "loss/crossentropy": 1.778428715467453, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.17879956895485521, "step": 1460 }, { "epoch": 0.0735, "grad_norm": 40.75, "grad_norm_var": 14.88515625, "learning_rate": 0.0001, "loss": 6.8647, "loss/crossentropy": 1.8260969623923302, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.18223165888339282, "step": 1470 }, { "epoch": 0.074, "grad_norm": 30.75, "grad_norm_var": 12.42265625, "learning_rate": 0.0001, "loss": 6.9814, "loss/crossentropy": 1.852180902659893, "loss/hidden": 3.188671875, "loss/jsd": 0.0, "loss/logits": 0.15915404492989182, "step": 1480 }, { "epoch": 0.0745, "grad_norm": 32.0, "grad_norm_var": 17.264518229166665, "learning_rate": 0.0001, "loss": 6.9467, "loss/crossentropy": 1.8016018435359, "loss/hidden": 3.30234375, "loss/jsd": 0.0, "loss/logits": 0.17374343778938056, "step": 1490 }, { "epoch": 0.075, "grad_norm": 27.75, "grad_norm_var": 16.795572916666668, "learning_rate": 0.0001, "loss": 6.9688, "loss/crossentropy": 1.7803546212613583, "loss/hidden": 3.230078125, "loss/jsd": 0.0, "loss/logits": 0.1623454326763749, "step": 1500 }, { "epoch": 0.0755, "grad_norm": 27.125, "grad_norm_var": 11.0072265625, "learning_rate": 0.0001, "loss": 6.9148, "loss/crossentropy": 1.7990518882870674, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.1776049867272377, "step": 1510 }, { "epoch": 0.076, "grad_norm": 28.875, "grad_norm_var": 9.0009765625, "learning_rate": 0.0001, "loss": 6.9834, "loss/crossentropy": 1.7659361466765404, "loss/hidden": 3.229296875, "loss/jsd": 0.0, "loss/logits": 0.17018448635935784, "step": 1520 }, { "epoch": 0.0765, "grad_norm": 28.75, "grad_norm_var": 5.566666666666666, "learning_rate": 0.0001, "loss": 6.9513, "loss/crossentropy": 1.948898734152317, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.20332392100244762, "step": 1530 }, { "epoch": 0.077, "grad_norm": 37.0, "grad_norm_var": 12.0337890625, "learning_rate": 0.0001, "loss": 6.9845, "loss/crossentropy": 1.897236557304859, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.1786106862127781, "step": 1540 }, { "epoch": 0.0775, "grad_norm": 30.75, "grad_norm_var": 10.74140625, "learning_rate": 0.0001, "loss": 6.9651, "loss/crossentropy": 1.668473443388939, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.18010491924360394, "step": 1550 }, { "epoch": 0.078, "grad_norm": 35.0, "grad_norm_var": 11.645768229166666, "learning_rate": 0.0001, "loss": 7.0873, "loss/crossentropy": 1.8844516187906266, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.19164156243205072, "step": 1560 }, { "epoch": 0.0785, "grad_norm": 36.5, "grad_norm_var": 9.326497395833334, "learning_rate": 0.0001, "loss": 6.9175, "loss/crossentropy": 1.7603260070085525, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.17738686297088863, "step": 1570 }, { "epoch": 0.079, "grad_norm": 28.25, "grad_norm_var": 11.4259765625, "learning_rate": 0.0001, "loss": 7.0352, "loss/crossentropy": 1.8728493131697177, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.19688725294545292, "step": 1580 }, { "epoch": 0.0795, "grad_norm": 29.25, "grad_norm_var": 8.5375, "learning_rate": 0.0001, "loss": 6.955, "loss/crossentropy": 1.8099886417388915, "loss/hidden": 3.29375, "loss/jsd": 0.0, "loss/logits": 0.18610341083258392, "step": 1590 }, { "epoch": 0.08, "grad_norm": 36.0, "grad_norm_var": 19.722330729166668, "learning_rate": 0.0001, "loss": 6.9313, "loss/crossentropy": 1.7017989411950112, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.17710780492052436, "step": 1600 }, { "epoch": 0.0805, "grad_norm": 32.25, "grad_norm_var": 21.603125, "learning_rate": 0.0001, "loss": 7.069, "loss/crossentropy": 1.7873531341552735, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.1812642457894981, "step": 1610 }, { "epoch": 0.081, "grad_norm": 28.875, "grad_norm_var": 3.2207682291666666, "learning_rate": 0.0001, "loss": 7.0405, "loss/crossentropy": 1.7903928458690643, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.19645511778071523, "step": 1620 }, { "epoch": 0.0815, "grad_norm": 29.75, "grad_norm_var": 2.874739583333333, "learning_rate": 0.0001, "loss": 7.0022, "loss/crossentropy": 1.6019535034894943, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.1628541074693203, "step": 1630 }, { "epoch": 0.082, "grad_norm": 31.375, "grad_norm_var": 6.37265625, "learning_rate": 0.0001, "loss": 6.7734, "loss/crossentropy": 1.7893570616841317, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.2000499103218317, "step": 1640 }, { "epoch": 0.0825, "grad_norm": 30.5, "grad_norm_var": 6.910416666666666, "learning_rate": 0.0001, "loss": 6.9578, "loss/crossentropy": 1.6443258710205555, "loss/hidden": 3.259765625, "loss/jsd": 0.0, "loss/logits": 0.16416865289211274, "step": 1650 }, { "epoch": 0.083, "grad_norm": 30.5, "grad_norm_var": 35.25182291666667, "learning_rate": 0.0001, "loss": 7.0861, "loss/crossentropy": 1.8358689159154893, "loss/hidden": 3.28359375, "loss/jsd": 0.0, "loss/logits": 0.1853348884731531, "step": 1660 }, { "epoch": 0.0835, "grad_norm": 30.0, "grad_norm_var": 15.6587890625, "learning_rate": 0.0001, "loss": 6.9008, "loss/crossentropy": 1.9014468491077423, "loss/hidden": 3.34140625, "loss/jsd": 0.0, "loss/logits": 0.19975380562245845, "step": 1670 }, { "epoch": 0.084, "grad_norm": 28.25, "grad_norm_var": 4.9666015625, "learning_rate": 0.0001, "loss": 7.0062, "loss/crossentropy": 1.7637556672096253, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.19306765552610158, "step": 1680 }, { "epoch": 0.0845, "grad_norm": 44.0, "grad_norm_var": 14.08125, "learning_rate": 0.0001, "loss": 6.9184, "loss/crossentropy": 1.7980270460247993, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.17251317510381342, "step": 1690 }, { "epoch": 0.085, "grad_norm": 30.0, "grad_norm_var": 16.656184895833334, "learning_rate": 0.0001, "loss": 6.8985, "loss/crossentropy": 1.9003560155630113, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.19372209012508393, "step": 1700 }, { "epoch": 0.0855, "grad_norm": 28.375, "grad_norm_var": 4.02265625, "learning_rate": 0.0001, "loss": 6.8638, "loss/crossentropy": 1.7488896727561951, "loss/hidden": 3.31484375, "loss/jsd": 0.0, "loss/logits": 0.16111841816455125, "step": 1710 }, { "epoch": 0.086, "grad_norm": 4362076160.0, "grad_norm_var": 1.1892317599584748e+18, "learning_rate": 0.0001, "loss": 7.061, "loss/crossentropy": 1.7708093903958797, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.19512660000473261, "step": 1720 }, { "epoch": 0.0865, "grad_norm": 30.375, "grad_norm_var": 1.1892317591996554e+18, "learning_rate": 0.0001, "loss": 6.8861, "loss/crossentropy": 1.6944726780056953, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.16455791369080544, "step": 1730 }, { "epoch": 0.087, "grad_norm": 29.375, "grad_norm_var": 3.2905598958333333, "learning_rate": 0.0001, "loss": 6.8425, "loss/crossentropy": 1.7352489478886128, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.16651339596137404, "step": 1740 }, { "epoch": 0.0875, "grad_norm": 29.875, "grad_norm_var": 1.81015625, "learning_rate": 0.0001, "loss": 6.886, "loss/crossentropy": 1.775932352244854, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18791395220905543, "step": 1750 }, { "epoch": 0.088, "grad_norm": 29.25, "grad_norm_var": 2.9848307291666667, "learning_rate": 0.0001, "loss": 6.8755, "loss/crossentropy": 1.700956543534994, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.17034402694553136, "step": 1760 }, { "epoch": 0.0885, "grad_norm": 30.375, "grad_norm_var": 2.0660807291666665, "learning_rate": 0.0001, "loss": 6.9996, "loss/crossentropy": 1.6696124613285064, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.17471891567111014, "step": 1770 }, { "epoch": 0.089, "grad_norm": 29.0, "grad_norm_var": 2.7729166666666667, "learning_rate": 0.0001, "loss": 6.8325, "loss/crossentropy": 1.6660587199032306, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.1662266943603754, "step": 1780 }, { "epoch": 0.0895, "grad_norm": 32.5, "grad_norm_var": 4.6900390625, "learning_rate": 0.0001, "loss": 6.947, "loss/crossentropy": 1.8900059774518012, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.18781680446118115, "step": 1790 }, { "epoch": 0.09, "grad_norm": 30.0, "grad_norm_var": 4.231705729166666, "learning_rate": 0.0001, "loss": 6.9437, "loss/crossentropy": 1.8869778975844382, "loss/hidden": 3.269921875, "loss/jsd": 0.0, "loss/logits": 0.17426692880690098, "step": 1800 }, { "epoch": 0.0905, "grad_norm": 33.0, "grad_norm_var": 2.8309895833333334, "learning_rate": 0.0001, "loss": 6.9652, "loss/crossentropy": 1.8232818126678467, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.16745625659823418, "step": 1810 }, { "epoch": 0.091, "grad_norm": 34.25, "grad_norm_var": 4.40390625, "learning_rate": 0.0001, "loss": 7.0219, "loss/crossentropy": 1.8258642494678496, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.19198300442658364, "step": 1820 }, { "epoch": 0.0915, "grad_norm": 32.25, "grad_norm_var": 8.268684895833333, "learning_rate": 0.0001, "loss": 6.8434, "loss/crossentropy": 1.7024194486439228, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.18930096151307224, "step": 1830 }, { "epoch": 0.092, "grad_norm": 31.625, "grad_norm_var": 6.74765625, "learning_rate": 0.0001, "loss": 6.9231, "loss/crossentropy": 1.7479817308485508, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.1829341644886881, "step": 1840 }, { "epoch": 0.0925, "grad_norm": 33.75, "grad_norm_var": 4.48515625, "learning_rate": 0.0001, "loss": 7.0635, "loss/crossentropy": 2.0127600729465485, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.18128359764814378, "step": 1850 }, { "epoch": 0.093, "grad_norm": 31.75, "grad_norm_var": 11.642708333333333, "learning_rate": 0.0001, "loss": 6.9505, "loss/crossentropy": 1.7567149683833123, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.1842447452247143, "step": 1860 }, { "epoch": 0.0935, "grad_norm": 34.5, "grad_norm_var": 1.5832967231255347e+18, "learning_rate": 0.0001, "loss": 7.1294, "loss/crossentropy": 1.8183075070381165, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.17170923966914414, "step": 1870 }, { "epoch": 0.094, "grad_norm": 36.0, "grad_norm_var": 14.670833333333333, "learning_rate": 0.0001, "loss": 6.7269, "loss/crossentropy": 1.6782560005784035, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.16191824562847615, "step": 1880 }, { "epoch": 0.0945, "grad_norm": 29.5, "grad_norm_var": 8.283984344848707e+17, "learning_rate": 0.0001, "loss": 6.9423, "loss/crossentropy": 1.7822233349084855, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.15704208929091693, "step": 1890 }, { "epoch": 0.095, "grad_norm": 27.25, "grad_norm_var": 12.049739583333333, "learning_rate": 0.0001, "loss": 6.8598, "loss/crossentropy": 1.8880347676575184, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.18590961638838052, "step": 1900 }, { "epoch": 0.0955, "grad_norm": 32.75, "grad_norm_var": 6.827351348981094e+17, "learning_rate": 0.0001, "loss": 7.0671, "loss/crossentropy": 1.6947499185800552, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.17880834415555, "step": 1910 }, { "epoch": 0.096, "grad_norm": 30.875, "grad_norm_var": 7.036874278235887e+17, "learning_rate": 0.0001, "loss": 6.8978, "loss/crossentropy": 1.6141892828047275, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.18202604549005627, "step": 1920 }, { "epoch": 0.0965, "grad_norm": 29.625, "grad_norm_var": 12.239583333333334, "learning_rate": 0.0001, "loss": 6.9659, "loss/crossentropy": 1.7211613908410073, "loss/hidden": 3.29453125, "loss/jsd": 0.0, "loss/logits": 0.19102244451642036, "step": 1930 }, { "epoch": 0.097, "grad_norm": 28.375, "grad_norm_var": 15.983268229166667, "learning_rate": 0.0001, "loss": 6.8912, "loss/crossentropy": 1.7675188466906548, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.19818378714844584, "step": 1940 }, { "epoch": 0.0975, "grad_norm": 32.75, "grad_norm_var": 9.306266259729068e+17, "learning_rate": 0.0001, "loss": 6.9645, "loss/crossentropy": 1.7558425486087799, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.1911760584451258, "step": 1950 }, { "epoch": 0.098, "grad_norm": 27.625, "grad_norm_var": 1.5205981735288307e+18, "learning_rate": 0.0001, "loss": 6.8635, "loss/crossentropy": 1.7457415886223315, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.1852768061682582, "step": 1960 }, { "epoch": 0.0985, "grad_norm": 32.75, "grad_norm_var": 14.7125, "learning_rate": 0.0001, "loss": 6.8508, "loss/crossentropy": 1.683419554680586, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.1731728465296328, "step": 1970 }, { "epoch": 0.099, "grad_norm": 30.625, "grad_norm_var": 1.0302687666727377e+18, "learning_rate": 0.0001, "loss": 7.0005, "loss/crossentropy": 1.727415306866169, "loss/hidden": 3.297265625, "loss/jsd": 0.0, "loss/logits": 0.18517111875116826, "step": 1980 }, { "epoch": 0.0995, "grad_norm": 32.25, "grad_norm_var": 22.14375, "learning_rate": 0.0001, "loss": 6.9138, "loss/crossentropy": 1.8120180189609527, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.20129222217947246, "step": 1990 }, { "epoch": 0.1, "grad_norm": 35.5, "grad_norm_var": 8.491080729166667, "learning_rate": 0.0001, "loss": 6.9525, "loss/crossentropy": 1.8299045406281949, "loss/hidden": 3.251171875, "loss/jsd": 0.0, "loss/logits": 0.17095453599467875, "step": 2000 }, { "epoch": 0.1005, "grad_norm": 32.75, "grad_norm_var": 8.586458333333333, "learning_rate": 0.0001, "loss": 6.7871, "loss/crossentropy": 1.7243870817124844, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.16602067481726407, "step": 2010 }, { "epoch": 0.101, "grad_norm": 29.625, "grad_norm_var": 9.378125, "learning_rate": 0.0001, "loss": 6.855, "loss/crossentropy": 1.6784847162663936, "loss/hidden": 3.225, "loss/jsd": 0.0, "loss/logits": 0.16919725136831404, "step": 2020 }, { "epoch": 0.1015, "grad_norm": 41.0, "grad_norm_var": 112.83170572916667, "learning_rate": 0.0001, "loss": 6.9616, "loss/crossentropy": 1.8477609053254127, "loss/hidden": 3.259375, "loss/jsd": 0.0, "loss/logits": 0.16309508439153433, "step": 2030 }, { "epoch": 0.102, "grad_norm": 30.0, "grad_norm_var": 111.6259765625, "learning_rate": 0.0001, "loss": 6.9517, "loss/crossentropy": 1.7308252967894078, "loss/hidden": 3.202734375, "loss/jsd": 0.0, "loss/logits": 0.1722710312344134, "step": 2040 }, { "epoch": 0.1025, "grad_norm": 30.625, "grad_norm_var": 4.073893229166667, "learning_rate": 0.0001, "loss": 6.9088, "loss/crossentropy": 1.7544417701661588, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19881883040070533, "step": 2050 }, { "epoch": 0.103, "grad_norm": 38.0, "grad_norm_var": 13.948958333333334, "learning_rate": 0.0001, "loss": 6.9474, "loss/crossentropy": 1.9995075345039368, "loss/hidden": 3.271875, "loss/jsd": 0.0, "loss/logits": 0.17399701047688723, "step": 2060 }, { "epoch": 0.1035, "grad_norm": 31.75, "grad_norm_var": 21.0744140625, "learning_rate": 0.0001, "loss": 6.8732, "loss/crossentropy": 1.8493791602551937, "loss/hidden": 3.2421875, "loss/jsd": 0.0, "loss/logits": 0.16063635479658842, "step": 2070 }, { "epoch": 0.104, "grad_norm": 32.25, "grad_norm_var": 17.897916666666667, "learning_rate": 0.0001, "loss": 6.9556, "loss/crossentropy": 1.737601400911808, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.18038861453533173, "step": 2080 }, { "epoch": 0.1045, "grad_norm": 32.25, "grad_norm_var": 3.38515625, "learning_rate": 0.0001, "loss": 6.979, "loss/crossentropy": 1.7256839543581008, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.19299248773604633, "step": 2090 }, { "epoch": 0.105, "grad_norm": 31.0, "grad_norm_var": 3.4853515625, "learning_rate": 0.0001, "loss": 6.8191, "loss/crossentropy": 1.7587849080562592, "loss/hidden": 3.25, "loss/jsd": 0.0, "loss/logits": 0.16214433256536723, "step": 2100 }, { "epoch": 0.1055, "grad_norm": 33.5, "grad_norm_var": 4.112239583333333, "learning_rate": 0.0001, "loss": 7.0774, "loss/crossentropy": 2.092029668390751, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.19293731367215514, "step": 2110 }, { "epoch": 0.106, "grad_norm": 30.375, "grad_norm_var": 5.1072265625, "learning_rate": 0.0001, "loss": 6.9724, "loss/crossentropy": 1.7829479269683361, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.19456620067358016, "step": 2120 }, { "epoch": 0.1065, "grad_norm": 29.375, "grad_norm_var": 20.4525390625, "learning_rate": 0.0001, "loss": 6.9908, "loss/crossentropy": 1.7853210166096687, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.18279874734580517, "step": 2130 }, { "epoch": 0.107, "grad_norm": 36.5, "grad_norm_var": 20.847330729166668, "learning_rate": 0.0001, "loss": 6.9787, "loss/crossentropy": 1.8366479635238648, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.17941316729411483, "step": 2140 }, { "epoch": 0.1075, "grad_norm": 29.25, "grad_norm_var": 5.1384765625, "learning_rate": 0.0001, "loss": 7.0703, "loss/crossentropy": 1.8491265431046486, "loss/hidden": 3.253515625, "loss/jsd": 0.0, "loss/logits": 0.1788581835106015, "step": 2150 }, { "epoch": 0.108, "grad_norm": 28.5, "grad_norm_var": 3.8082682291666665, "learning_rate": 0.0001, "loss": 7.0117, "loss/crossentropy": 1.8718080654740334, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.18362828250974417, "step": 2160 }, { "epoch": 0.1085, "grad_norm": 31.375, "grad_norm_var": 4.0541015625, "learning_rate": 0.0001, "loss": 6.9147, "loss/crossentropy": 1.823565386980772, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.17529825307428837, "step": 2170 }, { "epoch": 0.109, "grad_norm": 29.875, "grad_norm_var": 3.1510416666666665, "learning_rate": 0.0001, "loss": 6.8799, "loss/crossentropy": 1.8646746143698691, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.18420496406033635, "step": 2180 }, { "epoch": 0.1095, "grad_norm": 28.0, "grad_norm_var": 1.6061848958333333, "learning_rate": 0.0001, "loss": 6.9741, "loss/crossentropy": 1.8418309345841408, "loss/hidden": 3.289453125, "loss/jsd": 0.0, "loss/logits": 0.17159662526100875, "step": 2190 }, { "epoch": 0.11, "grad_norm": 31.375, "grad_norm_var": 2.4184895833333333, "learning_rate": 0.0001, "loss": 7.0042, "loss/crossentropy": 1.8776386469602584, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.187642621435225, "step": 2200 }, { "epoch": 0.1105, "grad_norm": 29.125, "grad_norm_var": 8.20781018083492e+17, "learning_rate": 0.0001, "loss": 6.9378, "loss/crossentropy": 1.655004223436117, "loss/hidden": 3.273046875, "loss/jsd": 0.0, "loss/logits": 0.1580679954495281, "step": 2210 }, { "epoch": 0.111, "grad_norm": 30.125, "grad_norm_var": 3.468489583333333, "learning_rate": 0.0001, "loss": 6.9831, "loss/crossentropy": 1.792271687835455, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.17089223572984338, "step": 2220 }, { "epoch": 0.1115, "grad_norm": 34.75, "grad_norm_var": 4.209375, "learning_rate": 0.0001, "loss": 6.8364, "loss/crossentropy": 1.734425350278616, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18262410946190358, "step": 2230 }, { "epoch": 0.112, "grad_norm": 27.0, "grad_norm_var": 4.629166666666666, "learning_rate": 0.0001, "loss": 6.8305, "loss/crossentropy": 1.772131036967039, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.1691578391008079, "step": 2240 }, { "epoch": 0.1125, "grad_norm": 29.125, "grad_norm_var": 6.303580729166667, "learning_rate": 0.0001, "loss": 6.9967, "loss/crossentropy": 1.9334307715296746, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.19251629430800676, "step": 2250 }, { "epoch": 0.113, "grad_norm": 36.5, "grad_norm_var": 6.4791015625, "learning_rate": 0.0001, "loss": 6.981, "loss/crossentropy": 1.887280984222889, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.21319616939872504, "step": 2260 }, { "epoch": 0.1135, "grad_norm": 28.75, "grad_norm_var": 4.7009765625, "learning_rate": 0.0001, "loss": 7.0286, "loss/crossentropy": 1.8285806521773338, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.18080311622470618, "step": 2270 }, { "epoch": 0.114, "grad_norm": 31.0, "grad_norm_var": 7.09375, "learning_rate": 0.0001, "loss": 6.863, "loss/crossentropy": 1.6441345304250716, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.18446694109588863, "step": 2280 }, { "epoch": 0.1145, "grad_norm": 30.125, "grad_norm_var": 9.029166666666667, "learning_rate": 0.0001, "loss": 6.8549, "loss/crossentropy": 1.5048397369682789, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.16425186553969978, "step": 2290 }, { "epoch": 0.115, "grad_norm": 28.625, "grad_norm_var": 3.9400390625, "learning_rate": 0.0001, "loss": 6.9448, "loss/crossentropy": 1.7213742382824422, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.17190376687794923, "step": 2300 }, { "epoch": 0.1155, "grad_norm": 29.125, "grad_norm_var": 51.71608072916667, "learning_rate": 0.0001, "loss": 7.0456, "loss/crossentropy": 1.8745042860507966, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.1922046933323145, "step": 2310 }, { "epoch": 0.116, "grad_norm": 31.625, "grad_norm_var": 5.101822916666666, "learning_rate": 0.0001, "loss": 7.0037, "loss/crossentropy": 1.835337746143341, "loss/hidden": 3.291015625, "loss/jsd": 0.0, "loss/logits": 0.172516768053174, "step": 2320 }, { "epoch": 0.1165, "grad_norm": 29.625, "grad_norm_var": 4.792122395833333, "learning_rate": 0.0001, "loss": 6.8605, "loss/crossentropy": 1.7886844381690026, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.17270518001168966, "step": 2330 }, { "epoch": 0.117, "grad_norm": 30.875, "grad_norm_var": 24.301041666666666, "learning_rate": 0.0001, "loss": 6.8857, "loss/crossentropy": 1.8270663298666476, "loss/hidden": 3.316796875, "loss/jsd": 0.0, "loss/logits": 0.17341279415413738, "step": 2340 }, { "epoch": 0.1175, "grad_norm": 28.25, "grad_norm_var": 23.795247395833332, "learning_rate": 0.0001, "loss": 6.981, "loss/crossentropy": 1.7389558240771295, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.20616078823804856, "step": 2350 }, { "epoch": 0.118, "grad_norm": 31.75, "grad_norm_var": 3.3712890625, "learning_rate": 0.0001, "loss": 6.9706, "loss/crossentropy": 1.7505015313625336, "loss/hidden": 3.2375, "loss/jsd": 0.0, "loss/logits": 0.1691287737339735, "step": 2360 }, { "epoch": 0.1185, "grad_norm": 29.875, "grad_norm_var": 3.7864583333333335, "learning_rate": 0.0001, "loss": 7.0493, "loss/crossentropy": 1.8290210530161857, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.17870840784162284, "step": 2370 }, { "epoch": 0.119, "grad_norm": 28.75, "grad_norm_var": 3.06640625, "learning_rate": 0.0001, "loss": 6.8945, "loss/crossentropy": 1.7312066838145257, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.16353450021706523, "step": 2380 }, { "epoch": 0.1195, "grad_norm": 38.75, "grad_norm_var": 8.985384797395922e+17, "learning_rate": 0.0001, "loss": 7.1346, "loss/crossentropy": 1.8643671602010727, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.18776546316221357, "step": 2390 }, { "epoch": 0.12, "grad_norm": 33.25, "grad_norm_var": 8.985384795065637e+17, "learning_rate": 0.0001, "loss": 7.0339, "loss/crossentropy": 1.7668686166405678, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.19443758334964514, "step": 2400 }, { "epoch": 0.1205, "grad_norm": 30.25, "grad_norm_var": 1.8852243670131978e+18, "learning_rate": 0.0001, "loss": 6.9626, "loss/crossentropy": 1.8465783804655076, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.1860800025984645, "step": 2410 }, { "epoch": 0.121, "grad_norm": 33.75, "grad_norm_var": 1.8852243674568678e+18, "learning_rate": 0.0001, "loss": 6.8455, "loss/crossentropy": 1.7212153851985932, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.18209199868142606, "step": 2420 }, { "epoch": 0.1215, "grad_norm": 28.0, "grad_norm_var": 3.81015625, "learning_rate": 0.0001, "loss": 6.9986, "loss/crossentropy": 1.898094529658556, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.194298998080194, "step": 2430 }, { "epoch": 0.122, "grad_norm": 27.5, "grad_norm_var": 3.332291666666667, "learning_rate": 0.0001, "loss": 6.8924, "loss/crossentropy": 1.7420293487608434, "loss/hidden": 3.2546875, "loss/jsd": 0.0, "loss/logits": 0.161607267241925, "step": 2440 }, { "epoch": 0.1225, "grad_norm": 33.0, "grad_norm_var": 2.7622395833333333, "learning_rate": 0.0001, "loss": 6.8686, "loss/crossentropy": 1.6050585605204106, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.17848586086183788, "step": 2450 }, { "epoch": 0.123, "grad_norm": 28.75, "grad_norm_var": 2.4400390625, "learning_rate": 0.0001, "loss": 6.9804, "loss/crossentropy": 1.9553805246949196, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19847506172955037, "step": 2460 }, { "epoch": 0.1235, "grad_norm": 29.875, "grad_norm_var": 2.0791015625, "learning_rate": 0.0001, "loss": 6.9913, "loss/crossentropy": 1.4568642482161522, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.15850053485482932, "step": 2470 }, { "epoch": 0.124, "grad_norm": 31.875, "grad_norm_var": 4.3775390625, "learning_rate": 0.0001, "loss": 6.9326, "loss/crossentropy": 1.6532236352562903, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.18165745195001365, "step": 2480 }, { "epoch": 0.1245, "grad_norm": 28.5, "grad_norm_var": 4.522330729166667, "learning_rate": 0.0001, "loss": 7.005, "loss/crossentropy": 1.6793559297919274, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.17017313856631516, "step": 2490 }, { "epoch": 0.125, "grad_norm": 30.75, "grad_norm_var": 4.3353515625, "learning_rate": 0.0001, "loss": 7.0956, "loss/crossentropy": 1.8292289204895495, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.18509325329214335, "step": 2500 }, { "epoch": 0.1255, "grad_norm": 30.875, "grad_norm_var": 3.78125, "learning_rate": 0.0001, "loss": 6.9137, "loss/crossentropy": 1.7439368188381195, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.19252277240157128, "step": 2510 }, { "epoch": 0.126, "grad_norm": 31.125, "grad_norm_var": 1.1349774577470627e+18, "learning_rate": 0.0001, "loss": 7.051, "loss/crossentropy": 2.0631623208522796, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.22505897115916013, "step": 2520 }, { "epoch": 0.1265, "grad_norm": 29.75, "grad_norm_var": 1.1349774575828206e+18, "learning_rate": 0.0001, "loss": 7.1194, "loss/crossentropy": 1.8867668241262436, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.20316522121429442, "step": 2530 }, { "epoch": 0.127, "grad_norm": 28.25, "grad_norm_var": 20.151822916666667, "learning_rate": 0.0001, "loss": 7.0832, "loss/crossentropy": 1.8491319343447685, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.19702840279787778, "step": 2540 }, { "epoch": 0.1275, "grad_norm": 29.0, "grad_norm_var": 11.6681640625, "learning_rate": 0.0001, "loss": 6.9728, "loss/crossentropy": 1.8162995487451554, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18736656550318004, "step": 2550 }, { "epoch": 0.128, "grad_norm": 34.5, "grad_norm_var": 13.088997395833333, "learning_rate": 0.0001, "loss": 7.1137, "loss/crossentropy": 2.031092081964016, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.21819815230555833, "step": 2560 }, { "epoch": 0.1285, "grad_norm": 31.125, "grad_norm_var": 1.7945788315993818e+17, "learning_rate": 0.0001, "loss": 7.0175, "loss/crossentropy": 1.731457906216383, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.18550403621047734, "step": 2570 }, { "epoch": 0.129, "grad_norm": 32.0, "grad_norm_var": 1.794578832870256e+17, "learning_rate": 0.0001, "loss": 6.8552, "loss/crossentropy": 1.8714622184634209, "loss/hidden": 3.2890625, "loss/jsd": 0.0, "loss/logits": 0.1802680429071188, "step": 2580 }, { "epoch": 0.1295, "grad_norm": 38.75, "grad_norm_var": 11.655143229166667, "learning_rate": 0.0001, "loss": 6.9513, "loss/crossentropy": 1.6536960810422898, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19141803495585918, "step": 2590 }, { "epoch": 0.13, "grad_norm": 30.25, "grad_norm_var": 10.824934895833334, "learning_rate": 0.0001, "loss": 7.0451, "loss/crossentropy": 1.7446824312210083, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.21996904909610748, "step": 2600 }, { "epoch": 0.1305, "grad_norm": 32.0, "grad_norm_var": 0.9895182291666667, "learning_rate": 0.0001, "loss": 6.9912, "loss/crossentropy": 1.8711062870919704, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.18015410769730805, "step": 2610 }, { "epoch": 0.131, "grad_norm": 29.0, "grad_norm_var": 1.9697265625, "learning_rate": 0.0001, "loss": 6.9974, "loss/crossentropy": 1.7273207187652588, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.17100013056769967, "step": 2620 }, { "epoch": 0.1315, "grad_norm": 33.0, "grad_norm_var": 0.9681640625, "learning_rate": 0.0001, "loss": 6.864, "loss/crossentropy": 1.772182758897543, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.18076814245432615, "step": 2630 }, { "epoch": 0.132, "grad_norm": 29.25, "grad_norm_var": 5.707291666666666, "learning_rate": 0.0001, "loss": 7.1259, "loss/crossentropy": 1.7641409367322922, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.18833348713815212, "step": 2640 }, { "epoch": 0.1325, "grad_norm": 40.75, "grad_norm_var": 10.91015625, "learning_rate": 0.0001, "loss": 7.0193, "loss/crossentropy": 1.859598373621702, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18878742419183253, "step": 2650 }, { "epoch": 0.133, "grad_norm": 31.375, "grad_norm_var": 18.1822265625, "learning_rate": 0.0001, "loss": 6.9707, "loss/crossentropy": 1.7797490507364273, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.20212376527488232, "step": 2660 }, { "epoch": 0.1335, "grad_norm": 29.875, "grad_norm_var": 11.162239583333333, "learning_rate": 0.0001, "loss": 7.0002, "loss/crossentropy": 1.7839721478521824, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.173302289377898, "step": 2670 }, { "epoch": 0.134, "grad_norm": 27.125, "grad_norm_var": 4.2009765625, "learning_rate": 0.0001, "loss": 6.9156, "loss/crossentropy": 1.7781757101416589, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.19893121821805834, "step": 2680 }, { "epoch": 0.1345, "grad_norm": 30.75, "grad_norm_var": 36.837239583333336, "learning_rate": 0.0001, "loss": 7.0997, "loss/crossentropy": 1.8467799574136734, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.20002066995948553, "step": 2690 }, { "epoch": 0.135, "grad_norm": 28.5, "grad_norm_var": 37.431705729166666, "learning_rate": 0.0001, "loss": 6.9236, "loss/crossentropy": 1.6248198747634888, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.1642201030626893, "step": 2700 }, { "epoch": 0.1355, "grad_norm": 34.0, "grad_norm_var": 4.030989583333334, "learning_rate": 0.0001, "loss": 6.9361, "loss/crossentropy": 1.7102701038122177, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.16836816985160113, "step": 2710 }, { "epoch": 0.136, "grad_norm": 26.0, "grad_norm_var": 1.0907331108694131e+18, "learning_rate": 0.0001, "loss": 6.9167, "loss/crossentropy": 1.8059025250375271, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.16677290350198745, "step": 2720 }, { "epoch": 0.1365, "grad_norm": 29.5, "grad_norm_var": 6.2728515625, "learning_rate": 0.0001, "loss": 6.8796, "loss/crossentropy": 1.776158544421196, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.1929216692224145, "step": 2730 }, { "epoch": 0.137, "grad_norm": 28.75, "grad_norm_var": 7.5431640625, "learning_rate": 0.0001, "loss": 6.8288, "loss/crossentropy": 1.8780412912368774, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.18471294036135077, "step": 2740 }, { "epoch": 0.1375, "grad_norm": 33.0, "grad_norm_var": 15.1947265625, "learning_rate": 0.0001, "loss": 6.9741, "loss/crossentropy": 1.7919296585023403, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.19309423677623272, "step": 2750 }, { "epoch": 0.138, "grad_norm": 31.375, "grad_norm_var": 16.696809895833333, "learning_rate": 0.0001, "loss": 6.9971, "loss/crossentropy": 1.8414636544883252, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.1888686059974134, "step": 2760 }, { "epoch": 0.1385, "grad_norm": 28.5, "grad_norm_var": 7.121875, "learning_rate": 0.0001, "loss": 6.9869, "loss/crossentropy": 1.8438507467508316, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.19185615349560975, "step": 2770 }, { "epoch": 0.139, "grad_norm": 33.25, "grad_norm_var": 10.338541666666666, "learning_rate": 0.0001, "loss": 6.9528, "loss/crossentropy": 1.8890479058027267, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.19027914050966502, "step": 2780 }, { "epoch": 0.1395, "grad_norm": 33.5, "grad_norm_var": 12.343684895833333, "learning_rate": 0.0001, "loss": 6.9585, "loss/crossentropy": 1.6378353632986546, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.18243511486798525, "step": 2790 }, { "epoch": 0.14, "grad_norm": 33.0, "grad_norm_var": 7.9384765625, "learning_rate": 0.0001, "loss": 6.885, "loss/crossentropy": 1.6422518469393252, "loss/hidden": 3.26875, "loss/jsd": 0.0, "loss/logits": 0.15738149764947593, "step": 2800 }, { "epoch": 0.1405, "grad_norm": 35.0, "grad_norm_var": 7.362239583333333, "learning_rate": 0.0001, "loss": 6.9251, "loss/crossentropy": 1.818039534240961, "loss/hidden": 3.222265625, "loss/jsd": 0.0, "loss/logits": 0.17553653065115213, "step": 2810 }, { "epoch": 0.141, "grad_norm": 28.875, "grad_norm_var": 8.7134765625, "learning_rate": 0.0001, "loss": 6.9659, "loss/crossentropy": 1.8913455709815026, "loss/hidden": 3.325390625, "loss/jsd": 0.0, "loss/logits": 0.18545334562659263, "step": 2820 }, { "epoch": 0.1415, "grad_norm": 27.5, "grad_norm_var": 7.718684895833333, "learning_rate": 0.0001, "loss": 6.8653, "loss/crossentropy": 1.9232856243848802, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.19609272833913566, "step": 2830 }, { "epoch": 0.142, "grad_norm": 31.75, "grad_norm_var": 18.7166015625, "learning_rate": 0.0001, "loss": 6.9271, "loss/crossentropy": 1.7873032443225383, "loss/hidden": 3.2796875, "loss/jsd": 0.0, "loss/logits": 0.16436451440677047, "step": 2840 }, { "epoch": 0.1425, "grad_norm": 31.375, "grad_norm_var": 4.561393229166667, "learning_rate": 0.0001, "loss": 6.859, "loss/crossentropy": 1.764283910393715, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.18506875950843096, "step": 2850 }, { "epoch": 0.143, "grad_norm": 30.125, "grad_norm_var": 5.339322916666666, "learning_rate": 0.0001, "loss": 7.1328, "loss/crossentropy": 1.746024763584137, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.19091468937695028, "step": 2860 }, { "epoch": 0.1435, "grad_norm": 31.125, "grad_norm_var": 7.5875, "learning_rate": 0.0001, "loss": 6.8931, "loss/crossentropy": 1.8621096529066563, "loss/hidden": 3.2265625, "loss/jsd": 0.0, "loss/logits": 0.167528663482517, "step": 2870 }, { "epoch": 0.144, "grad_norm": 32.25, "grad_norm_var": 7.123372395833333, "learning_rate": 0.0001, "loss": 7.0369, "loss/crossentropy": 1.9750339597463609, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.20070471633225678, "step": 2880 }, { "epoch": 0.1445, "grad_norm": 33.5, "grad_norm_var": 14.7275390625, "learning_rate": 0.0001, "loss": 6.8862, "loss/crossentropy": 1.74088372066617, "loss/hidden": 3.320703125, "loss/jsd": 0.0, "loss/logits": 0.17231013607233764, "step": 2890 }, { "epoch": 0.145, "grad_norm": 28.375, "grad_norm_var": 19.409830729166668, "learning_rate": 0.0001, "loss": 7.035, "loss/crossentropy": 1.7799094915390015, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.18373552113771438, "step": 2900 }, { "epoch": 0.1455, "grad_norm": 31.375, "grad_norm_var": 5.517708333333333, "learning_rate": 0.0001, "loss": 6.9546, "loss/crossentropy": 1.7803256064653397, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.1977113801985979, "step": 2910 }, { "epoch": 0.146, "grad_norm": 28.125, "grad_norm_var": 5.627018229166667, "learning_rate": 0.0001, "loss": 6.9317, "loss/crossentropy": 1.8050019271671771, "loss/hidden": 3.257421875, "loss/jsd": 0.0, "loss/logits": 0.16629343312233685, "step": 2920 }, { "epoch": 0.1465, "grad_norm": 34.5, "grad_norm_var": 7.16640625, "learning_rate": 0.0001, "loss": 6.9453, "loss/crossentropy": 1.8659825779497623, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.1742606306448579, "step": 2930 }, { "epoch": 0.147, "grad_norm": 35.5, "grad_norm_var": 8.9306640625, "learning_rate": 0.0001, "loss": 7.0142, "loss/crossentropy": 1.913654712587595, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.20132352095097303, "step": 2940 }, { "epoch": 0.1475, "grad_norm": 30.25, "grad_norm_var": 6.614518229166666, "learning_rate": 0.0001, "loss": 6.9147, "loss/crossentropy": 1.645759216696024, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.16875347392633558, "step": 2950 }, { "epoch": 0.148, "grad_norm": 29.0, "grad_norm_var": 6.8322265625, "learning_rate": 0.0001, "loss": 6.9988, "loss/crossentropy": 1.8556548431515694, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.17874295320361852, "step": 2960 }, { "epoch": 0.1485, "grad_norm": 28.75, "grad_norm_var": 3.5791666666666666, "learning_rate": 0.0001, "loss": 7.0313, "loss/crossentropy": 1.688177353143692, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.16950420523062348, "step": 2970 }, { "epoch": 0.149, "grad_norm": 32.25, "grad_norm_var": 2.246875, "learning_rate": 0.0001, "loss": 7.0247, "loss/crossentropy": 2.071097436547279, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.20375496093183756, "step": 2980 }, { "epoch": 0.1495, "grad_norm": 27.5, "grad_norm_var": 2.6014973958333334, "learning_rate": 0.0001, "loss": 7.0495, "loss/crossentropy": 1.852598314732313, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.16860631257295608, "step": 2990 }, { "epoch": 0.15, "grad_norm": 27.0, "grad_norm_var": 3.5122395833333333, "learning_rate": 0.0001, "loss": 6.7966, "loss/crossentropy": 1.7948169738054276, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.17319696098566056, "step": 3000 }, { "epoch": 0.1505, "grad_norm": 28.875, "grad_norm_var": 4.367122395833333, "learning_rate": 0.0001, "loss": 6.9423, "loss/crossentropy": 1.6970888696610928, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.16700221002101898, "step": 3010 }, { "epoch": 0.151, "grad_norm": 3674210304.0, "grad_norm_var": 2.0173451962123377e+18, "learning_rate": 0.0001, "loss": 6.9283, "loss/crossentropy": 1.713117253035307, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1704209728166461, "step": 3020 }, { "epoch": 0.1515, "grad_norm": 31.375, "grad_norm_var": 1.710129338897767e+18, "learning_rate": 0.0001, "loss": 7.0097, "loss/crossentropy": 1.9506682097911834, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.19250028654932977, "step": 3030 }, { "epoch": 0.152, "grad_norm": 29.25, "grad_norm_var": 2.1416666666666666, "learning_rate": 0.0001, "loss": 7.0202, "loss/crossentropy": 1.831156849861145, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.18563526798970997, "step": 3040 }, { "epoch": 0.1525, "grad_norm": 30.625, "grad_norm_var": 2.6768229166666666, "learning_rate": 0.0001, "loss": 7.0529, "loss/crossentropy": 1.8806451916694642, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19239903232082725, "step": 3050 }, { "epoch": 0.153, "grad_norm": 29.25, "grad_norm_var": 2.8375, "learning_rate": 0.0001, "loss": 6.9243, "loss/crossentropy": 1.8184577412903309, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.173899077065289, "step": 3060 }, { "epoch": 0.1535, "grad_norm": 30.5, "grad_norm_var": 1.6489583333333333, "learning_rate": 0.0001, "loss": 6.901, "loss/crossentropy": 1.782475320994854, "loss/hidden": 3.303125, "loss/jsd": 0.0, "loss/logits": 0.17683281004428864, "step": 3070 }, { "epoch": 0.154, "grad_norm": 30.125, "grad_norm_var": 2.4770833333333333, "learning_rate": 0.0001, "loss": 7.0536, "loss/crossentropy": 1.7542385324835776, "loss/hidden": 3.31015625, "loss/jsd": 0.0, "loss/logits": 0.1734863522462547, "step": 3080 }, { "epoch": 0.1545, "grad_norm": 31.375, "grad_norm_var": 2.5077473958333334, "learning_rate": 0.0001, "loss": 6.7429, "loss/crossentropy": 1.721788990870118, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.1703654458746314, "step": 3090 }, { "epoch": 0.155, "grad_norm": 40.25, "grad_norm_var": 9.09140625, "learning_rate": 0.0001, "loss": 6.9729, "loss/crossentropy": 1.6206283092498779, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.1712807172909379, "step": 3100 }, { "epoch": 0.1555, "grad_norm": 32.0, "grad_norm_var": 8.16640625, "learning_rate": 0.0001, "loss": 6.8604, "loss/crossentropy": 1.7044736705720425, "loss/hidden": 3.247265625, "loss/jsd": 0.0, "loss/logits": 0.16109976628795267, "step": 3110 }, { "epoch": 0.156, "grad_norm": 28.875, "grad_norm_var": 61.90305989583333, "learning_rate": 0.0001, "loss": 6.8603, "loss/crossentropy": 1.7201604932546615, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.1717333897948265, "step": 3120 }, { "epoch": 0.1565, "grad_norm": 29.25, "grad_norm_var": 3.2666015625, "learning_rate": 0.0001, "loss": 6.9316, "loss/crossentropy": 1.611024511605501, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.17799030421301723, "step": 3130 }, { "epoch": 0.157, "grad_norm": 29.25, "grad_norm_var": 6.059830729166666, "learning_rate": 0.0001, "loss": 6.8749, "loss/crossentropy": 1.542306227236986, "loss/hidden": 3.2828125, "loss/jsd": 0.0, "loss/logits": 0.17464940482750535, "step": 3140 }, { "epoch": 0.1575, "grad_norm": 30.75, "grad_norm_var": 4.820572916666666, "learning_rate": 0.0001, "loss": 6.8917, "loss/crossentropy": 1.7465024203062058, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.18054623370990158, "step": 3150 }, { "epoch": 0.158, "grad_norm": 31.5, "grad_norm_var": 2.787239583333333, "learning_rate": 0.0001, "loss": 6.969, "loss/crossentropy": 2.0858161732554437, "loss/hidden": 3.28125, "loss/jsd": 0.0, "loss/logits": 0.18169568832963706, "step": 3160 }, { "epoch": 0.1585, "grad_norm": 29.5, "grad_norm_var": 4.023372395833333, "learning_rate": 0.0001, "loss": 6.8406, "loss/crossentropy": 1.9426328182220458, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.17592350710183383, "step": 3170 }, { "epoch": 0.159, "grad_norm": 29.125, "grad_norm_var": 1.5832967238438093e+18, "learning_rate": 0.0001, "loss": 6.9453, "loss/crossentropy": 1.8308497540652753, "loss/hidden": 3.603125, "loss/jsd": 0.0, "loss/logits": 0.19216080345213413, "step": 3180 }, { "epoch": 0.1595, "grad_norm": 29.5, "grad_norm_var": 1.5832967237861376e+18, "learning_rate": 0.0001, "loss": 6.9032, "loss/crossentropy": 1.705291760712862, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.1843032216653228, "step": 3190 }, { "epoch": 0.16, "grad_norm": 40.25, "grad_norm_var": 19.5056640625, "learning_rate": 0.0001, "loss": 7.0209, "loss/crossentropy": 1.7651132240891456, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1844408256933093, "step": 3200 }, { "epoch": 0.1605, "grad_norm": 38.0, "grad_norm_var": 6.217782109866559e+17, "learning_rate": 0.0001, "loss": 6.7736, "loss/crossentropy": 1.8051001697778701, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.17440476845949887, "step": 3210 }, { "epoch": 0.161, "grad_norm": 31.125, "grad_norm_var": 6.428059895833333, "learning_rate": 0.0001, "loss": 6.9109, "loss/crossentropy": 1.8851144686341286, "loss/hidden": 3.2359375, "loss/jsd": 0.0, "loss/logits": 0.17897074315696954, "step": 3220 }, { "epoch": 0.1615, "grad_norm": 30.125, "grad_norm_var": 17.601041666666667, "learning_rate": 0.0001, "loss": 6.9799, "loss/crossentropy": 1.6312229566276073, "loss/hidden": 3.2609375, "loss/jsd": 0.0, "loss/logits": 0.16838383311405777, "step": 3230 }, { "epoch": 0.162, "grad_norm": 31.5, "grad_norm_var": 20.835872395833334, "learning_rate": 0.0001, "loss": 6.932, "loss/crossentropy": 2.011029013991356, "loss/hidden": 3.310546875, "loss/jsd": 0.0, "loss/logits": 0.1832389457151294, "step": 3240 }, { "epoch": 0.1625, "grad_norm": 28.375, "grad_norm_var": 7.161458333333333, "learning_rate": 0.0001, "loss": 7.0405, "loss/crossentropy": 1.8453179642558097, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.19180234288796782, "step": 3250 }, { "epoch": 0.163, "grad_norm": 36.5, "grad_norm_var": 10.517708333333333, "learning_rate": 0.0001, "loss": 6.823, "loss/crossentropy": 1.9555616907775402, "loss/hidden": 3.318359375, "loss/jsd": 0.0, "loss/logits": 0.17895318511873484, "step": 3260 }, { "epoch": 0.1635, "grad_norm": 29.125, "grad_norm_var": 8.909830729166666, "learning_rate": 0.0001, "loss": 6.892, "loss/crossentropy": 1.843096625804901, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18395393253304065, "step": 3270 }, { "epoch": 0.164, "grad_norm": 27.875, "grad_norm_var": 7.260416666666667, "learning_rate": 0.0001, "loss": 6.9288, "loss/crossentropy": 1.688144066929817, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.172001248691231, "step": 3280 }, { "epoch": 0.1645, "grad_norm": 37.5, "grad_norm_var": 12.014518229166667, "learning_rate": 0.0001, "loss": 6.9012, "loss/crossentropy": 1.6900858603417874, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.1850940717384219, "step": 3290 }, { "epoch": 0.165, "grad_norm": 30.5, "grad_norm_var": 11.887955729166666, "learning_rate": 0.0001, "loss": 7.0327, "loss/crossentropy": 1.8690055832266808, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.2061467545107007, "step": 3300 }, { "epoch": 0.1655, "grad_norm": 33.25, "grad_norm_var": 44.0900390625, "learning_rate": 0.0001, "loss": 6.9398, "loss/crossentropy": 1.864616620540619, "loss/hidden": 3.21875, "loss/jsd": 0.0, "loss/logits": 0.15337421298027037, "step": 3310 }, { "epoch": 0.166, "grad_norm": 37.25, "grad_norm_var": 45.87473958333333, "learning_rate": 0.0001, "loss": 6.9275, "loss/crossentropy": 1.8501743324100972, "loss/hidden": 3.266796875, "loss/jsd": 0.0, "loss/logits": 0.17113643269985915, "step": 3320 }, { "epoch": 0.1665, "grad_norm": 29.625, "grad_norm_var": 1.1349774579334994e+18, "learning_rate": 0.0001, "loss": 7.0081, "loss/crossentropy": 1.779020744562149, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.1846176441758871, "step": 3330 }, { "epoch": 0.167, "grad_norm": 35.75, "grad_norm_var": 1.0819897936507308e+18, "learning_rate": 0.0001, "loss": 6.9779, "loss/crossentropy": 1.7754384666681289, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19158907625824212, "step": 3340 }, { "epoch": 0.1675, "grad_norm": 34.25, "grad_norm_var": 1.081989793663733e+18, "learning_rate": 0.0001, "loss": 7.0539, "loss/crossentropy": 1.759375052154064, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.18603504877537488, "step": 3350 }, { "epoch": 0.168, "grad_norm": 29.875, "grad_norm_var": 5.620833333333334, "learning_rate": 0.0001, "loss": 6.8589, "loss/crossentropy": 1.845319252461195, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18480119155719876, "step": 3360 }, { "epoch": 0.1685, "grad_norm": 34.5, "grad_norm_var": 19.439583333333335, "learning_rate": 0.0001, "loss": 6.9772, "loss/crossentropy": 1.6411745361983776, "loss/hidden": 3.321484375, "loss/jsd": 0.0, "loss/logits": 0.15529545303434134, "step": 3370 }, { "epoch": 0.169, "grad_norm": 28.5, "grad_norm_var": 36.9103515625, "learning_rate": 0.0001, "loss": 6.8489, "loss/crossentropy": 1.7360669024288655, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.17661824598908424, "step": 3380 }, { "epoch": 0.1695, "grad_norm": 29.375, "grad_norm_var": 35.46848958333333, "learning_rate": 0.0001, "loss": 6.7757, "loss/crossentropy": 1.7902205429971219, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.17157120602205395, "step": 3390 }, { "epoch": 0.17, "grad_norm": 28.875, "grad_norm_var": 3.6395833333333334, "learning_rate": 0.0001, "loss": 6.8708, "loss/crossentropy": 1.842449489980936, "loss/hidden": 3.29921875, "loss/jsd": 0.0, "loss/logits": 0.16762932492420077, "step": 3400 }, { "epoch": 0.1705, "grad_norm": 37.0, "grad_norm_var": 6.513997395833333, "learning_rate": 0.0001, "loss": 6.8956, "loss/crossentropy": 1.7051387749612332, "loss/hidden": 3.3078125, "loss/jsd": 0.0, "loss/logits": 0.16933946274220943, "step": 3410 }, { "epoch": 0.171, "grad_norm": 30.75, "grad_norm_var": 9.762239583333333, "learning_rate": 0.0001, "loss": 6.9733, "loss/crossentropy": 1.7448437750339507, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.20119084492325784, "step": 3420 }, { "epoch": 0.1715, "grad_norm": 78.0, "grad_norm_var": 144.5125, "learning_rate": 0.0001, "loss": 7.0287, "loss/crossentropy": 1.824779784679413, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.17832597270607947, "step": 3430 }, { "epoch": 0.172, "grad_norm": 28.125, "grad_norm_var": 145.68430989583334, "learning_rate": 0.0001, "loss": 6.7435, "loss/crossentropy": 1.6466563902795315, "loss/hidden": 3.328515625, "loss/jsd": 0.0, "loss/logits": 0.16660706931725144, "step": 3440 }, { "epoch": 0.1725, "grad_norm": 29.875, "grad_norm_var": 8.811393229166667, "learning_rate": 0.0001, "loss": 6.934, "loss/crossentropy": 1.8493422105908395, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.2010068495757878, "step": 3450 }, { "epoch": 0.173, "grad_norm": 29.0, "grad_norm_var": 6.62890625, "learning_rate": 0.0001, "loss": 6.8496, "loss/crossentropy": 1.6380462288856505, "loss/hidden": 3.26015625, "loss/jsd": 0.0, "loss/logits": 0.18449038956314326, "step": 3460 }, { "epoch": 0.1735, "grad_norm": 32.75, "grad_norm_var": 32.5875, "learning_rate": 0.0001, "loss": 6.9309, "loss/crossentropy": 1.6813900470733643, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.18437479846179486, "step": 3470 }, { "epoch": 0.174, "grad_norm": 31.625, "grad_norm_var": 7.465419918819722e+17, "learning_rate": 0.0001, "loss": 7.1677, "loss/crossentropy": 1.789808637648821, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.17758243400603532, "step": 3480 }, { "epoch": 0.1745, "grad_norm": 29.75, "grad_norm_var": 58.84557291666667, "learning_rate": 0.0001, "loss": 6.8709, "loss/crossentropy": 1.8069385841488839, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19292932376265526, "step": 3490 }, { "epoch": 0.175, "grad_norm": 28.25, "grad_norm_var": 13.452018229166667, "learning_rate": 0.0001, "loss": 6.9084, "loss/crossentropy": 1.6264689728617667, "loss/hidden": 3.269140625, "loss/jsd": 0.0, "loss/logits": 0.16363061694428324, "step": 3500 }, { "epoch": 0.1755, "grad_norm": 32.75, "grad_norm_var": 1.459166261163747e+18, "learning_rate": 0.0001, "loss": 6.9837, "loss/crossentropy": 1.7061957284808158, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.18923843959346415, "step": 3510 }, { "epoch": 0.176, "grad_norm": 29.75, "grad_norm_var": 1.459166260217512e+18, "learning_rate": 0.0001, "loss": 6.9459, "loss/crossentropy": 1.6986562974750996, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18663678420707583, "step": 3520 }, { "epoch": 0.1765, "grad_norm": 31.0, "grad_norm_var": 1.8478515625, "learning_rate": 0.0001, "loss": 6.9793, "loss/crossentropy": 1.7609238177537918, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.19038589783012866, "step": 3530 }, { "epoch": 0.177, "grad_norm": 31.25, "grad_norm_var": 3.1770833333333335, "learning_rate": 0.0001, "loss": 6.9966, "loss/crossentropy": 1.9084905117750168, "loss/hidden": 3.33984375, "loss/jsd": 0.0, "loss/logits": 0.1776235220953822, "step": 3540 }, { "epoch": 0.1775, "grad_norm": 30.25, "grad_norm_var": 2.051497395833333, "learning_rate": 0.0001, "loss": 6.9292, "loss/crossentropy": 1.6809238217771054, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.19617705075070263, "step": 3550 }, { "epoch": 0.178, "grad_norm": 33.75, "grad_norm_var": 1.9955729166666667, "learning_rate": 0.0001, "loss": 6.972, "loss/crossentropy": 1.6389021024107933, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18174178060144186, "step": 3560 }, { "epoch": 0.1785, "grad_norm": 36.5, "grad_norm_var": 7.553059895833333, "learning_rate": 0.0001, "loss": 7.0848, "loss/crossentropy": 1.7566796734929084, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.1923373954370618, "step": 3570 }, { "epoch": 0.179, "grad_norm": 28.125, "grad_norm_var": 5.9603515625, "learning_rate": 0.0001, "loss": 6.956, "loss/crossentropy": 1.7154954925179482, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.17990761240944267, "step": 3580 }, { "epoch": 0.1795, "grad_norm": 29.875, "grad_norm_var": 4.399934895833334, "learning_rate": 0.0001, "loss": 7.0142, "loss/crossentropy": 1.8327077120542525, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1800425429828465, "step": 3590 }, { "epoch": 0.18, "grad_norm": 28.875, "grad_norm_var": 3.3247395833333333, "learning_rate": 0.0001, "loss": 6.9351, "loss/crossentropy": 1.8267195105552674, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19433746309950947, "step": 3600 }, { "epoch": 0.1805, "grad_norm": 32.25, "grad_norm_var": 24.673372395833333, "learning_rate": 0.0001, "loss": 6.8892, "loss/crossentropy": 1.737992748618126, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.20098126940429212, "step": 3610 }, { "epoch": 0.181, "grad_norm": 30.25, "grad_norm_var": 33.395572916666666, "learning_rate": 0.0001, "loss": 7.0103, "loss/crossentropy": 1.8915371976792812, "loss/hidden": 3.305078125, "loss/jsd": 0.0, "loss/logits": 0.1876732436940074, "step": 3620 }, { "epoch": 0.1815, "grad_norm": 26.5, "grad_norm_var": 38.799739583333334, "learning_rate": 0.0001, "loss": 6.981, "loss/crossentropy": 1.7780213125050068, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.18925584964454173, "step": 3630 }, { "epoch": 0.182, "grad_norm": 32.0, "grad_norm_var": 1.0995116106143062e+18, "learning_rate": 0.0001, "loss": 7.039, "loss/crossentropy": 1.7201772332191467, "loss/hidden": 3.292578125, "loss/jsd": 0.0, "loss/logits": 0.1817839713767171, "step": 3640 }, { "epoch": 0.1825, "grad_norm": 29.125, "grad_norm_var": 1.0995116110905345e+18, "learning_rate": 0.0001, "loss": 6.7937, "loss/crossentropy": 1.824095284193754, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.16389566464349628, "step": 3650 }, { "epoch": 0.183, "grad_norm": 28.625, "grad_norm_var": 14.382291666666667, "learning_rate": 0.0001, "loss": 6.918, "loss/crossentropy": 1.7039800986647606, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.17251853737980127, "step": 3660 }, { "epoch": 0.1835, "grad_norm": 29.375, "grad_norm_var": 0.82265625, "learning_rate": 0.0001, "loss": 6.8614, "loss/crossentropy": 1.670785766094923, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.1893833376467228, "step": 3670 }, { "epoch": 0.184, "grad_norm": 28.375, "grad_norm_var": 8.297916666666667, "learning_rate": 0.0001, "loss": 6.8747, "loss/crossentropy": 1.7371518418192864, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.17423492725938558, "step": 3680 }, { "epoch": 0.1845, "grad_norm": 30.5, "grad_norm_var": 11.51640625, "learning_rate": 0.0001, "loss": 7.1482, "loss/crossentropy": 2.011937528848648, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.19120746664702892, "step": 3690 }, { "epoch": 0.185, "grad_norm": 29.75, "grad_norm_var": 114.8119140625, "learning_rate": 0.0001, "loss": 6.9318, "loss/crossentropy": 1.9779032841324806, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.19792085662484168, "step": 3700 }, { "epoch": 0.1855, "grad_norm": 29.5, "grad_norm_var": 3.1666015625, "learning_rate": 0.0001, "loss": 6.9801, "loss/crossentropy": 1.8196966513991355, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.17692473586648702, "step": 3710 }, { "epoch": 0.186, "grad_norm": 31.625, "grad_norm_var": 7.036874289840129e+17, "learning_rate": 0.0001, "loss": 6.9754, "loss/crossentropy": 1.7481721505522727, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.1970391605515033, "step": 3720 }, { "epoch": 0.1865, "grad_norm": 28.75, "grad_norm_var": 7.036874289385746e+17, "learning_rate": 0.0001, "loss": 6.8514, "loss/crossentropy": 1.609993650764227, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19023605762049556, "step": 3730 }, { "epoch": 0.187, "grad_norm": 32.75, "grad_norm_var": 2.0978515625, "learning_rate": 0.0001, "loss": 7.0947, "loss/crossentropy": 2.0539694875478745, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.20270574633032085, "step": 3740 }, { "epoch": 0.1875, "grad_norm": 29.0, "grad_norm_var": 1.9103515625, "learning_rate": 0.0001, "loss": 6.9278, "loss/crossentropy": 1.8215243116021156, "loss/hidden": 3.2640625, "loss/jsd": 0.0, "loss/logits": 0.16428390927612782, "step": 3750 }, { "epoch": 0.188, "grad_norm": 30.0, "grad_norm_var": 2.0541666666666667, "learning_rate": 0.0001, "loss": 7.0503, "loss/crossentropy": 1.8183038413524628, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.19697826653718947, "step": 3760 }, { "epoch": 0.1885, "grad_norm": 32.75, "grad_norm_var": 1.0989583333333333, "learning_rate": 0.0001, "loss": 7.1034, "loss/crossentropy": 1.7321583658456803, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.19197138799354435, "step": 3770 }, { "epoch": 0.189, "grad_norm": 28.25, "grad_norm_var": 2.314322916666667, "learning_rate": 0.0001, "loss": 6.8113, "loss/crossentropy": 1.8538015499711036, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.18398043606430292, "step": 3780 }, { "epoch": 0.1895, "grad_norm": 29.625, "grad_norm_var": 5.827018229166667, "learning_rate": 0.0001, "loss": 7.0817, "loss/crossentropy": 1.8768661253154277, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.20710380356758834, "step": 3790 }, { "epoch": 0.19, "grad_norm": 33.25, "grad_norm_var": 4.195572916666666, "learning_rate": 0.0001, "loss": 7.0374, "loss/crossentropy": 1.7977422267198562, "loss/hidden": 3.28828125, "loss/jsd": 0.0, "loss/logits": 0.1821097361855209, "step": 3800 }, { "epoch": 0.1905, "grad_norm": 36.0, "grad_norm_var": 5.763997395833333, "learning_rate": 0.0001, "loss": 7.0815, "loss/crossentropy": 1.743187139183283, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.18471882613375784, "step": 3810 }, { "epoch": 0.191, "grad_norm": 30.25, "grad_norm_var": 6.658333333333333, "learning_rate": 0.0001, "loss": 6.8304, "loss/crossentropy": 1.7315315805375575, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.16926794005557894, "step": 3820 }, { "epoch": 0.1915, "grad_norm": 29.875, "grad_norm_var": 7.5380756628017e+17, "learning_rate": 0.0001, "loss": 7.091, "loss/crossentropy": 1.8176006272435188, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.1842843361198902, "step": 3830 }, { "epoch": 0.192, "grad_norm": 29.375, "grad_norm_var": 7.563541666666667, "learning_rate": 0.0001, "loss": 6.9694, "loss/crossentropy": 1.777810937166214, "loss/hidden": 3.32578125, "loss/jsd": 0.0, "loss/logits": 0.1739983822219074, "step": 3840 }, { "epoch": 0.1925, "grad_norm": 28.375, "grad_norm_var": 5.01875, "learning_rate": 0.0001, "loss": 6.8715, "loss/crossentropy": 1.9018649347126484, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.18006115844473242, "step": 3850 }, { "epoch": 0.193, "grad_norm": 30.0, "grad_norm_var": 1.2455729166666667, "learning_rate": 0.0001, "loss": 6.881, "loss/crossentropy": 1.8844246573746204, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.19470994817093015, "step": 3860 }, { "epoch": 0.1935, "grad_norm": 28.75, "grad_norm_var": 2.6809895833333335, "learning_rate": 0.0001, "loss": 6.9021, "loss/crossentropy": 1.7199362799525262, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.18913396131247281, "step": 3870 }, { "epoch": 0.194, "grad_norm": 32.25, "grad_norm_var": 2.5197265625, "learning_rate": 0.0001, "loss": 6.9324, "loss/crossentropy": 1.755439005047083, "loss/hidden": 3.343359375, "loss/jsd": 0.0, "loss/logits": 0.1858789509162307, "step": 3880 }, { "epoch": 0.1945, "grad_norm": 31.125, "grad_norm_var": 3.384375, "learning_rate": 0.0001, "loss": 6.9477, "loss/crossentropy": 1.7906312070786954, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.19127205722033977, "step": 3890 }, { "epoch": 0.195, "grad_norm": 32.25, "grad_norm_var": 2.1504557291666666, "learning_rate": 0.0001, "loss": 7.1196, "loss/crossentropy": 1.9957764573395251, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.18469135276973248, "step": 3900 }, { "epoch": 0.1955, "grad_norm": 31.875, "grad_norm_var": 9.387366238726391e+17, "learning_rate": 0.0001, "loss": 6.9857, "loss/crossentropy": 1.7901725992560387, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.18710751123726369, "step": 3910 }, { "epoch": 0.196, "grad_norm": 30.625, "grad_norm_var": 27.239322916666666, "learning_rate": 0.0001, "loss": 6.9815, "loss/crossentropy": 1.7652528271079064, "loss/hidden": 3.234375, "loss/jsd": 0.0, "loss/logits": 0.16305868746712804, "step": 3920 }, { "epoch": 0.1965, "grad_norm": 29.75, "grad_norm_var": 21.822916666666668, "learning_rate": 0.0001, "loss": 6.873, "loss/crossentropy": 1.7368287414312362, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.19844600670039653, "step": 3930 }, { "epoch": 0.197, "grad_norm": 29.625, "grad_norm_var": 2.035416666666667, "learning_rate": 0.0001, "loss": 7.0347, "loss/crossentropy": 1.9710937917232514, "loss/hidden": 3.265625, "loss/jsd": 0.0, "loss/logits": 0.17006599269807338, "step": 3940 }, { "epoch": 0.1975, "grad_norm": 27.875, "grad_norm_var": 55.904622395833336, "learning_rate": 0.0001, "loss": 6.9427, "loss/crossentropy": 1.6834511645138264, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.18321871096268297, "step": 3950 }, { "epoch": 0.198, "grad_norm": 28.25, "grad_norm_var": 4.515559895833333, "learning_rate": 0.0001, "loss": 6.8793, "loss/crossentropy": 1.8481898710131646, "loss/hidden": 3.219921875, "loss/jsd": 0.0, "loss/logits": 0.16018803734332324, "step": 3960 }, { "epoch": 0.1985, "grad_norm": 31.375, "grad_norm_var": 4.105208333333334, "learning_rate": 0.0001, "loss": 6.9158, "loss/crossentropy": 1.7632210277020932, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.17569016199558973, "step": 3970 }, { "epoch": 0.199, "grad_norm": 31.5, "grad_norm_var": 3.6056640625, "learning_rate": 0.0001, "loss": 7.0644, "loss/crossentropy": 1.8658879399299622, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.2075220150873065, "step": 3980 }, { "epoch": 0.1995, "grad_norm": 30.0, "grad_norm_var": 3.3301432291666666, "learning_rate": 0.0001, "loss": 7.1057, "loss/crossentropy": 1.915429985523224, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19412722568958998, "step": 3990 }, { "epoch": 0.2, "grad_norm": 29.75, "grad_norm_var": 139.13170572916667, "learning_rate": 0.0001, "loss": 6.9355, "loss/crossentropy": 1.8257215216755867, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.18498760322108865, "step": 4000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1430040128035226e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }