newst / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
cfcda4c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 2000,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 30.875,
"learning_rate": 0.0001,
"loss": 7.1506,
"loss/crossentropy": 1.9750229328870774,
"loss/hidden": 3.38984375,
"loss/jsd": 0.0,
"loss/logits": 0.18868114035576583,
"step": 10
},
{
"epoch": 0.001,
"grad_norm": 30.75,
"grad_norm_var": 2.09765625,
"learning_rate": 0.0001,
"loss": 7.266,
"loss/crossentropy": 1.915299428999424,
"loss/hidden": 3.368359375,
"loss/jsd": 0.0,
"loss/logits": 0.19173294119536877,
"step": 20
},
{
"epoch": 0.0015,
"grad_norm": 31.625,
"grad_norm_var": 35.572330729166666,
"learning_rate": 0.0001,
"loss": 7.1477,
"loss/crossentropy": 1.845322072505951,
"loss/hidden": 3.42421875,
"loss/jsd": 0.0,
"loss/logits": 0.1835887383669615,
"step": 30
},
{
"epoch": 0.002,
"grad_norm": 30.25,
"grad_norm_var": 5.803580729166667,
"learning_rate": 0.0001,
"loss": 7.125,
"loss/crossentropy": 1.8556978717446326,
"loss/hidden": 3.5,
"loss/jsd": 0.0,
"loss/logits": 0.22780380193144084,
"step": 40
},
{
"epoch": 0.0025,
"grad_norm": 39.5,
"grad_norm_var": 6.737239583333333,
"learning_rate": 0.0001,
"loss": 7.2665,
"loss/crossentropy": 2.051687541604042,
"loss/hidden": 3.45078125,
"loss/jsd": 0.0,
"loss/logits": 0.21537381634116173,
"step": 50
},
{
"epoch": 0.003,
"grad_norm": 36.5,
"grad_norm_var": 11.058333333333334,
"learning_rate": 0.0001,
"loss": 7.2095,
"loss/crossentropy": 1.9898784533143044,
"loss/hidden": 3.3953125,
"loss/jsd": 0.0,
"loss/logits": 0.19060547631233932,
"step": 60
},
{
"epoch": 0.0035,
"grad_norm": 27.0,
"grad_norm_var": 6.45390625,
"learning_rate": 0.0001,
"loss": 7.2606,
"loss/crossentropy": 1.8448080085217953,
"loss/hidden": 3.394140625,
"loss/jsd": 0.0,
"loss/logits": 0.18068002099171282,
"step": 70
},
{
"epoch": 0.004,
"grad_norm": 38.75,
"grad_norm_var": 1.3401023445121106e+18,
"learning_rate": 0.0001,
"loss": 7.4871,
"loss/crossentropy": 2.0318232350051404,
"loss/hidden": 3.733984375,
"loss/jsd": 0.0,
"loss/logits": 0.337183965742588,
"step": 80
},
{
"epoch": 0.0045,
"grad_norm": 35.25,
"grad_norm_var": 1.3401023442516444e+18,
"learning_rate": 0.0001,
"loss": 7.1923,
"loss/crossentropy": 1.7826939225196838,
"loss/hidden": 3.587890625,
"loss/jsd": 0.0,
"loss/logits": 0.2118432404473424,
"step": 90
},
{
"epoch": 0.005,
"grad_norm": 32.75,
"grad_norm_var": 2.7309895833333333,
"learning_rate": 0.0001,
"loss": 7.2487,
"loss/crossentropy": 1.88408655077219,
"loss/hidden": 3.48671875,
"loss/jsd": 0.0,
"loss/logits": 0.1903762748464942,
"step": 100
},
{
"epoch": 0.0055,
"grad_norm": 34.25,
"grad_norm_var": 4.268489583333333,
"learning_rate": 0.0001,
"loss": 7.1643,
"loss/crossentropy": 1.83259879052639,
"loss/hidden": 3.41953125,
"loss/jsd": 0.0,
"loss/logits": 0.19554968569427728,
"step": 110
},
{
"epoch": 0.006,
"grad_norm": 33.0,
"grad_norm_var": 6.548958333333333,
"learning_rate": 0.0001,
"loss": 7.1535,
"loss/crossentropy": 1.8173740945756436,
"loss/hidden": 3.34609375,
"loss/jsd": 0.0,
"loss/logits": 0.17036083210259675,
"step": 120
},
{
"epoch": 0.0065,
"grad_norm": 32.25,
"grad_norm_var": 3.220572916666667,
"learning_rate": 0.0001,
"loss": 7.2113,
"loss/crossentropy": 1.8991591855883598,
"loss/hidden": 3.4359375,
"loss/jsd": 0.0,
"loss/logits": 0.20231554415076972,
"step": 130
},
{
"epoch": 0.007,
"grad_norm": 120.0,
"grad_norm_var": 494.52890625,
"learning_rate": 0.0001,
"loss": 7.1589,
"loss/crossentropy": 1.9234379842877387,
"loss/hidden": 3.348828125,
"loss/jsd": 0.0,
"loss/logits": 0.19592595770955085,
"step": 140
},
{
"epoch": 0.0075,
"grad_norm": 30.375,
"grad_norm_var": 496.27265625,
"learning_rate": 0.0001,
"loss": 7.1392,
"loss/crossentropy": 1.7669467806816102,
"loss/hidden": 3.39453125,
"loss/jsd": 0.0,
"loss/logits": 0.1691664818674326,
"step": 150
},
{
"epoch": 0.008,
"grad_norm": 35.25,
"grad_norm_var": 202.11354166666666,
"learning_rate": 0.0001,
"loss": 7.2551,
"loss/crossentropy": 1.979496531933546,
"loss/hidden": 3.51484375,
"loss/jsd": 0.0,
"loss/logits": 0.2397671105340123,
"step": 160
},
{
"epoch": 0.0085,
"grad_norm": 29.75,
"grad_norm_var": 41.73118489583333,
"learning_rate": 0.0001,
"loss": 7.0709,
"loss/crossentropy": 1.6596970088779925,
"loss/hidden": 3.46875,
"loss/jsd": 0.0,
"loss/logits": 0.1801933947019279,
"step": 170
},
{
"epoch": 0.009,
"grad_norm": 31.375,
"grad_norm_var": 3.1510416666666665,
"learning_rate": 0.0001,
"loss": 7.1329,
"loss/crossentropy": 1.8317318260669708,
"loss/hidden": 3.470703125,
"loss/jsd": 0.0,
"loss/logits": 0.2027322521433234,
"step": 180
},
{
"epoch": 0.0095,
"grad_norm": 31.25,
"grad_norm_var": 1.034375,
"learning_rate": 0.0001,
"loss": 7.2704,
"loss/crossentropy": 1.7871993221342564,
"loss/hidden": 3.3296875,
"loss/jsd": 0.0,
"loss/logits": 0.17167234625667332,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 29.375,
"grad_norm_var": 1.4218098958333334,
"learning_rate": 0.0001,
"loss": 7.2074,
"loss/crossentropy": 1.9208836354315282,
"loss/hidden": 3.355859375,
"loss/jsd": 0.0,
"loss/logits": 0.18774686167016624,
"step": 200
},
{
"epoch": 0.0105,
"grad_norm": 29.75,
"grad_norm_var": 5.548958333333333,
"learning_rate": 0.0001,
"loss": 7.2446,
"loss/crossentropy": 1.8792764976620675,
"loss/hidden": 3.430859375,
"loss/jsd": 0.0,
"loss/logits": 0.19080359637737274,
"step": 210
},
{
"epoch": 0.011,
"grad_norm": 32.25,
"grad_norm_var": 11.7619140625,
"learning_rate": 0.0001,
"loss": 7.2031,
"loss/crossentropy": 1.926865078508854,
"loss/hidden": 3.387890625,
"loss/jsd": 0.0,
"loss/logits": 0.19636590238660573,
"step": 220
},
{
"epoch": 0.0115,
"grad_norm": 29.25,
"grad_norm_var": 4.170247395833333,
"learning_rate": 0.0001,
"loss": 7.0576,
"loss/crossentropy": 1.8266212515532971,
"loss/hidden": 3.377734375,
"loss/jsd": 0.0,
"loss/logits": 0.18201391287148,
"step": 230
},
{
"epoch": 0.012,
"grad_norm": 31.5,
"grad_norm_var": 1.81015625,
"learning_rate": 0.0001,
"loss": 7.1432,
"loss/crossentropy": 1.8445213377475738,
"loss/hidden": 3.34140625,
"loss/jsd": 0.0,
"loss/logits": 0.18868241235613822,
"step": 240
},
{
"epoch": 0.0125,
"grad_norm": 33.75,
"grad_norm_var": 1.9625138843884541e+18,
"learning_rate": 0.0001,
"loss": 7.0655,
"loss/crossentropy": 1.8239912115037442,
"loss/hidden": 3.298828125,
"loss/jsd": 0.0,
"loss/logits": 0.17756748497486113,
"step": 250
},
{
"epoch": 0.013,
"grad_norm": 31.875,
"grad_norm_var": 1.56640625,
"learning_rate": 0.0001,
"loss": 7.1575,
"loss/crossentropy": 1.7626003332436084,
"loss/hidden": 3.4109375,
"loss/jsd": 0.0,
"loss/logits": 0.18398213125765323,
"step": 260
},
{
"epoch": 0.0135,
"grad_norm": 32.25,
"grad_norm_var": 1.1129557291666667,
"learning_rate": 0.0001,
"loss": 7.1441,
"loss/crossentropy": 1.7845010846853255,
"loss/hidden": 3.344140625,
"loss/jsd": 0.0,
"loss/logits": 0.18147525601089,
"step": 270
},
{
"epoch": 0.014,
"grad_norm": 30.25,
"grad_norm_var": 2.9822265625,
"learning_rate": 0.0001,
"loss": 7.1286,
"loss/crossentropy": 1.8358447797596456,
"loss/hidden": 3.358203125,
"loss/jsd": 0.0,
"loss/logits": 0.17241306640207768,
"step": 280
},
{
"epoch": 0.0145,
"grad_norm": 33.0,
"grad_norm_var": 10.982291666666667,
"learning_rate": 0.0001,
"loss": 7.1123,
"loss/crossentropy": 1.843992917239666,
"loss/hidden": 3.3671875,
"loss/jsd": 0.0,
"loss/logits": 0.19916406068950893,
"step": 290
},
{
"epoch": 0.015,
"grad_norm": 31.5,
"grad_norm_var": 3.6176432291666667,
"learning_rate": 0.0001,
"loss": 6.9761,
"loss/crossentropy": 1.710184234380722,
"loss/hidden": 3.385546875,
"loss/jsd": 0.0,
"loss/logits": 0.1904242929071188,
"step": 300
},
{
"epoch": 0.0155,
"grad_norm": 30.625,
"grad_norm_var": 1.4795028269701094e+18,
"learning_rate": 0.0001,
"loss": 7.1128,
"loss/crossentropy": 1.783938717842102,
"loss/hidden": 3.38515625,
"loss/jsd": 0.0,
"loss/logits": 0.19371993821114303,
"step": 310
},
{
"epoch": 0.016,
"grad_norm": 27.375,
"grad_norm_var": 9.558072916666667,
"learning_rate": 0.0001,
"loss": 7.1587,
"loss/crossentropy": 1.799688772857189,
"loss/hidden": 3.35078125,
"loss/jsd": 0.0,
"loss/logits": 0.18227657950483261,
"step": 320
},
{
"epoch": 0.0165,
"grad_norm": 30.75,
"grad_norm_var": 5.827235584899985e+17,
"learning_rate": 0.0001,
"loss": 7.1719,
"loss/crossentropy": 1.8475290067493915,
"loss/hidden": 3.490234375,
"loss/jsd": 0.0,
"loss/logits": 0.20651640743017197,
"step": 330
},
{
"epoch": 0.017,
"grad_norm": 31.875,
"grad_norm_var": 1.0473683707078467e+18,
"learning_rate": 0.0001,
"loss": 7.2024,
"loss/crossentropy": 1.7877734430134296,
"loss/hidden": 3.341015625,
"loss/jsd": 0.0,
"loss/logits": 0.17529369578696788,
"step": 340
},
{
"epoch": 0.0175,
"grad_norm": 29.625,
"grad_norm_var": 1.0473683706481477e+18,
"learning_rate": 0.0001,
"loss": 7.0127,
"loss/crossentropy": 1.8476789727807046,
"loss/hidden": 3.376953125,
"loss/jsd": 0.0,
"loss/logits": 0.18340907394886016,
"step": 350
},
{
"epoch": 0.018,
"grad_norm": 31.5,
"grad_norm_var": 4.201822916666667,
"learning_rate": 0.0001,
"loss": 7.0837,
"loss/crossentropy": 1.9127952009439468,
"loss/hidden": 3.274609375,
"loss/jsd": 0.0,
"loss/logits": 0.18515819907188416,
"step": 360
},
{
"epoch": 0.0185,
"grad_norm": 33.25,
"grad_norm_var": 3.4580729166666666,
"learning_rate": 0.0001,
"loss": 7.1494,
"loss/crossentropy": 1.7446002267301082,
"loss/hidden": 3.410546875,
"loss/jsd": 0.0,
"loss/logits": 0.18972037807106973,
"step": 370
},
{
"epoch": 0.019,
"grad_norm": 32.25,
"grad_norm_var": 4.0712890625,
"learning_rate": 0.0001,
"loss": 6.9798,
"loss/crossentropy": 1.6596938122063876,
"loss/hidden": 3.39296875,
"loss/jsd": 0.0,
"loss/logits": 0.16941323587670923,
"step": 380
},
{
"epoch": 0.0195,
"grad_norm": 31.5,
"grad_norm_var": 1.8014398298089062e+18,
"learning_rate": 0.0001,
"loss": 7.1659,
"loss/crossentropy": 1.8092470526695252,
"loss/hidden": 3.278515625,
"loss/jsd": 0.0,
"loss/logits": 0.16989028006792067,
"step": 390
},
{
"epoch": 0.02,
"grad_norm": 29.25,
"grad_norm_var": 1.801439829596395e+18,
"learning_rate": 0.0001,
"loss": 7.1246,
"loss/crossentropy": 1.803744176030159,
"loss/hidden": 3.365625,
"loss/jsd": 0.0,
"loss/logits": 0.19061805782839655,
"step": 400
},
{
"epoch": 0.0205,
"grad_norm": 30.75,
"grad_norm_var": 1.1895833333333334,
"learning_rate": 0.0001,
"loss": 6.8644,
"loss/crossentropy": 1.711807917803526,
"loss/hidden": 3.348046875,
"loss/jsd": 0.0,
"loss/logits": 0.17410435527563095,
"step": 410
},
{
"epoch": 0.021,
"grad_norm": 28.75,
"grad_norm_var": 1.0518229166666666,
"learning_rate": 0.0001,
"loss": 6.9733,
"loss/crossentropy": 1.9412737876176833,
"loss/hidden": 3.32109375,
"loss/jsd": 0.0,
"loss/logits": 0.1845760691910982,
"step": 420
},
{
"epoch": 0.0215,
"grad_norm": 33.75,
"grad_norm_var": 3.36875,
"learning_rate": 0.0001,
"loss": 7.0425,
"loss/crossentropy": 1.6975354842841626,
"loss/hidden": 3.30703125,
"loss/jsd": 0.0,
"loss/logits": 0.17426773644983767,
"step": 430
},
{
"epoch": 0.022,
"grad_norm": 28.875,
"grad_norm_var": 4.533072916666667,
"learning_rate": 0.0001,
"loss": 7.0644,
"loss/crossentropy": 1.8431582309305667,
"loss/hidden": 3.309765625,
"loss/jsd": 0.0,
"loss/logits": 0.19988675275817513,
"step": 440
},
{
"epoch": 0.0225,
"grad_norm": 28.5,
"grad_norm_var": 4.65,
"learning_rate": 0.0001,
"loss": 7.1091,
"loss/crossentropy": 1.845390348136425,
"loss/hidden": 3.395703125,
"loss/jsd": 0.0,
"loss/logits": 0.18364266194403173,
"step": 450
},
{
"epoch": 0.023,
"grad_norm": 30.75,
"grad_norm_var": 4.459375,
"learning_rate": 0.0001,
"loss": 7.0581,
"loss/crossentropy": 1.7513741821050643,
"loss/hidden": 3.42109375,
"loss/jsd": 0.0,
"loss/logits": 0.186102606728673,
"step": 460
},
{
"epoch": 0.0235,
"grad_norm": 27.375,
"grad_norm_var": 4.786458333333333,
"learning_rate": 0.0001,
"loss": 6.9763,
"loss/crossentropy": 1.779174941033125,
"loss/hidden": 3.373046875,
"loss/jsd": 0.0,
"loss/logits": 0.17763521214947103,
"step": 470
},
{
"epoch": 0.024,
"grad_norm": 32.75,
"grad_norm_var": 4.1,
"learning_rate": 0.0001,
"loss": 6.9638,
"loss/crossentropy": 1.7178381219506265,
"loss/hidden": 3.36484375,
"loss/jsd": 0.0,
"loss/logits": 0.17294319327920676,
"step": 480
},
{
"epoch": 0.0245,
"grad_norm": 33.75,
"grad_norm_var": 3.40625,
"learning_rate": 0.0001,
"loss": 6.9397,
"loss/crossentropy": 1.8609587274491788,
"loss/hidden": 3.309765625,
"loss/jsd": 0.0,
"loss/logits": 0.1921778223477304,
"step": 490
},
{
"epoch": 0.025,
"grad_norm": 30.125,
"grad_norm_var": 7.0625,
"learning_rate": 0.0001,
"loss": 7.1176,
"loss/crossentropy": 1.8291713461279868,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.18730791788548232,
"step": 500
},
{
"epoch": 0.0255,
"grad_norm": 30.375,
"grad_norm_var": 6.520572916666667,
"learning_rate": 0.0001,
"loss": 7.097,
"loss/crossentropy": 1.6978721603751183,
"loss/hidden": 3.354296875,
"loss/jsd": 0.0,
"loss/logits": 0.16910959454253316,
"step": 510
},
{
"epoch": 0.026,
"grad_norm": 31.5,
"grad_norm_var": 5.492708333333334,
"learning_rate": 0.0001,
"loss": 7.1184,
"loss/crossentropy": 1.7646001767367125,
"loss/hidden": 3.49609375,
"loss/jsd": 0.0,
"loss/logits": 0.18606224549002945,
"step": 520
},
{
"epoch": 0.0265,
"grad_norm": 33.25,
"grad_norm_var": 3.2478515625,
"learning_rate": 0.0001,
"loss": 6.9289,
"loss/crossentropy": 1.7254683546721936,
"loss/hidden": 3.414453125,
"loss/jsd": 0.0,
"loss/logits": 0.19350956091657281,
"step": 530
},
{
"epoch": 0.027,
"grad_norm": 28.5,
"grad_norm_var": 3.2426432291666667,
"learning_rate": 0.0001,
"loss": 7.0072,
"loss/crossentropy": 1.8291743457317353,
"loss/hidden": 3.2703125,
"loss/jsd": 0.0,
"loss/logits": 0.17015220914036036,
"step": 540
},
{
"epoch": 0.0275,
"grad_norm": 29.375,
"grad_norm_var": 6.1978515625,
"learning_rate": 0.0001,
"loss": 7.0714,
"loss/crossentropy": 1.7038650900125503,
"loss/hidden": 3.35546875,
"loss/jsd": 0.0,
"loss/logits": 0.17573642041534185,
"step": 550
},
{
"epoch": 0.028,
"grad_norm": 28.875,
"grad_norm_var": 5.530143229166667,
"learning_rate": 0.0001,
"loss": 7.0376,
"loss/crossentropy": 2.000048974901438,
"loss/hidden": 3.3921875,
"loss/jsd": 0.0,
"loss/logits": 0.20670556500554085,
"step": 560
},
{
"epoch": 0.0285,
"grad_norm": 30.125,
"grad_norm_var": 37.509830729166666,
"learning_rate": 0.0001,
"loss": 7.0782,
"loss/crossentropy": 1.7484589993953705,
"loss/hidden": 3.4453125,
"loss/jsd": 0.0,
"loss/logits": 0.20398099757730961,
"step": 570
},
{
"epoch": 0.029,
"grad_norm": 30.75,
"grad_norm_var": 37.80930989583333,
"learning_rate": 0.0001,
"loss": 7.1094,
"loss/crossentropy": 1.747946521639824,
"loss/hidden": 3.325,
"loss/jsd": 0.0,
"loss/logits": 0.1723929913714528,
"step": 580
},
{
"epoch": 0.0295,
"grad_norm": 31.5,
"grad_norm_var": 1.9410807291666667,
"learning_rate": 0.0001,
"loss": 7.0532,
"loss/crossentropy": 1.714518916606903,
"loss/hidden": 3.395703125,
"loss/jsd": 0.0,
"loss/logits": 0.17450172062963248,
"step": 590
},
{
"epoch": 0.03,
"grad_norm": 31.375,
"grad_norm_var": 6.620995009586922e+17,
"learning_rate": 0.0001,
"loss": 7.2589,
"loss/crossentropy": 1.7456246592104434,
"loss/hidden": 3.3328125,
"loss/jsd": 0.0,
"loss/logits": 0.18539317091926932,
"step": 600
},
{
"epoch": 0.0305,
"grad_norm": 31.625,
"grad_norm_var": 6.620995011655063e+17,
"learning_rate": 0.0001,
"loss": 7.1014,
"loss/crossentropy": 1.6763587422668933,
"loss/hidden": 3.4015625,
"loss/jsd": 0.0,
"loss/logits": 0.1931827544234693,
"step": 610
},
{
"epoch": 0.031,
"grad_norm": 31.5,
"grad_norm_var": 4.528125,
"learning_rate": 0.0001,
"loss": 7.115,
"loss/crossentropy": 1.849663856625557,
"loss/hidden": 3.41953125,
"loss/jsd": 0.0,
"loss/logits": 0.21164124589413405,
"step": 620
},
{
"epoch": 0.0315,
"grad_norm": 31.25,
"grad_norm_var": 3.027083333333333,
"learning_rate": 0.0001,
"loss": 7.1975,
"loss/crossentropy": 1.765239630639553,
"loss/hidden": 3.3609375,
"loss/jsd": 0.0,
"loss/logits": 0.18264974560588598,
"step": 630
},
{
"epoch": 0.032,
"grad_norm": 29.25,
"grad_norm_var": 3.428580729166667,
"learning_rate": 0.0001,
"loss": 7.1206,
"loss/crossentropy": 1.8783695727586747,
"loss/hidden": 3.369921875,
"loss/jsd": 0.0,
"loss/logits": 0.18768006665632128,
"step": 640
},
{
"epoch": 0.0325,
"grad_norm": 30.75,
"grad_norm_var": 3.9385416666666666,
"learning_rate": 0.0001,
"loss": 7.1671,
"loss/crossentropy": 1.8120282679796218,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.21209220625460148,
"step": 650
},
{
"epoch": 0.033,
"grad_norm": 31.75,
"grad_norm_var": 1.77265625,
"learning_rate": 0.0001,
"loss": 7.0683,
"loss/crossentropy": 1.6486516989767552,
"loss/hidden": 3.3765625,
"loss/jsd": 0.0,
"loss/logits": 0.17768741883337497,
"step": 660
},
{
"epoch": 0.0335,
"grad_norm": 28.5,
"grad_norm_var": 1.9622395833333333,
"learning_rate": 0.0001,
"loss": 7.0341,
"loss/crossentropy": 1.5188174404203891,
"loss/hidden": 3.355078125,
"loss/jsd": 0.0,
"loss/logits": 0.17400255370885134,
"step": 670
},
{
"epoch": 0.034,
"grad_norm": 29.25,
"grad_norm_var": 3.075,
"learning_rate": 0.0001,
"loss": 7.0187,
"loss/crossentropy": 1.7111039966344834,
"loss/hidden": 3.42734375,
"loss/jsd": 0.0,
"loss/logits": 0.20188356712460517,
"step": 680
},
{
"epoch": 0.0345,
"grad_norm": 30.5,
"grad_norm_var": 1.5458333333333334,
"learning_rate": 0.0001,
"loss": 7.1392,
"loss/crossentropy": 1.7463210627436638,
"loss/hidden": 3.380078125,
"loss/jsd": 0.0,
"loss/logits": 0.18064118530601264,
"step": 690
},
{
"epoch": 0.035,
"grad_norm": 30.0,
"grad_norm_var": 1.6020833333333333,
"learning_rate": 0.0001,
"loss": 7.0488,
"loss/crossentropy": 1.913002396374941,
"loss/hidden": 3.248046875,
"loss/jsd": 0.0,
"loss/logits": 0.17795131383463741,
"step": 700
},
{
"epoch": 0.0355,
"grad_norm": 3674210304.0,
"grad_norm_var": 2.2729279965717071e+18,
"learning_rate": 0.0001,
"loss": 7.1836,
"loss/crossentropy": 1.7232265777885913,
"loss/hidden": 3.417578125,
"loss/jsd": 0.0,
"loss/logits": 0.19430895978584886,
"step": 710
},
{
"epoch": 0.036,
"grad_norm": 29.125,
"grad_norm_var": 8.437388195823355e+17,
"learning_rate": 0.0001,
"loss": 6.9841,
"loss/crossentropy": 1.8030119113624097,
"loss/hidden": 3.39453125,
"loss/jsd": 0.0,
"loss/logits": 0.18302876157686115,
"step": 720
},
{
"epoch": 0.0365,
"grad_norm": 30.375,
"grad_norm_var": 2.85,
"learning_rate": 0.0001,
"loss": 6.9804,
"loss/crossentropy": 1.9009442821145057,
"loss/hidden": 3.266796875,
"loss/jsd": 0.0,
"loss/logits": 0.16866004383191466,
"step": 730
},
{
"epoch": 0.037,
"grad_norm": 30.0,
"grad_norm_var": 9.339322916666667,
"learning_rate": 0.0001,
"loss": 6.9876,
"loss/crossentropy": 1.6418433368206025,
"loss/hidden": 3.438671875,
"loss/jsd": 0.0,
"loss/logits": 0.191958365496248,
"step": 740
},
{
"epoch": 0.0375,
"grad_norm": 30.875,
"grad_norm_var": 7.639322916666667,
"learning_rate": 0.0001,
"loss": 7.0538,
"loss/crossentropy": 1.853764034062624,
"loss/hidden": 3.32578125,
"loss/jsd": 0.0,
"loss/logits": 0.17473467853851615,
"step": 750
},
{
"epoch": 0.038,
"grad_norm": 31.125,
"grad_norm_var": 1.0613932291666666,
"learning_rate": 0.0001,
"loss": 7.1458,
"loss/crossentropy": 1.8514880582690239,
"loss/hidden": 3.378515625,
"loss/jsd": 0.0,
"loss/logits": 0.19726306498050689,
"step": 760
},
{
"epoch": 0.0385,
"grad_norm": 28.875,
"grad_norm_var": 1.7997395833333334,
"learning_rate": 0.0001,
"loss": 7.0766,
"loss/crossentropy": 1.8405121728777885,
"loss/hidden": 3.324609375,
"loss/jsd": 0.0,
"loss/logits": 0.19442977402359246,
"step": 770
},
{
"epoch": 0.039,
"grad_norm": 29.0,
"grad_norm_var": 2.3802083333333335,
"learning_rate": 0.0001,
"loss": 7.0214,
"loss/crossentropy": 1.9466332450509072,
"loss/hidden": 3.289453125,
"loss/jsd": 0.0,
"loss/logits": 0.170109105668962,
"step": 780
},
{
"epoch": 0.0395,
"grad_norm": 30.0,
"grad_norm_var": 1.6124348958333334,
"learning_rate": 0.0001,
"loss": 7.1306,
"loss/crossentropy": 1.8399325378239155,
"loss/hidden": 3.46015625,
"loss/jsd": 0.0,
"loss/logits": 0.20626397961750625,
"step": 790
},
{
"epoch": 0.04,
"grad_norm": 31.75,
"grad_norm_var": 1.6559895833333333,
"learning_rate": 0.0001,
"loss": 7.1375,
"loss/crossentropy": 1.9278223380446433,
"loss/hidden": 3.41015625,
"loss/jsd": 0.0,
"loss/logits": 0.2024382423609495,
"step": 800
},
{
"epoch": 0.0405,
"grad_norm": 27.5,
"grad_norm_var": 16.089322916666667,
"learning_rate": 0.0001,
"loss": 7.0363,
"loss/crossentropy": 1.859210267663002,
"loss/hidden": 3.345703125,
"loss/jsd": 0.0,
"loss/logits": 0.18585832975804806,
"step": 810
},
{
"epoch": 0.041,
"grad_norm": 28.25,
"grad_norm_var": 38.77265625,
"learning_rate": 0.0001,
"loss": 6.9378,
"loss/crossentropy": 1.8994540706276895,
"loss/hidden": 3.376953125,
"loss/jsd": 0.0,
"loss/logits": 0.2018324811011553,
"step": 820
},
{
"epoch": 0.0415,
"grad_norm": 32.0,
"grad_norm_var": 38.8375,
"learning_rate": 0.0001,
"loss": 7.002,
"loss/crossentropy": 1.8244094364345074,
"loss/hidden": 3.415625,
"loss/jsd": 0.0,
"loss/logits": 0.20930232629179954,
"step": 830
},
{
"epoch": 0.042,
"grad_norm": 30.25,
"grad_norm_var": 2.0634765625,
"learning_rate": 0.0001,
"loss": 6.9688,
"loss/crossentropy": 1.8976417139172554,
"loss/hidden": 3.33515625,
"loss/jsd": 0.0,
"loss/logits": 0.1871755332686007,
"step": 840
},
{
"epoch": 0.0425,
"grad_norm": 50.75,
"grad_norm_var": 28.351497395833334,
"learning_rate": 0.0001,
"loss": 6.992,
"loss/crossentropy": 1.899886740744114,
"loss/hidden": 3.417578125,
"loss/jsd": 0.0,
"loss/logits": 0.18904313631355762,
"step": 850
},
{
"epoch": 0.043,
"grad_norm": 29.0,
"grad_norm_var": 27.3056640625,
"learning_rate": 0.0001,
"loss": 7.0939,
"loss/crossentropy": 1.8286892741918563,
"loss/hidden": 3.362109375,
"loss/jsd": 0.0,
"loss/logits": 0.18909739144146442,
"step": 860
},
{
"epoch": 0.0435,
"grad_norm": 28.375,
"grad_norm_var": 1.3247395833333333,
"learning_rate": 0.0001,
"loss": 6.9381,
"loss/crossentropy": 1.9782623961567878,
"loss/hidden": 3.305859375,
"loss/jsd": 0.0,
"loss/logits": 0.1766037069261074,
"step": 870
},
{
"epoch": 0.044,
"grad_norm": 29.0,
"grad_norm_var": 2.1988932291666665,
"learning_rate": 0.0001,
"loss": 6.8414,
"loss/crossentropy": 1.8968854755163194,
"loss/hidden": 3.413671875,
"loss/jsd": 0.0,
"loss/logits": 0.20138736004009844,
"step": 880
},
{
"epoch": 0.0445,
"grad_norm": 32.75,
"grad_norm_var": 1.92890625,
"learning_rate": 0.0001,
"loss": 7.1271,
"loss/crossentropy": 1.8630956932902336,
"loss/hidden": 3.428125,
"loss/jsd": 0.0,
"loss/logits": 0.21029497124254704,
"step": 890
},
{
"epoch": 0.045,
"grad_norm": 29.25,
"grad_norm_var": 2.037239583333333,
"learning_rate": 0.0001,
"loss": 7.0435,
"loss/crossentropy": 1.8676601111888886,
"loss/hidden": 3.351953125,
"loss/jsd": 0.0,
"loss/logits": 0.19789310321211814,
"step": 900
},
{
"epoch": 0.0455,
"grad_norm": 30.25,
"grad_norm_var": 4.2265225949129395e+17,
"learning_rate": 0.0001,
"loss": 7.1233,
"loss/crossentropy": 1.8434145867824554,
"loss/hidden": 3.378125,
"loss/jsd": 0.0,
"loss/logits": 0.18832013495266436,
"step": 910
},
{
"epoch": 0.046,
"grad_norm": 29.375,
"grad_norm_var": 4.2265225969445555e+17,
"learning_rate": 0.0001,
"loss": 6.8733,
"loss/crossentropy": 1.81582195982337,
"loss/hidden": 3.416796875,
"loss/jsd": 0.0,
"loss/logits": 0.18773540575057268,
"step": 920
},
{
"epoch": 0.0465,
"grad_norm": 33.0,
"grad_norm_var": 4.476822916666666,
"learning_rate": 0.0001,
"loss": 7.0752,
"loss/crossentropy": 1.8667447365820409,
"loss/hidden": 3.336328125,
"loss/jsd": 0.0,
"loss/logits": 0.18054497512057424,
"step": 930
},
{
"epoch": 0.047,
"grad_norm": 28.625,
"grad_norm_var": 6.144205729166667,
"learning_rate": 0.0001,
"loss": 7.0032,
"loss/crossentropy": 1.8144822165369987,
"loss/hidden": 3.271484375,
"loss/jsd": 0.0,
"loss/logits": 0.1632128401659429,
"step": 940
},
{
"epoch": 0.0475,
"grad_norm": 30.375,
"grad_norm_var": 5.01875,
"learning_rate": 0.0001,
"loss": 6.8626,
"loss/crossentropy": 1.8152224607765675,
"loss/hidden": 3.394140625,
"loss/jsd": 0.0,
"loss/logits": 0.18933067489415406,
"step": 950
},
{
"epoch": 0.048,
"grad_norm": 37.0,
"grad_norm_var": 7.297916666666667,
"learning_rate": 0.0001,
"loss": 7.0437,
"loss/crossentropy": 1.6399064034223556,
"loss/hidden": 3.39140625,
"loss/jsd": 0.0,
"loss/logits": 0.18825935963541268,
"step": 960
},
{
"epoch": 0.0485,
"grad_norm": 29.75,
"grad_norm_var": 4.739583333333333,
"learning_rate": 0.0001,
"loss": 7.0331,
"loss/crossentropy": 1.6737658925354482,
"loss/hidden": 3.412890625,
"loss/jsd": 0.0,
"loss/logits": 0.17548465421423315,
"step": 970
},
{
"epoch": 0.049,
"grad_norm": 30.0,
"grad_norm_var": 18.1541015625,
"learning_rate": 0.0001,
"loss": 6.9385,
"loss/crossentropy": 1.8608146458864212,
"loss/hidden": 3.35234375,
"loss/jsd": 0.0,
"loss/logits": 0.19196428768336773,
"step": 980
},
{
"epoch": 0.0495,
"grad_norm": 33.75,
"grad_norm_var": 4.003125,
"learning_rate": 0.0001,
"loss": 7.0686,
"loss/crossentropy": 1.8301926247775555,
"loss/hidden": 3.347265625,
"loss/jsd": 0.0,
"loss/logits": 0.18049606634303927,
"step": 990
},
{
"epoch": 0.05,
"grad_norm": 31.75,
"grad_norm_var": 1.0473683721235639e+18,
"learning_rate": 0.0001,
"loss": 7.0193,
"loss/crossentropy": 1.7465273767709732,
"loss/hidden": 3.369921875,
"loss/jsd": 0.0,
"loss/logits": 0.17173261381685734,
"step": 1000
},
{
"epoch": 0.0505,
"grad_norm": 29.75,
"grad_norm_var": 22.408268229166666,
"learning_rate": 0.0001,
"loss": 6.9709,
"loss/crossentropy": 1.7683202728629113,
"loss/hidden": 3.419921875,
"loss/jsd": 0.0,
"loss/logits": 0.210743809863925,
"step": 1010
},
{
"epoch": 0.051,
"grad_norm": 28.625,
"grad_norm_var": 2.371875,
"learning_rate": 0.0001,
"loss": 7.0597,
"loss/crossentropy": 2.046058624982834,
"loss/hidden": 3.3375,
"loss/jsd": 0.0,
"loss/logits": 0.18963768277317286,
"step": 1020
},
{
"epoch": 0.0515,
"grad_norm": 30.0,
"grad_norm_var": 1.3184895833333334,
"learning_rate": 0.0001,
"loss": 7.0245,
"loss/crossentropy": 1.745854178071022,
"loss/hidden": 3.30390625,
"loss/jsd": 0.0,
"loss/logits": 0.17351055853068828,
"step": 1030
},
{
"epoch": 0.052,
"grad_norm": 34.75,
"grad_norm_var": 2.8108723958333335,
"learning_rate": 0.0001,
"loss": 6.9474,
"loss/crossentropy": 1.8277953140437604,
"loss/hidden": 3.33984375,
"loss/jsd": 0.0,
"loss/logits": 0.16915141120553018,
"step": 1040
},
{
"epoch": 0.0525,
"grad_norm": 32.5,
"grad_norm_var": 3.39765625,
"learning_rate": 0.0001,
"loss": 6.9366,
"loss/crossentropy": 1.9404960587620734,
"loss/hidden": 3.35625,
"loss/jsd": 0.0,
"loss/logits": 0.18970660548657178,
"step": 1050
},
{
"epoch": 0.053,
"grad_norm": 35.75,
"grad_norm_var": 1.1892317588406927e+18,
"learning_rate": 0.0001,
"loss": 7.0954,
"loss/crossentropy": 1.8612810902297496,
"loss/hidden": 3.31171875,
"loss/jsd": 0.0,
"loss/logits": 0.17269262354820966,
"step": 1060
},
{
"epoch": 0.0535,
"grad_norm": 29.875,
"grad_norm_var": 1.1892317588497805e+18,
"learning_rate": 0.0001,
"loss": 7.0259,
"loss/crossentropy": 1.743497943878174,
"loss/hidden": 3.2609375,
"loss/jsd": 0.0,
"loss/logits": 0.1666251303628087,
"step": 1070
},
{
"epoch": 0.054,
"grad_norm": 29.625,
"grad_norm_var": 2.903059895833333,
"learning_rate": 0.0001,
"loss": 7.0055,
"loss/crossentropy": 1.9657445706427097,
"loss/hidden": 3.32734375,
"loss/jsd": 0.0,
"loss/logits": 0.18259168425574898,
"step": 1080
},
{
"epoch": 0.0545,
"grad_norm": 30.25,
"grad_norm_var": 51.16015625,
"learning_rate": 0.0001,
"loss": 7.1126,
"loss/crossentropy": 2.0204195216298104,
"loss/hidden": 3.334765625,
"loss/jsd": 0.0,
"loss/logits": 0.20481194872409106,
"step": 1090
},
{
"epoch": 0.055,
"grad_norm": 29.625,
"grad_norm_var": 2.90390625,
"learning_rate": 0.0001,
"loss": 7.0413,
"loss/crossentropy": 1.589720468968153,
"loss/hidden": 3.275,
"loss/jsd": 0.0,
"loss/logits": 0.18000307623296977,
"step": 1100
},
{
"epoch": 0.0555,
"grad_norm": 29.375,
"grad_norm_var": 2.2613932291666665,
"learning_rate": 0.0001,
"loss": 6.9722,
"loss/crossentropy": 1.7191244810819626,
"loss/hidden": 3.45390625,
"loss/jsd": 0.0,
"loss/logits": 0.18164545409381389,
"step": 1110
},
{
"epoch": 0.056,
"grad_norm": 28.875,
"grad_norm_var": 1.7520833333333334,
"learning_rate": 0.0001,
"loss": 6.9492,
"loss/crossentropy": 1.8928776159882545,
"loss/hidden": 3.358203125,
"loss/jsd": 0.0,
"loss/logits": 0.18985262140631676,
"step": 1120
},
{
"epoch": 0.0565,
"grad_norm": 30.0,
"grad_norm_var": 1.2447265625,
"learning_rate": 0.0001,
"loss": 7.1367,
"loss/crossentropy": 1.7702923499047756,
"loss/hidden": 3.32109375,
"loss/jsd": 0.0,
"loss/logits": 0.17983693201094866,
"step": 1130
},
{
"epoch": 0.057,
"grad_norm": 30.25,
"grad_norm_var": 3.3080729166666667,
"learning_rate": 0.0001,
"loss": 7.0322,
"loss/crossentropy": 1.8519952863454818,
"loss/hidden": 3.465234375,
"loss/jsd": 0.0,
"loss/logits": 0.20197003111243247,
"step": 1140
},
{
"epoch": 0.0575,
"grad_norm": 31.125,
"grad_norm_var": 3.1962890625,
"learning_rate": 0.0001,
"loss": 7.0557,
"loss/crossentropy": 1.8624355979263783,
"loss/hidden": 3.526953125,
"loss/jsd": 0.0,
"loss/logits": 0.20604186709970235,
"step": 1150
},
{
"epoch": 0.058,
"grad_norm": 28.5,
"grad_norm_var": 22.8462890625,
"learning_rate": 0.0001,
"loss": 6.9562,
"loss/crossentropy": 1.8102556586265564,
"loss/hidden": 3.44609375,
"loss/jsd": 0.0,
"loss/logits": 0.20240887869149446,
"step": 1160
},
{
"epoch": 0.0585,
"grad_norm": 32.25,
"grad_norm_var": 23.950455729166666,
"learning_rate": 0.0001,
"loss": 6.9857,
"loss/crossentropy": 1.8860370084643363,
"loss/hidden": 3.339453125,
"loss/jsd": 0.0,
"loss/logits": 0.18206186592578888,
"step": 1170
},
{
"epoch": 0.059,
"grad_norm": 30.125,
"grad_norm_var": 1.6518229166666667,
"learning_rate": 0.0001,
"loss": 7.056,
"loss/crossentropy": 1.9338740326464177,
"loss/hidden": 3.42265625,
"loss/jsd": 0.0,
"loss/logits": 0.22607974465936423,
"step": 1180
},
{
"epoch": 0.0595,
"grad_norm": 29.5,
"grad_norm_var": 11.267708333333333,
"learning_rate": 0.0001,
"loss": 6.931,
"loss/crossentropy": 1.9357615426182746,
"loss/hidden": 3.351953125,
"loss/jsd": 0.0,
"loss/logits": 0.1852928228676319,
"step": 1190
},
{
"epoch": 0.06,
"grad_norm": 39.25,
"grad_norm_var": 1.2635411532464435e+18,
"learning_rate": 0.0001,
"loss": 7.0138,
"loss/crossentropy": 1.669256182014942,
"loss/hidden": 3.31328125,
"loss/jsd": 0.0,
"loss/logits": 0.1792891369201243,
"step": 1200
},
{
"epoch": 0.0605,
"grad_norm": 30.125,
"grad_norm_var": 2.2555340145024479e+18,
"learning_rate": 0.0001,
"loss": 7.003,
"loss/crossentropy": 1.8537344850599766,
"loss/hidden": 3.6078125,
"loss/jsd": 0.0,
"loss/logits": 0.18713028654456138,
"step": 1210
},
{
"epoch": 0.061,
"grad_norm": 30.75,
"grad_norm_var": 1.1529214881025404e+18,
"learning_rate": 0.0001,
"loss": 6.9982,
"loss/crossentropy": 1.8868144243955611,
"loss/hidden": 3.259375,
"loss/jsd": 0.0,
"loss/logits": 0.16826356202363968,
"step": 1220
},
{
"epoch": 0.0615,
"grad_norm": 38.0,
"grad_norm_var": 11.041080729166667,
"learning_rate": 0.0001,
"loss": 7.1145,
"loss/crossentropy": 1.7373395457863807,
"loss/hidden": 3.26328125,
"loss/jsd": 0.0,
"loss/logits": 0.16631986051797867,
"step": 1230
},
{
"epoch": 0.062,
"grad_norm": 28.625,
"grad_norm_var": 6.718489583333334,
"learning_rate": 0.0001,
"loss": 6.8881,
"loss/crossentropy": 1.610298927500844,
"loss/hidden": 3.358984375,
"loss/jsd": 0.0,
"loss/logits": 0.1909397032111883,
"step": 1240
},
{
"epoch": 0.0625,
"grad_norm": 29.625,
"grad_norm_var": 4.344205729166666,
"learning_rate": 0.0001,
"loss": 7.0797,
"loss/crossentropy": 1.7361410059034825,
"loss/hidden": 3.366796875,
"loss/jsd": 0.0,
"loss/logits": 0.18541559688746928,
"step": 1250
},
{
"epoch": 0.063,
"grad_norm": 27.875,
"grad_norm_var": 3.3889973958333335,
"learning_rate": 0.0001,
"loss": 6.9329,
"loss/crossentropy": 1.7078735738992692,
"loss/hidden": 3.4015625,
"loss/jsd": 0.0,
"loss/logits": 0.18024133574217557,
"step": 1260
},
{
"epoch": 0.0635,
"grad_norm": 35.0,
"grad_norm_var": 6.6166015625,
"learning_rate": 0.0001,
"loss": 6.9738,
"loss/crossentropy": 1.8044774197041988,
"loss/hidden": 3.276171875,
"loss/jsd": 0.0,
"loss/logits": 0.1794836211949587,
"step": 1270
},
{
"epoch": 0.064,
"grad_norm": 29.375,
"grad_norm_var": 13.601822916666666,
"learning_rate": 0.0001,
"loss": 6.9062,
"loss/crossentropy": 1.8313415050506592,
"loss/hidden": 3.3140625,
"loss/jsd": 0.0,
"loss/logits": 0.18087668968364595,
"step": 1280
},
{
"epoch": 0.0645,
"grad_norm": 29.75,
"grad_norm_var": 3.6020182291666667,
"learning_rate": 0.0001,
"loss": 6.9407,
"loss/crossentropy": 1.6438103877007961,
"loss/hidden": 3.41875,
"loss/jsd": 0.0,
"loss/logits": 0.1820345466956496,
"step": 1290
},
{
"epoch": 0.065,
"grad_norm": 30.25,
"grad_norm_var": 1.2379557291666667,
"learning_rate": 0.0001,
"loss": 7.0302,
"loss/crossentropy": 1.7621051207184792,
"loss/hidden": 3.41171875,
"loss/jsd": 0.0,
"loss/logits": 0.19308385904878378,
"step": 1300
},
{
"epoch": 0.0655,
"grad_norm": 29.375,
"grad_norm_var": 3.46640625,
"learning_rate": 0.0001,
"loss": 7.1178,
"loss/crossentropy": 1.871315811574459,
"loss/hidden": 3.3875,
"loss/jsd": 0.0,
"loss/logits": 0.19272034596651794,
"step": 1310
},
{
"epoch": 0.066,
"grad_norm": 31.625,
"grad_norm_var": 3.609375,
"learning_rate": 0.0001,
"loss": 7.0298,
"loss/crossentropy": 1.8252998240292073,
"loss/hidden": 3.36875,
"loss/jsd": 0.0,
"loss/logits": 0.21978344805538655,
"step": 1320
},
{
"epoch": 0.0665,
"grad_norm": 33.5,
"grad_norm_var": 1.3990009840566536e+18,
"learning_rate": 0.0001,
"loss": 7.068,
"loss/crossentropy": 1.639507355540991,
"loss/hidden": 3.60703125,
"loss/jsd": 0.0,
"loss/logits": 0.18024437148123978,
"step": 1330
},
{
"epoch": 0.067,
"grad_norm": 28.75,
"grad_norm_var": 1.3990009842291443e+18,
"learning_rate": 0.0001,
"loss": 6.9556,
"loss/crossentropy": 1.8158223167061807,
"loss/hidden": 3.3203125,
"loss/jsd": 0.0,
"loss/logits": 0.18003626042045653,
"step": 1340
},
{
"epoch": 0.0675,
"grad_norm": 29.75,
"grad_norm_var": 3.21640625,
"learning_rate": 0.0001,
"loss": 6.7859,
"loss/crossentropy": 1.6335266396403312,
"loss/hidden": 3.38046875,
"loss/jsd": 0.0,
"loss/logits": 0.1845483684912324,
"step": 1350
},
{
"epoch": 0.068,
"grad_norm": 30.75,
"grad_norm_var": 2.5497395833333334,
"learning_rate": 0.0001,
"loss": 6.8607,
"loss/crossentropy": 1.7433619983494282,
"loss/hidden": 3.3484375,
"loss/jsd": 0.0,
"loss/logits": 0.17121702507138253,
"step": 1360
},
{
"epoch": 0.0685,
"grad_norm": 28.0,
"grad_norm_var": 4.353580729166667,
"learning_rate": 0.0001,
"loss": 7.1422,
"loss/crossentropy": 1.8455571182072164,
"loss/hidden": 3.333203125,
"loss/jsd": 0.0,
"loss/logits": 0.2054300512187183,
"step": 1370
},
{
"epoch": 0.069,
"grad_norm": 29.625,
"grad_norm_var": 3.388541666666667,
"learning_rate": 0.0001,
"loss": 7.0213,
"loss/crossentropy": 1.8241696588695049,
"loss/hidden": 3.321875,
"loss/jsd": 0.0,
"loss/logits": 0.18985041994601487,
"step": 1380
},
{
"epoch": 0.0695,
"grad_norm": 31.25,
"grad_norm_var": 8.0431640625,
"learning_rate": 0.0001,
"loss": 7.0,
"loss/crossentropy": 1.7940153643488883,
"loss/hidden": 3.331640625,
"loss/jsd": 0.0,
"loss/logits": 0.18176266234368085,
"step": 1390
},
{
"epoch": 0.07,
"grad_norm": 33.0,
"grad_norm_var": 14.3041015625,
"learning_rate": 0.0001,
"loss": 6.898,
"loss/crossentropy": 1.8607503667473793,
"loss/hidden": 3.326953125,
"loss/jsd": 0.0,
"loss/logits": 0.17468307819217443,
"step": 1400
},
{
"epoch": 0.0705,
"grad_norm": 28.125,
"grad_norm_var": 13.432291666666666,
"learning_rate": 0.0001,
"loss": 7.031,
"loss/crossentropy": 1.6316836021840573,
"loss/hidden": 3.240234375,
"loss/jsd": 0.0,
"loss/logits": 0.15119749261066318,
"step": 1410
},
{
"epoch": 0.071,
"grad_norm": 28.25,
"grad_norm_var": 45.9634765625,
"learning_rate": 0.0001,
"loss": 7.1507,
"loss/crossentropy": 1.8821631267666816,
"loss/hidden": 3.465625,
"loss/jsd": 0.0,
"loss/logits": 0.19027305245399476,
"step": 1420
},
{
"epoch": 0.0715,
"grad_norm": 28.375,
"grad_norm_var": 46.1884765625,
"learning_rate": 0.0001,
"loss": 7.063,
"loss/crossentropy": 1.6992614693939685,
"loss/hidden": 3.4625,
"loss/jsd": 0.0,
"loss/logits": 0.2002884623594582,
"step": 1430
},
{
"epoch": 0.072,
"grad_norm": 29.625,
"grad_norm_var": 6.732291666666667,
"learning_rate": 0.0001,
"loss": 6.9439,
"loss/crossentropy": 1.7733798533678056,
"loss/hidden": 3.307421875,
"loss/jsd": 0.0,
"loss/logits": 0.17554995641112328,
"step": 1440
},
{
"epoch": 0.0725,
"grad_norm": 30.625,
"grad_norm_var": 24.97265625,
"learning_rate": 0.0001,
"loss": 7.0264,
"loss/crossentropy": 1.8444553710520268,
"loss/hidden": 3.412109375,
"loss/jsd": 0.0,
"loss/logits": 0.1976129287853837,
"step": 1450
},
{
"epoch": 0.073,
"grad_norm": 41.5,
"grad_norm_var": 18.2275390625,
"learning_rate": 0.0001,
"loss": 7.0056,
"loss/crossentropy": 1.778428715467453,
"loss/hidden": 3.3046875,
"loss/jsd": 0.0,
"loss/logits": 0.17879956895485521,
"step": 1460
},
{
"epoch": 0.0735,
"grad_norm": 40.75,
"grad_norm_var": 14.88515625,
"learning_rate": 0.0001,
"loss": 6.8647,
"loss/crossentropy": 1.8260969623923302,
"loss/hidden": 3.431640625,
"loss/jsd": 0.0,
"loss/logits": 0.18223165888339282,
"step": 1470
},
{
"epoch": 0.074,
"grad_norm": 30.75,
"grad_norm_var": 12.42265625,
"learning_rate": 0.0001,
"loss": 6.9814,
"loss/crossentropy": 1.852180902659893,
"loss/hidden": 3.188671875,
"loss/jsd": 0.0,
"loss/logits": 0.15915404492989182,
"step": 1480
},
{
"epoch": 0.0745,
"grad_norm": 32.0,
"grad_norm_var": 17.264518229166665,
"learning_rate": 0.0001,
"loss": 6.9467,
"loss/crossentropy": 1.8016018435359,
"loss/hidden": 3.30234375,
"loss/jsd": 0.0,
"loss/logits": 0.17374343778938056,
"step": 1490
},
{
"epoch": 0.075,
"grad_norm": 27.75,
"grad_norm_var": 16.795572916666668,
"learning_rate": 0.0001,
"loss": 6.9688,
"loss/crossentropy": 1.7803546212613583,
"loss/hidden": 3.230078125,
"loss/jsd": 0.0,
"loss/logits": 0.1623454326763749,
"step": 1500
},
{
"epoch": 0.0755,
"grad_norm": 27.125,
"grad_norm_var": 11.0072265625,
"learning_rate": 0.0001,
"loss": 6.9148,
"loss/crossentropy": 1.7990518882870674,
"loss/hidden": 3.341015625,
"loss/jsd": 0.0,
"loss/logits": 0.1776049867272377,
"step": 1510
},
{
"epoch": 0.076,
"grad_norm": 28.875,
"grad_norm_var": 9.0009765625,
"learning_rate": 0.0001,
"loss": 6.9834,
"loss/crossentropy": 1.7659361466765404,
"loss/hidden": 3.229296875,
"loss/jsd": 0.0,
"loss/logits": 0.17018448635935784,
"step": 1520
},
{
"epoch": 0.0765,
"grad_norm": 28.75,
"grad_norm_var": 5.566666666666666,
"learning_rate": 0.0001,
"loss": 6.9513,
"loss/crossentropy": 1.948898734152317,
"loss/hidden": 3.368359375,
"loss/jsd": 0.0,
"loss/logits": 0.20332392100244762,
"step": 1530
},
{
"epoch": 0.077,
"grad_norm": 37.0,
"grad_norm_var": 12.0337890625,
"learning_rate": 0.0001,
"loss": 6.9845,
"loss/crossentropy": 1.897236557304859,
"loss/hidden": 3.305078125,
"loss/jsd": 0.0,
"loss/logits": 0.1786106862127781,
"step": 1540
},
{
"epoch": 0.0775,
"grad_norm": 30.75,
"grad_norm_var": 10.74140625,
"learning_rate": 0.0001,
"loss": 6.9651,
"loss/crossentropy": 1.668473443388939,
"loss/hidden": 3.30390625,
"loss/jsd": 0.0,
"loss/logits": 0.18010491924360394,
"step": 1550
},
{
"epoch": 0.078,
"grad_norm": 35.0,
"grad_norm_var": 11.645768229166666,
"learning_rate": 0.0001,
"loss": 7.0873,
"loss/crossentropy": 1.8844516187906266,
"loss/hidden": 3.323828125,
"loss/jsd": 0.0,
"loss/logits": 0.19164156243205072,
"step": 1560
},
{
"epoch": 0.0785,
"grad_norm": 36.5,
"grad_norm_var": 9.326497395833334,
"learning_rate": 0.0001,
"loss": 6.9175,
"loss/crossentropy": 1.7603260070085525,
"loss/hidden": 3.276953125,
"loss/jsd": 0.0,
"loss/logits": 0.17738686297088863,
"step": 1570
},
{
"epoch": 0.079,
"grad_norm": 28.25,
"grad_norm_var": 11.4259765625,
"learning_rate": 0.0001,
"loss": 7.0352,
"loss/crossentropy": 1.8728493131697177,
"loss/hidden": 3.341796875,
"loss/jsd": 0.0,
"loss/logits": 0.19688725294545292,
"step": 1580
},
{
"epoch": 0.0795,
"grad_norm": 29.25,
"grad_norm_var": 8.5375,
"learning_rate": 0.0001,
"loss": 6.955,
"loss/crossentropy": 1.8099886417388915,
"loss/hidden": 3.29375,
"loss/jsd": 0.0,
"loss/logits": 0.18610341083258392,
"step": 1590
},
{
"epoch": 0.08,
"grad_norm": 36.0,
"grad_norm_var": 19.722330729166668,
"learning_rate": 0.0001,
"loss": 6.9313,
"loss/crossentropy": 1.7017989411950112,
"loss/hidden": 3.35234375,
"loss/jsd": 0.0,
"loss/logits": 0.17710780492052436,
"step": 1600
},
{
"epoch": 0.0805,
"grad_norm": 32.25,
"grad_norm_var": 21.603125,
"learning_rate": 0.0001,
"loss": 7.069,
"loss/crossentropy": 1.7873531341552735,
"loss/hidden": 3.333203125,
"loss/jsd": 0.0,
"loss/logits": 0.1812642457894981,
"step": 1610
},
{
"epoch": 0.081,
"grad_norm": 28.875,
"grad_norm_var": 3.2207682291666666,
"learning_rate": 0.0001,
"loss": 7.0405,
"loss/crossentropy": 1.7903928458690643,
"loss/hidden": 3.394921875,
"loss/jsd": 0.0,
"loss/logits": 0.19645511778071523,
"step": 1620
},
{
"epoch": 0.0815,
"grad_norm": 29.75,
"grad_norm_var": 2.874739583333333,
"learning_rate": 0.0001,
"loss": 7.0022,
"loss/crossentropy": 1.6019535034894943,
"loss/hidden": 3.271484375,
"loss/jsd": 0.0,
"loss/logits": 0.1628541074693203,
"step": 1630
},
{
"epoch": 0.082,
"grad_norm": 31.375,
"grad_norm_var": 6.37265625,
"learning_rate": 0.0001,
"loss": 6.7734,
"loss/crossentropy": 1.7893570616841317,
"loss/hidden": 3.371484375,
"loss/jsd": 0.0,
"loss/logits": 0.2000499103218317,
"step": 1640
},
{
"epoch": 0.0825,
"grad_norm": 30.5,
"grad_norm_var": 6.910416666666666,
"learning_rate": 0.0001,
"loss": 6.9578,
"loss/crossentropy": 1.6443258710205555,
"loss/hidden": 3.259765625,
"loss/jsd": 0.0,
"loss/logits": 0.16416865289211274,
"step": 1650
},
{
"epoch": 0.083,
"grad_norm": 30.5,
"grad_norm_var": 35.25182291666667,
"learning_rate": 0.0001,
"loss": 7.0861,
"loss/crossentropy": 1.8358689159154893,
"loss/hidden": 3.28359375,
"loss/jsd": 0.0,
"loss/logits": 0.1853348884731531,
"step": 1660
},
{
"epoch": 0.0835,
"grad_norm": 30.0,
"grad_norm_var": 15.6587890625,
"learning_rate": 0.0001,
"loss": 6.9008,
"loss/crossentropy": 1.9014468491077423,
"loss/hidden": 3.34140625,
"loss/jsd": 0.0,
"loss/logits": 0.19975380562245845,
"step": 1670
},
{
"epoch": 0.084,
"grad_norm": 28.25,
"grad_norm_var": 4.9666015625,
"learning_rate": 0.0001,
"loss": 7.0062,
"loss/crossentropy": 1.7637556672096253,
"loss/hidden": 3.40703125,
"loss/jsd": 0.0,
"loss/logits": 0.19306765552610158,
"step": 1680
},
{
"epoch": 0.0845,
"grad_norm": 44.0,
"grad_norm_var": 14.08125,
"learning_rate": 0.0001,
"loss": 6.9184,
"loss/crossentropy": 1.7980270460247993,
"loss/hidden": 3.336328125,
"loss/jsd": 0.0,
"loss/logits": 0.17251317510381342,
"step": 1690
},
{
"epoch": 0.085,
"grad_norm": 30.0,
"grad_norm_var": 16.656184895833334,
"learning_rate": 0.0001,
"loss": 6.8985,
"loss/crossentropy": 1.9003560155630113,
"loss/hidden": 3.336328125,
"loss/jsd": 0.0,
"loss/logits": 0.19372209012508393,
"step": 1700
},
{
"epoch": 0.0855,
"grad_norm": 28.375,
"grad_norm_var": 4.02265625,
"learning_rate": 0.0001,
"loss": 6.8638,
"loss/crossentropy": 1.7488896727561951,
"loss/hidden": 3.31484375,
"loss/jsd": 0.0,
"loss/logits": 0.16111841816455125,
"step": 1710
},
{
"epoch": 0.086,
"grad_norm": 4362076160.0,
"grad_norm_var": 1.1892317599584748e+18,
"learning_rate": 0.0001,
"loss": 7.061,
"loss/crossentropy": 1.7708093903958797,
"loss/hidden": 3.35625,
"loss/jsd": 0.0,
"loss/logits": 0.19512660000473261,
"step": 1720
},
{
"epoch": 0.0865,
"grad_norm": 30.375,
"grad_norm_var": 1.1892317591996554e+18,
"learning_rate": 0.0001,
"loss": 6.8861,
"loss/crossentropy": 1.6944726780056953,
"loss/hidden": 3.333203125,
"loss/jsd": 0.0,
"loss/logits": 0.16455791369080544,
"step": 1730
},
{
"epoch": 0.087,
"grad_norm": 29.375,
"grad_norm_var": 3.2905598958333333,
"learning_rate": 0.0001,
"loss": 6.8425,
"loss/crossentropy": 1.7352489478886128,
"loss/hidden": 3.32421875,
"loss/jsd": 0.0,
"loss/logits": 0.16651339596137404,
"step": 1740
},
{
"epoch": 0.0875,
"grad_norm": 29.875,
"grad_norm_var": 1.81015625,
"learning_rate": 0.0001,
"loss": 6.886,
"loss/crossentropy": 1.775932352244854,
"loss/hidden": 3.375,
"loss/jsd": 0.0,
"loss/logits": 0.18791395220905543,
"step": 1750
},
{
"epoch": 0.088,
"grad_norm": 29.25,
"grad_norm_var": 2.9848307291666667,
"learning_rate": 0.0001,
"loss": 6.8755,
"loss/crossentropy": 1.700956543534994,
"loss/hidden": 3.359765625,
"loss/jsd": 0.0,
"loss/logits": 0.17034402694553136,
"step": 1760
},
{
"epoch": 0.0885,
"grad_norm": 30.375,
"grad_norm_var": 2.0660807291666665,
"learning_rate": 0.0001,
"loss": 6.9996,
"loss/crossentropy": 1.6696124613285064,
"loss/hidden": 3.317578125,
"loss/jsd": 0.0,
"loss/logits": 0.17471891567111014,
"step": 1770
},
{
"epoch": 0.089,
"grad_norm": 29.0,
"grad_norm_var": 2.7729166666666667,
"learning_rate": 0.0001,
"loss": 6.8325,
"loss/crossentropy": 1.6660587199032306,
"loss/hidden": 3.328515625,
"loss/jsd": 0.0,
"loss/logits": 0.1662266943603754,
"step": 1780
},
{
"epoch": 0.0895,
"grad_norm": 32.5,
"grad_norm_var": 4.6900390625,
"learning_rate": 0.0001,
"loss": 6.947,
"loss/crossentropy": 1.8900059774518012,
"loss/hidden": 3.315234375,
"loss/jsd": 0.0,
"loss/logits": 0.18781680446118115,
"step": 1790
},
{
"epoch": 0.09,
"grad_norm": 30.0,
"grad_norm_var": 4.231705729166666,
"learning_rate": 0.0001,
"loss": 6.9437,
"loss/crossentropy": 1.8869778975844382,
"loss/hidden": 3.269921875,
"loss/jsd": 0.0,
"loss/logits": 0.17426692880690098,
"step": 1800
},
{
"epoch": 0.0905,
"grad_norm": 33.0,
"grad_norm_var": 2.8309895833333334,
"learning_rate": 0.0001,
"loss": 6.9652,
"loss/crossentropy": 1.8232818126678467,
"loss/hidden": 3.331640625,
"loss/jsd": 0.0,
"loss/logits": 0.16745625659823418,
"step": 1810
},
{
"epoch": 0.091,
"grad_norm": 34.25,
"grad_norm_var": 4.40390625,
"learning_rate": 0.0001,
"loss": 7.0219,
"loss/crossentropy": 1.8258642494678496,
"loss/hidden": 3.315234375,
"loss/jsd": 0.0,
"loss/logits": 0.19198300442658364,
"step": 1820
},
{
"epoch": 0.0915,
"grad_norm": 32.25,
"grad_norm_var": 8.268684895833333,
"learning_rate": 0.0001,
"loss": 6.8434,
"loss/crossentropy": 1.7024194486439228,
"loss/hidden": 3.409765625,
"loss/jsd": 0.0,
"loss/logits": 0.18930096151307224,
"step": 1830
},
{
"epoch": 0.092,
"grad_norm": 31.625,
"grad_norm_var": 6.74765625,
"learning_rate": 0.0001,
"loss": 6.9231,
"loss/crossentropy": 1.7479817308485508,
"loss/hidden": 3.3453125,
"loss/jsd": 0.0,
"loss/logits": 0.1829341644886881,
"step": 1840
},
{
"epoch": 0.0925,
"grad_norm": 33.75,
"grad_norm_var": 4.48515625,
"learning_rate": 0.0001,
"loss": 7.0635,
"loss/crossentropy": 2.0127600729465485,
"loss/hidden": 3.2953125,
"loss/jsd": 0.0,
"loss/logits": 0.18128359764814378,
"step": 1850
},
{
"epoch": 0.093,
"grad_norm": 31.75,
"grad_norm_var": 11.642708333333333,
"learning_rate": 0.0001,
"loss": 6.9505,
"loss/crossentropy": 1.7567149683833123,
"loss/hidden": 3.343359375,
"loss/jsd": 0.0,
"loss/logits": 0.1842447452247143,
"step": 1860
},
{
"epoch": 0.0935,
"grad_norm": 34.5,
"grad_norm_var": 1.5832967231255347e+18,
"learning_rate": 0.0001,
"loss": 7.1294,
"loss/crossentropy": 1.8183075070381165,
"loss/hidden": 3.25,
"loss/jsd": 0.0,
"loss/logits": 0.17170923966914414,
"step": 1870
},
{
"epoch": 0.094,
"grad_norm": 36.0,
"grad_norm_var": 14.670833333333333,
"learning_rate": 0.0001,
"loss": 6.7269,
"loss/crossentropy": 1.6782560005784035,
"loss/hidden": 3.329296875,
"loss/jsd": 0.0,
"loss/logits": 0.16191824562847615,
"step": 1880
},
{
"epoch": 0.0945,
"grad_norm": 29.5,
"grad_norm_var": 8.283984344848707e+17,
"learning_rate": 0.0001,
"loss": 6.9423,
"loss/crossentropy": 1.7822233349084855,
"loss/hidden": 3.319140625,
"loss/jsd": 0.0,
"loss/logits": 0.15704208929091693,
"step": 1890
},
{
"epoch": 0.095,
"grad_norm": 27.25,
"grad_norm_var": 12.049739583333333,
"learning_rate": 0.0001,
"loss": 6.8598,
"loss/crossentropy": 1.8880347676575184,
"loss/hidden": 3.30546875,
"loss/jsd": 0.0,
"loss/logits": 0.18590961638838052,
"step": 1900
},
{
"epoch": 0.0955,
"grad_norm": 32.75,
"grad_norm_var": 6.827351348981094e+17,
"learning_rate": 0.0001,
"loss": 7.0671,
"loss/crossentropy": 1.6947499185800552,
"loss/hidden": 3.341015625,
"loss/jsd": 0.0,
"loss/logits": 0.17880834415555,
"step": 1910
},
{
"epoch": 0.096,
"grad_norm": 30.875,
"grad_norm_var": 7.036874278235887e+17,
"learning_rate": 0.0001,
"loss": 6.8978,
"loss/crossentropy": 1.6141892828047275,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.18202604549005627,
"step": 1920
},
{
"epoch": 0.0965,
"grad_norm": 29.625,
"grad_norm_var": 12.239583333333334,
"learning_rate": 0.0001,
"loss": 6.9659,
"loss/crossentropy": 1.7211613908410073,
"loss/hidden": 3.29453125,
"loss/jsd": 0.0,
"loss/logits": 0.19102244451642036,
"step": 1930
},
{
"epoch": 0.097,
"grad_norm": 28.375,
"grad_norm_var": 15.983268229166667,
"learning_rate": 0.0001,
"loss": 6.8912,
"loss/crossentropy": 1.7675188466906548,
"loss/hidden": 3.32421875,
"loss/jsd": 0.0,
"loss/logits": 0.19818378714844584,
"step": 1940
},
{
"epoch": 0.0975,
"grad_norm": 32.75,
"grad_norm_var": 9.306266259729068e+17,
"learning_rate": 0.0001,
"loss": 6.9645,
"loss/crossentropy": 1.7558425486087799,
"loss/hidden": 3.419921875,
"loss/jsd": 0.0,
"loss/logits": 0.1911760584451258,
"step": 1950
},
{
"epoch": 0.098,
"grad_norm": 27.625,
"grad_norm_var": 1.5205981735288307e+18,
"learning_rate": 0.0001,
"loss": 6.8635,
"loss/crossentropy": 1.7457415886223315,
"loss/hidden": 3.384765625,
"loss/jsd": 0.0,
"loss/logits": 0.1852768061682582,
"step": 1960
},
{
"epoch": 0.0985,
"grad_norm": 32.75,
"grad_norm_var": 14.7125,
"learning_rate": 0.0001,
"loss": 6.8508,
"loss/crossentropy": 1.683419554680586,
"loss/hidden": 3.337890625,
"loss/jsd": 0.0,
"loss/logits": 0.1731728465296328,
"step": 1970
},
{
"epoch": 0.099,
"grad_norm": 30.625,
"grad_norm_var": 1.0302687666727377e+18,
"learning_rate": 0.0001,
"loss": 7.0005,
"loss/crossentropy": 1.727415306866169,
"loss/hidden": 3.297265625,
"loss/jsd": 0.0,
"loss/logits": 0.18517111875116826,
"step": 1980
},
{
"epoch": 0.0995,
"grad_norm": 32.25,
"grad_norm_var": 22.14375,
"learning_rate": 0.0001,
"loss": 6.9138,
"loss/crossentropy": 1.8120180189609527,
"loss/hidden": 3.434375,
"loss/jsd": 0.0,
"loss/logits": 0.20129222217947246,
"step": 1990
},
{
"epoch": 0.1,
"grad_norm": 35.5,
"grad_norm_var": 8.491080729166667,
"learning_rate": 0.0001,
"loss": 6.9525,
"loss/crossentropy": 1.8299045406281949,
"loss/hidden": 3.251171875,
"loss/jsd": 0.0,
"loss/logits": 0.17095453599467875,
"step": 2000
},
{
"epoch": 0.1005,
"grad_norm": 32.75,
"grad_norm_var": 8.586458333333333,
"learning_rate": 0.0001,
"loss": 6.7871,
"loss/crossentropy": 1.7243870817124844,
"loss/hidden": 3.3703125,
"loss/jsd": 0.0,
"loss/logits": 0.16602067481726407,
"step": 2010
},
{
"epoch": 0.101,
"grad_norm": 29.625,
"grad_norm_var": 9.378125,
"learning_rate": 0.0001,
"loss": 6.855,
"loss/crossentropy": 1.6784847162663936,
"loss/hidden": 3.225,
"loss/jsd": 0.0,
"loss/logits": 0.16919725136831404,
"step": 2020
},
{
"epoch": 0.1015,
"grad_norm": 41.0,
"grad_norm_var": 112.83170572916667,
"learning_rate": 0.0001,
"loss": 6.9616,
"loss/crossentropy": 1.8477609053254127,
"loss/hidden": 3.259375,
"loss/jsd": 0.0,
"loss/logits": 0.16309508439153433,
"step": 2030
},
{
"epoch": 0.102,
"grad_norm": 30.0,
"grad_norm_var": 111.6259765625,
"learning_rate": 0.0001,
"loss": 6.9517,
"loss/crossentropy": 1.7308252967894078,
"loss/hidden": 3.202734375,
"loss/jsd": 0.0,
"loss/logits": 0.1722710312344134,
"step": 2040
},
{
"epoch": 0.1025,
"grad_norm": 30.625,
"grad_norm_var": 4.073893229166667,
"learning_rate": 0.0001,
"loss": 6.9088,
"loss/crossentropy": 1.7544417701661588,
"loss/hidden": 3.41796875,
"loss/jsd": 0.0,
"loss/logits": 0.19881883040070533,
"step": 2050
},
{
"epoch": 0.103,
"grad_norm": 38.0,
"grad_norm_var": 13.948958333333334,
"learning_rate": 0.0001,
"loss": 6.9474,
"loss/crossentropy": 1.9995075345039368,
"loss/hidden": 3.271875,
"loss/jsd": 0.0,
"loss/logits": 0.17399701047688723,
"step": 2060
},
{
"epoch": 0.1035,
"grad_norm": 31.75,
"grad_norm_var": 21.0744140625,
"learning_rate": 0.0001,
"loss": 6.8732,
"loss/crossentropy": 1.8493791602551937,
"loss/hidden": 3.2421875,
"loss/jsd": 0.0,
"loss/logits": 0.16063635479658842,
"step": 2070
},
{
"epoch": 0.104,
"grad_norm": 32.25,
"grad_norm_var": 17.897916666666667,
"learning_rate": 0.0001,
"loss": 6.9556,
"loss/crossentropy": 1.737601400911808,
"loss/hidden": 3.333984375,
"loss/jsd": 0.0,
"loss/logits": 0.18038861453533173,
"step": 2080
},
{
"epoch": 0.1045,
"grad_norm": 32.25,
"grad_norm_var": 3.38515625,
"learning_rate": 0.0001,
"loss": 6.979,
"loss/crossentropy": 1.7256839543581008,
"loss/hidden": 3.3515625,
"loss/jsd": 0.0,
"loss/logits": 0.19299248773604633,
"step": 2090
},
{
"epoch": 0.105,
"grad_norm": 31.0,
"grad_norm_var": 3.4853515625,
"learning_rate": 0.0001,
"loss": 6.8191,
"loss/crossentropy": 1.7587849080562592,
"loss/hidden": 3.25,
"loss/jsd": 0.0,
"loss/logits": 0.16214433256536723,
"step": 2100
},
{
"epoch": 0.1055,
"grad_norm": 33.5,
"grad_norm_var": 4.112239583333333,
"learning_rate": 0.0001,
"loss": 7.0774,
"loss/crossentropy": 2.092029668390751,
"loss/hidden": 3.332421875,
"loss/jsd": 0.0,
"loss/logits": 0.19293731367215514,
"step": 2110
},
{
"epoch": 0.106,
"grad_norm": 30.375,
"grad_norm_var": 5.1072265625,
"learning_rate": 0.0001,
"loss": 6.9724,
"loss/crossentropy": 1.7829479269683361,
"loss/hidden": 3.349609375,
"loss/jsd": 0.0,
"loss/logits": 0.19456620067358016,
"step": 2120
},
{
"epoch": 0.1065,
"grad_norm": 29.375,
"grad_norm_var": 20.4525390625,
"learning_rate": 0.0001,
"loss": 6.9908,
"loss/crossentropy": 1.7853210166096687,
"loss/hidden": 3.32265625,
"loss/jsd": 0.0,
"loss/logits": 0.18279874734580517,
"step": 2130
},
{
"epoch": 0.107,
"grad_norm": 36.5,
"grad_norm_var": 20.847330729166668,
"learning_rate": 0.0001,
"loss": 6.9787,
"loss/crossentropy": 1.8366479635238648,
"loss/hidden": 3.324609375,
"loss/jsd": 0.0,
"loss/logits": 0.17941316729411483,
"step": 2140
},
{
"epoch": 0.1075,
"grad_norm": 29.25,
"grad_norm_var": 5.1384765625,
"learning_rate": 0.0001,
"loss": 7.0703,
"loss/crossentropy": 1.8491265431046486,
"loss/hidden": 3.253515625,
"loss/jsd": 0.0,
"loss/logits": 0.1788581835106015,
"step": 2150
},
{
"epoch": 0.108,
"grad_norm": 28.5,
"grad_norm_var": 3.8082682291666665,
"learning_rate": 0.0001,
"loss": 7.0117,
"loss/crossentropy": 1.8718080654740334,
"loss/hidden": 3.36015625,
"loss/jsd": 0.0,
"loss/logits": 0.18362828250974417,
"step": 2160
},
{
"epoch": 0.1085,
"grad_norm": 31.375,
"grad_norm_var": 4.0541015625,
"learning_rate": 0.0001,
"loss": 6.9147,
"loss/crossentropy": 1.823565386980772,
"loss/hidden": 3.346875,
"loss/jsd": 0.0,
"loss/logits": 0.17529825307428837,
"step": 2170
},
{
"epoch": 0.109,
"grad_norm": 29.875,
"grad_norm_var": 3.1510416666666665,
"learning_rate": 0.0001,
"loss": 6.8799,
"loss/crossentropy": 1.8646746143698691,
"loss/hidden": 3.3625,
"loss/jsd": 0.0,
"loss/logits": 0.18420496406033635,
"step": 2180
},
{
"epoch": 0.1095,
"grad_norm": 28.0,
"grad_norm_var": 1.6061848958333333,
"learning_rate": 0.0001,
"loss": 6.9741,
"loss/crossentropy": 1.8418309345841408,
"loss/hidden": 3.289453125,
"loss/jsd": 0.0,
"loss/logits": 0.17159662526100875,
"step": 2190
},
{
"epoch": 0.11,
"grad_norm": 31.375,
"grad_norm_var": 2.4184895833333333,
"learning_rate": 0.0001,
"loss": 7.0042,
"loss/crossentropy": 1.8776386469602584,
"loss/hidden": 3.40390625,
"loss/jsd": 0.0,
"loss/logits": 0.187642621435225,
"step": 2200
},
{
"epoch": 0.1105,
"grad_norm": 29.125,
"grad_norm_var": 8.20781018083492e+17,
"learning_rate": 0.0001,
"loss": 6.9378,
"loss/crossentropy": 1.655004223436117,
"loss/hidden": 3.273046875,
"loss/jsd": 0.0,
"loss/logits": 0.1580679954495281,
"step": 2210
},
{
"epoch": 0.111,
"grad_norm": 30.125,
"grad_norm_var": 3.468489583333333,
"learning_rate": 0.0001,
"loss": 6.9831,
"loss/crossentropy": 1.792271687835455,
"loss/hidden": 3.277734375,
"loss/jsd": 0.0,
"loss/logits": 0.17089223572984338,
"step": 2220
},
{
"epoch": 0.1115,
"grad_norm": 34.75,
"grad_norm_var": 4.209375,
"learning_rate": 0.0001,
"loss": 6.8364,
"loss/crossentropy": 1.734425350278616,
"loss/hidden": 3.394140625,
"loss/jsd": 0.0,
"loss/logits": 0.18262410946190358,
"step": 2230
},
{
"epoch": 0.112,
"grad_norm": 27.0,
"grad_norm_var": 4.629166666666666,
"learning_rate": 0.0001,
"loss": 6.8305,
"loss/crossentropy": 1.772131036967039,
"loss/hidden": 3.274609375,
"loss/jsd": 0.0,
"loss/logits": 0.1691578391008079,
"step": 2240
},
{
"epoch": 0.1125,
"grad_norm": 29.125,
"grad_norm_var": 6.303580729166667,
"learning_rate": 0.0001,
"loss": 6.9967,
"loss/crossentropy": 1.9334307715296746,
"loss/hidden": 3.383203125,
"loss/jsd": 0.0,
"loss/logits": 0.19251629430800676,
"step": 2250
},
{
"epoch": 0.113,
"grad_norm": 36.5,
"grad_norm_var": 6.4791015625,
"learning_rate": 0.0001,
"loss": 6.981,
"loss/crossentropy": 1.887280984222889,
"loss/hidden": 3.358984375,
"loss/jsd": 0.0,
"loss/logits": 0.21319616939872504,
"step": 2260
},
{
"epoch": 0.1135,
"grad_norm": 28.75,
"grad_norm_var": 4.7009765625,
"learning_rate": 0.0001,
"loss": 7.0286,
"loss/crossentropy": 1.8285806521773338,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.18080311622470618,
"step": 2270
},
{
"epoch": 0.114,
"grad_norm": 31.0,
"grad_norm_var": 7.09375,
"learning_rate": 0.0001,
"loss": 6.863,
"loss/crossentropy": 1.6441345304250716,
"loss/hidden": 3.323046875,
"loss/jsd": 0.0,
"loss/logits": 0.18446694109588863,
"step": 2280
},
{
"epoch": 0.1145,
"grad_norm": 30.125,
"grad_norm_var": 9.029166666666667,
"learning_rate": 0.0001,
"loss": 6.8549,
"loss/crossentropy": 1.5048397369682789,
"loss/hidden": 3.359765625,
"loss/jsd": 0.0,
"loss/logits": 0.16425186553969978,
"step": 2290
},
{
"epoch": 0.115,
"grad_norm": 28.625,
"grad_norm_var": 3.9400390625,
"learning_rate": 0.0001,
"loss": 6.9448,
"loss/crossentropy": 1.7213742382824422,
"loss/hidden": 3.3328125,
"loss/jsd": 0.0,
"loss/logits": 0.17190376687794923,
"step": 2300
},
{
"epoch": 0.1155,
"grad_norm": 29.125,
"grad_norm_var": 51.71608072916667,
"learning_rate": 0.0001,
"loss": 7.0456,
"loss/crossentropy": 1.8745042860507966,
"loss/hidden": 3.3703125,
"loss/jsd": 0.0,
"loss/logits": 0.1922046933323145,
"step": 2310
},
{
"epoch": 0.116,
"grad_norm": 31.625,
"grad_norm_var": 5.101822916666666,
"learning_rate": 0.0001,
"loss": 7.0037,
"loss/crossentropy": 1.835337746143341,
"loss/hidden": 3.291015625,
"loss/jsd": 0.0,
"loss/logits": 0.172516768053174,
"step": 2320
},
{
"epoch": 0.1165,
"grad_norm": 29.625,
"grad_norm_var": 4.792122395833333,
"learning_rate": 0.0001,
"loss": 6.8605,
"loss/crossentropy": 1.7886844381690026,
"loss/hidden": 3.31328125,
"loss/jsd": 0.0,
"loss/logits": 0.17270518001168966,
"step": 2330
},
{
"epoch": 0.117,
"grad_norm": 30.875,
"grad_norm_var": 24.301041666666666,
"learning_rate": 0.0001,
"loss": 6.8857,
"loss/crossentropy": 1.8270663298666476,
"loss/hidden": 3.316796875,
"loss/jsd": 0.0,
"loss/logits": 0.17341279415413738,
"step": 2340
},
{
"epoch": 0.1175,
"grad_norm": 28.25,
"grad_norm_var": 23.795247395833332,
"learning_rate": 0.0001,
"loss": 6.981,
"loss/crossentropy": 1.7389558240771295,
"loss/hidden": 3.33671875,
"loss/jsd": 0.0,
"loss/logits": 0.20616078823804856,
"step": 2350
},
{
"epoch": 0.118,
"grad_norm": 31.75,
"grad_norm_var": 3.3712890625,
"learning_rate": 0.0001,
"loss": 6.9706,
"loss/crossentropy": 1.7505015313625336,
"loss/hidden": 3.2375,
"loss/jsd": 0.0,
"loss/logits": 0.1691287737339735,
"step": 2360
},
{
"epoch": 0.1185,
"grad_norm": 29.875,
"grad_norm_var": 3.7864583333333335,
"learning_rate": 0.0001,
"loss": 7.0493,
"loss/crossentropy": 1.8290210530161857,
"loss/hidden": 3.3515625,
"loss/jsd": 0.0,
"loss/logits": 0.17870840784162284,
"step": 2370
},
{
"epoch": 0.119,
"grad_norm": 28.75,
"grad_norm_var": 3.06640625,
"learning_rate": 0.0001,
"loss": 6.8945,
"loss/crossentropy": 1.7312066838145257,
"loss/hidden": 3.3453125,
"loss/jsd": 0.0,
"loss/logits": 0.16353450021706523,
"step": 2380
},
{
"epoch": 0.1195,
"grad_norm": 38.75,
"grad_norm_var": 8.985384797395922e+17,
"learning_rate": 0.0001,
"loss": 7.1346,
"loss/crossentropy": 1.8643671602010727,
"loss/hidden": 3.3734375,
"loss/jsd": 0.0,
"loss/logits": 0.18776546316221357,
"step": 2390
},
{
"epoch": 0.12,
"grad_norm": 33.25,
"grad_norm_var": 8.985384795065637e+17,
"learning_rate": 0.0001,
"loss": 7.0339,
"loss/crossentropy": 1.7668686166405678,
"loss/hidden": 3.4140625,
"loss/jsd": 0.0,
"loss/logits": 0.19443758334964514,
"step": 2400
},
{
"epoch": 0.1205,
"grad_norm": 30.25,
"grad_norm_var": 1.8852243670131978e+18,
"learning_rate": 0.0001,
"loss": 6.9626,
"loss/crossentropy": 1.8465783804655076,
"loss/hidden": 3.371875,
"loss/jsd": 0.0,
"loss/logits": 0.1860800025984645,
"step": 2410
},
{
"epoch": 0.121,
"grad_norm": 33.75,
"grad_norm_var": 1.8852243674568678e+18,
"learning_rate": 0.0001,
"loss": 6.8455,
"loss/crossentropy": 1.7212153851985932,
"loss/hidden": 3.36171875,
"loss/jsd": 0.0,
"loss/logits": 0.18209199868142606,
"step": 2420
},
{
"epoch": 0.1215,
"grad_norm": 28.0,
"grad_norm_var": 3.81015625,
"learning_rate": 0.0001,
"loss": 6.9986,
"loss/crossentropy": 1.898094529658556,
"loss/hidden": 3.375390625,
"loss/jsd": 0.0,
"loss/logits": 0.194298998080194,
"step": 2430
},
{
"epoch": 0.122,
"grad_norm": 27.5,
"grad_norm_var": 3.332291666666667,
"learning_rate": 0.0001,
"loss": 6.8924,
"loss/crossentropy": 1.7420293487608434,
"loss/hidden": 3.2546875,
"loss/jsd": 0.0,
"loss/logits": 0.161607267241925,
"step": 2440
},
{
"epoch": 0.1225,
"grad_norm": 33.0,
"grad_norm_var": 2.7622395833333333,
"learning_rate": 0.0001,
"loss": 6.8686,
"loss/crossentropy": 1.6050585605204106,
"loss/hidden": 3.371875,
"loss/jsd": 0.0,
"loss/logits": 0.17848586086183788,
"step": 2450
},
{
"epoch": 0.123,
"grad_norm": 28.75,
"grad_norm_var": 2.4400390625,
"learning_rate": 0.0001,
"loss": 6.9804,
"loss/crossentropy": 1.9553805246949196,
"loss/hidden": 3.3859375,
"loss/jsd": 0.0,
"loss/logits": 0.19847506172955037,
"step": 2460
},
{
"epoch": 0.1235,
"grad_norm": 29.875,
"grad_norm_var": 2.0791015625,
"learning_rate": 0.0001,
"loss": 6.9913,
"loss/crossentropy": 1.4568642482161522,
"loss/hidden": 3.335546875,
"loss/jsd": 0.0,
"loss/logits": 0.15850053485482932,
"step": 2470
},
{
"epoch": 0.124,
"grad_norm": 31.875,
"grad_norm_var": 4.3775390625,
"learning_rate": 0.0001,
"loss": 6.9326,
"loss/crossentropy": 1.6532236352562903,
"loss/hidden": 3.45859375,
"loss/jsd": 0.0,
"loss/logits": 0.18165745195001365,
"step": 2480
},
{
"epoch": 0.1245,
"grad_norm": 28.5,
"grad_norm_var": 4.522330729166667,
"learning_rate": 0.0001,
"loss": 7.005,
"loss/crossentropy": 1.6793559297919274,
"loss/hidden": 3.339453125,
"loss/jsd": 0.0,
"loss/logits": 0.17017313856631516,
"step": 2490
},
{
"epoch": 0.125,
"grad_norm": 30.75,
"grad_norm_var": 4.3353515625,
"learning_rate": 0.0001,
"loss": 7.0956,
"loss/crossentropy": 1.8292289204895495,
"loss/hidden": 3.38203125,
"loss/jsd": 0.0,
"loss/logits": 0.18509325329214335,
"step": 2500
},
{
"epoch": 0.1255,
"grad_norm": 30.875,
"grad_norm_var": 3.78125,
"learning_rate": 0.0001,
"loss": 6.9137,
"loss/crossentropy": 1.7439368188381195,
"loss/hidden": 3.3890625,
"loss/jsd": 0.0,
"loss/logits": 0.19252277240157128,
"step": 2510
},
{
"epoch": 0.126,
"grad_norm": 31.125,
"grad_norm_var": 1.1349774577470627e+18,
"learning_rate": 0.0001,
"loss": 7.051,
"loss/crossentropy": 2.0631623208522796,
"loss/hidden": 3.4265625,
"loss/jsd": 0.0,
"loss/logits": 0.22505897115916013,
"step": 2520
},
{
"epoch": 0.1265,
"grad_norm": 29.75,
"grad_norm_var": 1.1349774575828206e+18,
"learning_rate": 0.0001,
"loss": 7.1194,
"loss/crossentropy": 1.8867668241262436,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.20316522121429442,
"step": 2530
},
{
"epoch": 0.127,
"grad_norm": 28.25,
"grad_norm_var": 20.151822916666667,
"learning_rate": 0.0001,
"loss": 7.0832,
"loss/crossentropy": 1.8491319343447685,
"loss/hidden": 3.392578125,
"loss/jsd": 0.0,
"loss/logits": 0.19702840279787778,
"step": 2540
},
{
"epoch": 0.1275,
"grad_norm": 29.0,
"grad_norm_var": 11.6681640625,
"learning_rate": 0.0001,
"loss": 6.9728,
"loss/crossentropy": 1.8162995487451554,
"loss/hidden": 3.366015625,
"loss/jsd": 0.0,
"loss/logits": 0.18736656550318004,
"step": 2550
},
{
"epoch": 0.128,
"grad_norm": 34.5,
"grad_norm_var": 13.088997395833333,
"learning_rate": 0.0001,
"loss": 7.1137,
"loss/crossentropy": 2.031092081964016,
"loss/hidden": 3.447265625,
"loss/jsd": 0.0,
"loss/logits": 0.21819815230555833,
"step": 2560
},
{
"epoch": 0.1285,
"grad_norm": 31.125,
"grad_norm_var": 1.7945788315993818e+17,
"learning_rate": 0.0001,
"loss": 7.0175,
"loss/crossentropy": 1.731457906216383,
"loss/hidden": 3.4015625,
"loss/jsd": 0.0,
"loss/logits": 0.18550403621047734,
"step": 2570
},
{
"epoch": 0.129,
"grad_norm": 32.0,
"grad_norm_var": 1.794578832870256e+17,
"learning_rate": 0.0001,
"loss": 6.8552,
"loss/crossentropy": 1.8714622184634209,
"loss/hidden": 3.2890625,
"loss/jsd": 0.0,
"loss/logits": 0.1802680429071188,
"step": 2580
},
{
"epoch": 0.1295,
"grad_norm": 38.75,
"grad_norm_var": 11.655143229166667,
"learning_rate": 0.0001,
"loss": 6.9513,
"loss/crossentropy": 1.6536960810422898,
"loss/hidden": 3.426171875,
"loss/jsd": 0.0,
"loss/logits": 0.19141803495585918,
"step": 2590
},
{
"epoch": 0.13,
"grad_norm": 30.25,
"grad_norm_var": 10.824934895833334,
"learning_rate": 0.0001,
"loss": 7.0451,
"loss/crossentropy": 1.7446824312210083,
"loss/hidden": 3.437890625,
"loss/jsd": 0.0,
"loss/logits": 0.21996904909610748,
"step": 2600
},
{
"epoch": 0.1305,
"grad_norm": 32.0,
"grad_norm_var": 0.9895182291666667,
"learning_rate": 0.0001,
"loss": 6.9912,
"loss/crossentropy": 1.8711062870919704,
"loss/hidden": 3.344140625,
"loss/jsd": 0.0,
"loss/logits": 0.18015410769730805,
"step": 2610
},
{
"epoch": 0.131,
"grad_norm": 29.0,
"grad_norm_var": 1.9697265625,
"learning_rate": 0.0001,
"loss": 6.9974,
"loss/crossentropy": 1.7273207187652588,
"loss/hidden": 3.3171875,
"loss/jsd": 0.0,
"loss/logits": 0.17100013056769967,
"step": 2620
},
{
"epoch": 0.1315,
"grad_norm": 33.0,
"grad_norm_var": 0.9681640625,
"learning_rate": 0.0001,
"loss": 6.864,
"loss/crossentropy": 1.772182758897543,
"loss/hidden": 3.410546875,
"loss/jsd": 0.0,
"loss/logits": 0.18076814245432615,
"step": 2630
},
{
"epoch": 0.132,
"grad_norm": 29.25,
"grad_norm_var": 5.707291666666666,
"learning_rate": 0.0001,
"loss": 7.1259,
"loss/crossentropy": 1.7641409367322922,
"loss/hidden": 3.434375,
"loss/jsd": 0.0,
"loss/logits": 0.18833348713815212,
"step": 2640
},
{
"epoch": 0.1325,
"grad_norm": 40.75,
"grad_norm_var": 10.91015625,
"learning_rate": 0.0001,
"loss": 7.0193,
"loss/crossentropy": 1.859598373621702,
"loss/hidden": 3.394140625,
"loss/jsd": 0.0,
"loss/logits": 0.18878742419183253,
"step": 2650
},
{
"epoch": 0.133,
"grad_norm": 31.375,
"grad_norm_var": 18.1822265625,
"learning_rate": 0.0001,
"loss": 6.9707,
"loss/crossentropy": 1.7797490507364273,
"loss/hidden": 3.411328125,
"loss/jsd": 0.0,
"loss/logits": 0.20212376527488232,
"step": 2660
},
{
"epoch": 0.1335,
"grad_norm": 29.875,
"grad_norm_var": 11.162239583333333,
"learning_rate": 0.0001,
"loss": 7.0002,
"loss/crossentropy": 1.7839721478521824,
"loss/hidden": 3.3140625,
"loss/jsd": 0.0,
"loss/logits": 0.173302289377898,
"step": 2670
},
{
"epoch": 0.134,
"grad_norm": 27.125,
"grad_norm_var": 4.2009765625,
"learning_rate": 0.0001,
"loss": 6.9156,
"loss/crossentropy": 1.7781757101416589,
"loss/hidden": 3.390625,
"loss/jsd": 0.0,
"loss/logits": 0.19893121821805834,
"step": 2680
},
{
"epoch": 0.1345,
"grad_norm": 30.75,
"grad_norm_var": 36.837239583333336,
"learning_rate": 0.0001,
"loss": 7.0997,
"loss/crossentropy": 1.8467799574136734,
"loss/hidden": 3.41953125,
"loss/jsd": 0.0,
"loss/logits": 0.20002066995948553,
"step": 2690
},
{
"epoch": 0.135,
"grad_norm": 28.5,
"grad_norm_var": 37.431705729166666,
"learning_rate": 0.0001,
"loss": 6.9236,
"loss/crossentropy": 1.6248198747634888,
"loss/hidden": 3.335546875,
"loss/jsd": 0.0,
"loss/logits": 0.1642201030626893,
"step": 2700
},
{
"epoch": 0.1355,
"grad_norm": 34.0,
"grad_norm_var": 4.030989583333334,
"learning_rate": 0.0001,
"loss": 6.9361,
"loss/crossentropy": 1.7102701038122177,
"loss/hidden": 3.3609375,
"loss/jsd": 0.0,
"loss/logits": 0.16836816985160113,
"step": 2710
},
{
"epoch": 0.136,
"grad_norm": 26.0,
"grad_norm_var": 1.0907331108694131e+18,
"learning_rate": 0.0001,
"loss": 6.9167,
"loss/crossentropy": 1.8059025250375271,
"loss/hidden": 3.31796875,
"loss/jsd": 0.0,
"loss/logits": 0.16677290350198745,
"step": 2720
},
{
"epoch": 0.1365,
"grad_norm": 29.5,
"grad_norm_var": 6.2728515625,
"learning_rate": 0.0001,
"loss": 6.8796,
"loss/crossentropy": 1.776158544421196,
"loss/hidden": 3.4125,
"loss/jsd": 0.0,
"loss/logits": 0.1929216692224145,
"step": 2730
},
{
"epoch": 0.137,
"grad_norm": 28.75,
"grad_norm_var": 7.5431640625,
"learning_rate": 0.0001,
"loss": 6.8288,
"loss/crossentropy": 1.8780412912368774,
"loss/hidden": 3.397265625,
"loss/jsd": 0.0,
"loss/logits": 0.18471294036135077,
"step": 2740
},
{
"epoch": 0.1375,
"grad_norm": 33.0,
"grad_norm_var": 15.1947265625,
"learning_rate": 0.0001,
"loss": 6.9741,
"loss/crossentropy": 1.7919296585023403,
"loss/hidden": 3.40703125,
"loss/jsd": 0.0,
"loss/logits": 0.19309423677623272,
"step": 2750
},
{
"epoch": 0.138,
"grad_norm": 31.375,
"grad_norm_var": 16.696809895833333,
"learning_rate": 0.0001,
"loss": 6.9971,
"loss/crossentropy": 1.8414636544883252,
"loss/hidden": 3.330859375,
"loss/jsd": 0.0,
"loss/logits": 0.1888686059974134,
"step": 2760
},
{
"epoch": 0.1385,
"grad_norm": 28.5,
"grad_norm_var": 7.121875,
"learning_rate": 0.0001,
"loss": 6.9869,
"loss/crossentropy": 1.8438507467508316,
"loss/hidden": 3.357421875,
"loss/jsd": 0.0,
"loss/logits": 0.19185615349560975,
"step": 2770
},
{
"epoch": 0.139,
"grad_norm": 33.25,
"grad_norm_var": 10.338541666666666,
"learning_rate": 0.0001,
"loss": 6.9528,
"loss/crossentropy": 1.8890479058027267,
"loss/hidden": 3.384765625,
"loss/jsd": 0.0,
"loss/logits": 0.19027914050966502,
"step": 2780
},
{
"epoch": 0.1395,
"grad_norm": 33.5,
"grad_norm_var": 12.343684895833333,
"learning_rate": 0.0001,
"loss": 6.9585,
"loss/crossentropy": 1.6378353632986546,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.18243511486798525,
"step": 2790
},
{
"epoch": 0.14,
"grad_norm": 33.0,
"grad_norm_var": 7.9384765625,
"learning_rate": 0.0001,
"loss": 6.885,
"loss/crossentropy": 1.6422518469393252,
"loss/hidden": 3.26875,
"loss/jsd": 0.0,
"loss/logits": 0.15738149764947593,
"step": 2800
},
{
"epoch": 0.1405,
"grad_norm": 35.0,
"grad_norm_var": 7.362239583333333,
"learning_rate": 0.0001,
"loss": 6.9251,
"loss/crossentropy": 1.818039534240961,
"loss/hidden": 3.222265625,
"loss/jsd": 0.0,
"loss/logits": 0.17553653065115213,
"step": 2810
},
{
"epoch": 0.141,
"grad_norm": 28.875,
"grad_norm_var": 8.7134765625,
"learning_rate": 0.0001,
"loss": 6.9659,
"loss/crossentropy": 1.8913455709815026,
"loss/hidden": 3.325390625,
"loss/jsd": 0.0,
"loss/logits": 0.18545334562659263,
"step": 2820
},
{
"epoch": 0.1415,
"grad_norm": 27.5,
"grad_norm_var": 7.718684895833333,
"learning_rate": 0.0001,
"loss": 6.8653,
"loss/crossentropy": 1.9232856243848802,
"loss/hidden": 3.34921875,
"loss/jsd": 0.0,
"loss/logits": 0.19609272833913566,
"step": 2830
},
{
"epoch": 0.142,
"grad_norm": 31.75,
"grad_norm_var": 18.7166015625,
"learning_rate": 0.0001,
"loss": 6.9271,
"loss/crossentropy": 1.7873032443225383,
"loss/hidden": 3.2796875,
"loss/jsd": 0.0,
"loss/logits": 0.16436451440677047,
"step": 2840
},
{
"epoch": 0.1425,
"grad_norm": 31.375,
"grad_norm_var": 4.561393229166667,
"learning_rate": 0.0001,
"loss": 6.859,
"loss/crossentropy": 1.764283910393715,
"loss/hidden": 3.3890625,
"loss/jsd": 0.0,
"loss/logits": 0.18506875950843096,
"step": 2850
},
{
"epoch": 0.143,
"grad_norm": 30.125,
"grad_norm_var": 5.339322916666666,
"learning_rate": 0.0001,
"loss": 7.1328,
"loss/crossentropy": 1.746024763584137,
"loss/hidden": 3.333984375,
"loss/jsd": 0.0,
"loss/logits": 0.19091468937695028,
"step": 2860
},
{
"epoch": 0.1435,
"grad_norm": 31.125,
"grad_norm_var": 7.5875,
"learning_rate": 0.0001,
"loss": 6.8931,
"loss/crossentropy": 1.8621096529066563,
"loss/hidden": 3.2265625,
"loss/jsd": 0.0,
"loss/logits": 0.167528663482517,
"step": 2870
},
{
"epoch": 0.144,
"grad_norm": 32.25,
"grad_norm_var": 7.123372395833333,
"learning_rate": 0.0001,
"loss": 7.0369,
"loss/crossentropy": 1.9750339597463609,
"loss/hidden": 3.364453125,
"loss/jsd": 0.0,
"loss/logits": 0.20070471633225678,
"step": 2880
},
{
"epoch": 0.1445,
"grad_norm": 33.5,
"grad_norm_var": 14.7275390625,
"learning_rate": 0.0001,
"loss": 6.8862,
"loss/crossentropy": 1.74088372066617,
"loss/hidden": 3.320703125,
"loss/jsd": 0.0,
"loss/logits": 0.17231013607233764,
"step": 2890
},
{
"epoch": 0.145,
"grad_norm": 28.375,
"grad_norm_var": 19.409830729166668,
"learning_rate": 0.0001,
"loss": 7.035,
"loss/crossentropy": 1.7799094915390015,
"loss/hidden": 3.32265625,
"loss/jsd": 0.0,
"loss/logits": 0.18373552113771438,
"step": 2900
},
{
"epoch": 0.1455,
"grad_norm": 31.375,
"grad_norm_var": 5.517708333333333,
"learning_rate": 0.0001,
"loss": 6.9546,
"loss/crossentropy": 1.7803256064653397,
"loss/hidden": 3.34921875,
"loss/jsd": 0.0,
"loss/logits": 0.1977113801985979,
"step": 2910
},
{
"epoch": 0.146,
"grad_norm": 28.125,
"grad_norm_var": 5.627018229166667,
"learning_rate": 0.0001,
"loss": 6.9317,
"loss/crossentropy": 1.8050019271671771,
"loss/hidden": 3.257421875,
"loss/jsd": 0.0,
"loss/logits": 0.16629343312233685,
"step": 2920
},
{
"epoch": 0.1465,
"grad_norm": 34.5,
"grad_norm_var": 7.16640625,
"learning_rate": 0.0001,
"loss": 6.9453,
"loss/crossentropy": 1.8659825779497623,
"loss/hidden": 3.331640625,
"loss/jsd": 0.0,
"loss/logits": 0.1742606306448579,
"step": 2930
},
{
"epoch": 0.147,
"grad_norm": 35.5,
"grad_norm_var": 8.9306640625,
"learning_rate": 0.0001,
"loss": 7.0142,
"loss/crossentropy": 1.913654712587595,
"loss/hidden": 3.403515625,
"loss/jsd": 0.0,
"loss/logits": 0.20132352095097303,
"step": 2940
},
{
"epoch": 0.1475,
"grad_norm": 30.25,
"grad_norm_var": 6.614518229166666,
"learning_rate": 0.0001,
"loss": 6.9147,
"loss/crossentropy": 1.645759216696024,
"loss/hidden": 3.376171875,
"loss/jsd": 0.0,
"loss/logits": 0.16875347392633558,
"step": 2950
},
{
"epoch": 0.148,
"grad_norm": 29.0,
"grad_norm_var": 6.8322265625,
"learning_rate": 0.0001,
"loss": 6.9988,
"loss/crossentropy": 1.8556548431515694,
"loss/hidden": 3.39375,
"loss/jsd": 0.0,
"loss/logits": 0.17874295320361852,
"step": 2960
},
{
"epoch": 0.1485,
"grad_norm": 28.75,
"grad_norm_var": 3.5791666666666666,
"learning_rate": 0.0001,
"loss": 7.0313,
"loss/crossentropy": 1.688177353143692,
"loss/hidden": 3.28515625,
"loss/jsd": 0.0,
"loss/logits": 0.16950420523062348,
"step": 2970
},
{
"epoch": 0.149,
"grad_norm": 32.25,
"grad_norm_var": 2.246875,
"learning_rate": 0.0001,
"loss": 7.0247,
"loss/crossentropy": 2.071097436547279,
"loss/hidden": 3.404296875,
"loss/jsd": 0.0,
"loss/logits": 0.20375496093183756,
"step": 2980
},
{
"epoch": 0.1495,
"grad_norm": 27.5,
"grad_norm_var": 2.6014973958333334,
"learning_rate": 0.0001,
"loss": 7.0495,
"loss/crossentropy": 1.852598314732313,
"loss/hidden": 3.31328125,
"loss/jsd": 0.0,
"loss/logits": 0.16860631257295608,
"step": 2990
},
{
"epoch": 0.15,
"grad_norm": 27.0,
"grad_norm_var": 3.5122395833333333,
"learning_rate": 0.0001,
"loss": 6.7966,
"loss/crossentropy": 1.7948169738054276,
"loss/hidden": 3.325,
"loss/jsd": 0.0,
"loss/logits": 0.17319696098566056,
"step": 3000
},
{
"epoch": 0.1505,
"grad_norm": 28.875,
"grad_norm_var": 4.367122395833333,
"learning_rate": 0.0001,
"loss": 6.9423,
"loss/crossentropy": 1.6970888696610928,
"loss/hidden": 3.43203125,
"loss/jsd": 0.0,
"loss/logits": 0.16700221002101898,
"step": 3010
},
{
"epoch": 0.151,
"grad_norm": 3674210304.0,
"grad_norm_var": 2.0173451962123377e+18,
"learning_rate": 0.0001,
"loss": 6.9283,
"loss/crossentropy": 1.713117253035307,
"loss/hidden": 3.34375,
"loss/jsd": 0.0,
"loss/logits": 0.1704209728166461,
"step": 3020
},
{
"epoch": 0.1515,
"grad_norm": 31.375,
"grad_norm_var": 1.710129338897767e+18,
"learning_rate": 0.0001,
"loss": 7.0097,
"loss/crossentropy": 1.9506682097911834,
"loss/hidden": 3.407421875,
"loss/jsd": 0.0,
"loss/logits": 0.19250028654932977,
"step": 3030
},
{
"epoch": 0.152,
"grad_norm": 29.25,
"grad_norm_var": 2.1416666666666666,
"learning_rate": 0.0001,
"loss": 7.0202,
"loss/crossentropy": 1.831156849861145,
"loss/hidden": 3.307421875,
"loss/jsd": 0.0,
"loss/logits": 0.18563526798970997,
"step": 3040
},
{
"epoch": 0.1525,
"grad_norm": 30.625,
"grad_norm_var": 2.6768229166666666,
"learning_rate": 0.0001,
"loss": 7.0529,
"loss/crossentropy": 1.8806451916694642,
"loss/hidden": 3.405859375,
"loss/jsd": 0.0,
"loss/logits": 0.19239903232082725,
"step": 3050
},
{
"epoch": 0.153,
"grad_norm": 29.25,
"grad_norm_var": 2.8375,
"learning_rate": 0.0001,
"loss": 6.9243,
"loss/crossentropy": 1.8184577412903309,
"loss/hidden": 3.359765625,
"loss/jsd": 0.0,
"loss/logits": 0.173899077065289,
"step": 3060
},
{
"epoch": 0.1535,
"grad_norm": 30.5,
"grad_norm_var": 1.6489583333333333,
"learning_rate": 0.0001,
"loss": 6.901,
"loss/crossentropy": 1.782475320994854,
"loss/hidden": 3.303125,
"loss/jsd": 0.0,
"loss/logits": 0.17683281004428864,
"step": 3070
},
{
"epoch": 0.154,
"grad_norm": 30.125,
"grad_norm_var": 2.4770833333333333,
"learning_rate": 0.0001,
"loss": 7.0536,
"loss/crossentropy": 1.7542385324835776,
"loss/hidden": 3.31015625,
"loss/jsd": 0.0,
"loss/logits": 0.1734863522462547,
"step": 3080
},
{
"epoch": 0.1545,
"grad_norm": 31.375,
"grad_norm_var": 2.5077473958333334,
"learning_rate": 0.0001,
"loss": 6.7429,
"loss/crossentropy": 1.721788990870118,
"loss/hidden": 3.336328125,
"loss/jsd": 0.0,
"loss/logits": 0.1703654458746314,
"step": 3090
},
{
"epoch": 0.155,
"grad_norm": 40.25,
"grad_norm_var": 9.09140625,
"learning_rate": 0.0001,
"loss": 6.9729,
"loss/crossentropy": 1.6206283092498779,
"loss/hidden": 3.35859375,
"loss/jsd": 0.0,
"loss/logits": 0.1712807172909379,
"step": 3100
},
{
"epoch": 0.1555,
"grad_norm": 32.0,
"grad_norm_var": 8.16640625,
"learning_rate": 0.0001,
"loss": 6.8604,
"loss/crossentropy": 1.7044736705720425,
"loss/hidden": 3.247265625,
"loss/jsd": 0.0,
"loss/logits": 0.16109976628795267,
"step": 3110
},
{
"epoch": 0.156,
"grad_norm": 28.875,
"grad_norm_var": 61.90305989583333,
"learning_rate": 0.0001,
"loss": 6.8603,
"loss/crossentropy": 1.7201604932546615,
"loss/hidden": 3.3421875,
"loss/jsd": 0.0,
"loss/logits": 0.1717333897948265,
"step": 3120
},
{
"epoch": 0.1565,
"grad_norm": 29.25,
"grad_norm_var": 3.2666015625,
"learning_rate": 0.0001,
"loss": 6.9316,
"loss/crossentropy": 1.611024511605501,
"loss/hidden": 3.331640625,
"loss/jsd": 0.0,
"loss/logits": 0.17799030421301723,
"step": 3130
},
{
"epoch": 0.157,
"grad_norm": 29.25,
"grad_norm_var": 6.059830729166666,
"learning_rate": 0.0001,
"loss": 6.8749,
"loss/crossentropy": 1.542306227236986,
"loss/hidden": 3.2828125,
"loss/jsd": 0.0,
"loss/logits": 0.17464940482750535,
"step": 3140
},
{
"epoch": 0.1575,
"grad_norm": 30.75,
"grad_norm_var": 4.820572916666666,
"learning_rate": 0.0001,
"loss": 6.8917,
"loss/crossentropy": 1.7465024203062058,
"loss/hidden": 3.3546875,
"loss/jsd": 0.0,
"loss/logits": 0.18054623370990158,
"step": 3150
},
{
"epoch": 0.158,
"grad_norm": 31.5,
"grad_norm_var": 2.787239583333333,
"learning_rate": 0.0001,
"loss": 6.969,
"loss/crossentropy": 2.0858161732554437,
"loss/hidden": 3.28125,
"loss/jsd": 0.0,
"loss/logits": 0.18169568832963706,
"step": 3160
},
{
"epoch": 0.1585,
"grad_norm": 29.5,
"grad_norm_var": 4.023372395833333,
"learning_rate": 0.0001,
"loss": 6.8406,
"loss/crossentropy": 1.9426328182220458,
"loss/hidden": 3.324609375,
"loss/jsd": 0.0,
"loss/logits": 0.17592350710183383,
"step": 3170
},
{
"epoch": 0.159,
"grad_norm": 29.125,
"grad_norm_var": 1.5832967238438093e+18,
"learning_rate": 0.0001,
"loss": 6.9453,
"loss/crossentropy": 1.8308497540652753,
"loss/hidden": 3.603125,
"loss/jsd": 0.0,
"loss/logits": 0.19216080345213413,
"step": 3180
},
{
"epoch": 0.1595,
"grad_norm": 29.5,
"grad_norm_var": 1.5832967237861376e+18,
"learning_rate": 0.0001,
"loss": 6.9032,
"loss/crossentropy": 1.705291760712862,
"loss/hidden": 3.383984375,
"loss/jsd": 0.0,
"loss/logits": 0.1843032216653228,
"step": 3190
},
{
"epoch": 0.16,
"grad_norm": 40.25,
"grad_norm_var": 19.5056640625,
"learning_rate": 0.0001,
"loss": 7.0209,
"loss/crossentropy": 1.7651132240891456,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.1844408256933093,
"step": 3200
},
{
"epoch": 0.1605,
"grad_norm": 38.0,
"grad_norm_var": 6.217782109866559e+17,
"learning_rate": 0.0001,
"loss": 6.7736,
"loss/crossentropy": 1.8051001697778701,
"loss/hidden": 3.38984375,
"loss/jsd": 0.0,
"loss/logits": 0.17440476845949887,
"step": 3210
},
{
"epoch": 0.161,
"grad_norm": 31.125,
"grad_norm_var": 6.428059895833333,
"learning_rate": 0.0001,
"loss": 6.9109,
"loss/crossentropy": 1.8851144686341286,
"loss/hidden": 3.2359375,
"loss/jsd": 0.0,
"loss/logits": 0.17897074315696954,
"step": 3220
},
{
"epoch": 0.1615,
"grad_norm": 30.125,
"grad_norm_var": 17.601041666666667,
"learning_rate": 0.0001,
"loss": 6.9799,
"loss/crossentropy": 1.6312229566276073,
"loss/hidden": 3.2609375,
"loss/jsd": 0.0,
"loss/logits": 0.16838383311405777,
"step": 3230
},
{
"epoch": 0.162,
"grad_norm": 31.5,
"grad_norm_var": 20.835872395833334,
"learning_rate": 0.0001,
"loss": 6.932,
"loss/crossentropy": 2.011029013991356,
"loss/hidden": 3.310546875,
"loss/jsd": 0.0,
"loss/logits": 0.1832389457151294,
"step": 3240
},
{
"epoch": 0.1625,
"grad_norm": 28.375,
"grad_norm_var": 7.161458333333333,
"learning_rate": 0.0001,
"loss": 7.0405,
"loss/crossentropy": 1.8453179642558097,
"loss/hidden": 3.434765625,
"loss/jsd": 0.0,
"loss/logits": 0.19180234288796782,
"step": 3250
},
{
"epoch": 0.163,
"grad_norm": 36.5,
"grad_norm_var": 10.517708333333333,
"learning_rate": 0.0001,
"loss": 6.823,
"loss/crossentropy": 1.9555616907775402,
"loss/hidden": 3.318359375,
"loss/jsd": 0.0,
"loss/logits": 0.17895318511873484,
"step": 3260
},
{
"epoch": 0.1635,
"grad_norm": 29.125,
"grad_norm_var": 8.909830729166666,
"learning_rate": 0.0001,
"loss": 6.892,
"loss/crossentropy": 1.843096625804901,
"loss/hidden": 3.33203125,
"loss/jsd": 0.0,
"loss/logits": 0.18395393253304065,
"step": 3270
},
{
"epoch": 0.164,
"grad_norm": 27.875,
"grad_norm_var": 7.260416666666667,
"learning_rate": 0.0001,
"loss": 6.9288,
"loss/crossentropy": 1.688144066929817,
"loss/hidden": 3.277734375,
"loss/jsd": 0.0,
"loss/logits": 0.172001248691231,
"step": 3280
},
{
"epoch": 0.1645,
"grad_norm": 37.5,
"grad_norm_var": 12.014518229166667,
"learning_rate": 0.0001,
"loss": 6.9012,
"loss/crossentropy": 1.6900858603417874,
"loss/hidden": 3.346875,
"loss/jsd": 0.0,
"loss/logits": 0.1850940717384219,
"step": 3290
},
{
"epoch": 0.165,
"grad_norm": 30.5,
"grad_norm_var": 11.887955729166666,
"learning_rate": 0.0001,
"loss": 7.0327,
"loss/crossentropy": 1.8690055832266808,
"loss/hidden": 3.41015625,
"loss/jsd": 0.0,
"loss/logits": 0.2061467545107007,
"step": 3300
},
{
"epoch": 0.1655,
"grad_norm": 33.25,
"grad_norm_var": 44.0900390625,
"learning_rate": 0.0001,
"loss": 6.9398,
"loss/crossentropy": 1.864616620540619,
"loss/hidden": 3.21875,
"loss/jsd": 0.0,
"loss/logits": 0.15337421298027037,
"step": 3310
},
{
"epoch": 0.166,
"grad_norm": 37.25,
"grad_norm_var": 45.87473958333333,
"learning_rate": 0.0001,
"loss": 6.9275,
"loss/crossentropy": 1.8501743324100972,
"loss/hidden": 3.266796875,
"loss/jsd": 0.0,
"loss/logits": 0.17113643269985915,
"step": 3320
},
{
"epoch": 0.1665,
"grad_norm": 29.625,
"grad_norm_var": 1.1349774579334994e+18,
"learning_rate": 0.0001,
"loss": 7.0081,
"loss/crossentropy": 1.779020744562149,
"loss/hidden": 3.323046875,
"loss/jsd": 0.0,
"loss/logits": 0.1846176441758871,
"step": 3330
},
{
"epoch": 0.167,
"grad_norm": 35.75,
"grad_norm_var": 1.0819897936507308e+18,
"learning_rate": 0.0001,
"loss": 6.9779,
"loss/crossentropy": 1.7754384666681289,
"loss/hidden": 3.36796875,
"loss/jsd": 0.0,
"loss/logits": 0.19158907625824212,
"step": 3340
},
{
"epoch": 0.1675,
"grad_norm": 34.25,
"grad_norm_var": 1.081989793663733e+18,
"learning_rate": 0.0001,
"loss": 7.0539,
"loss/crossentropy": 1.759375052154064,
"loss/hidden": 3.32109375,
"loss/jsd": 0.0,
"loss/logits": 0.18603504877537488,
"step": 3350
},
{
"epoch": 0.168,
"grad_norm": 29.875,
"grad_norm_var": 5.620833333333334,
"learning_rate": 0.0001,
"loss": 6.8589,
"loss/crossentropy": 1.845319252461195,
"loss/hidden": 3.3328125,
"loss/jsd": 0.0,
"loss/logits": 0.18480119155719876,
"step": 3360
},
{
"epoch": 0.1685,
"grad_norm": 34.5,
"grad_norm_var": 19.439583333333335,
"learning_rate": 0.0001,
"loss": 6.9772,
"loss/crossentropy": 1.6411745361983776,
"loss/hidden": 3.321484375,
"loss/jsd": 0.0,
"loss/logits": 0.15529545303434134,
"step": 3370
},
{
"epoch": 0.169,
"grad_norm": 28.5,
"grad_norm_var": 36.9103515625,
"learning_rate": 0.0001,
"loss": 6.8489,
"loss/crossentropy": 1.7360669024288655,
"loss/hidden": 3.327734375,
"loss/jsd": 0.0,
"loss/logits": 0.17661824598908424,
"step": 3380
},
{
"epoch": 0.1695,
"grad_norm": 29.375,
"grad_norm_var": 35.46848958333333,
"learning_rate": 0.0001,
"loss": 6.7757,
"loss/crossentropy": 1.7902205429971219,
"loss/hidden": 3.38125,
"loss/jsd": 0.0,
"loss/logits": 0.17157120602205395,
"step": 3390
},
{
"epoch": 0.17,
"grad_norm": 28.875,
"grad_norm_var": 3.6395833333333334,
"learning_rate": 0.0001,
"loss": 6.8708,
"loss/crossentropy": 1.842449489980936,
"loss/hidden": 3.29921875,
"loss/jsd": 0.0,
"loss/logits": 0.16762932492420077,
"step": 3400
},
{
"epoch": 0.1705,
"grad_norm": 37.0,
"grad_norm_var": 6.513997395833333,
"learning_rate": 0.0001,
"loss": 6.8956,
"loss/crossentropy": 1.7051387749612332,
"loss/hidden": 3.3078125,
"loss/jsd": 0.0,
"loss/logits": 0.16933946274220943,
"step": 3410
},
{
"epoch": 0.171,
"grad_norm": 30.75,
"grad_norm_var": 9.762239583333333,
"learning_rate": 0.0001,
"loss": 6.9733,
"loss/crossentropy": 1.7448437750339507,
"loss/hidden": 3.3890625,
"loss/jsd": 0.0,
"loss/logits": 0.20119084492325784,
"step": 3420
},
{
"epoch": 0.1715,
"grad_norm": 78.0,
"grad_norm_var": 144.5125,
"learning_rate": 0.0001,
"loss": 7.0287,
"loss/crossentropy": 1.824779784679413,
"loss/hidden": 3.37421875,
"loss/jsd": 0.0,
"loss/logits": 0.17832597270607947,
"step": 3430
},
{
"epoch": 0.172,
"grad_norm": 28.125,
"grad_norm_var": 145.68430989583334,
"learning_rate": 0.0001,
"loss": 6.7435,
"loss/crossentropy": 1.6466563902795315,
"loss/hidden": 3.328515625,
"loss/jsd": 0.0,
"loss/logits": 0.16660706931725144,
"step": 3440
},
{
"epoch": 0.1725,
"grad_norm": 29.875,
"grad_norm_var": 8.811393229166667,
"learning_rate": 0.0001,
"loss": 6.934,
"loss/crossentropy": 1.8493422105908395,
"loss/hidden": 3.333984375,
"loss/jsd": 0.0,
"loss/logits": 0.2010068495757878,
"step": 3450
},
{
"epoch": 0.173,
"grad_norm": 29.0,
"grad_norm_var": 6.62890625,
"learning_rate": 0.0001,
"loss": 6.8496,
"loss/crossentropy": 1.6380462288856505,
"loss/hidden": 3.26015625,
"loss/jsd": 0.0,
"loss/logits": 0.18449038956314326,
"step": 3460
},
{
"epoch": 0.1735,
"grad_norm": 32.75,
"grad_norm_var": 32.5875,
"learning_rate": 0.0001,
"loss": 6.9309,
"loss/crossentropy": 1.6813900470733643,
"loss/hidden": 3.400390625,
"loss/jsd": 0.0,
"loss/logits": 0.18437479846179486,
"step": 3470
},
{
"epoch": 0.174,
"grad_norm": 31.625,
"grad_norm_var": 7.465419918819722e+17,
"learning_rate": 0.0001,
"loss": 7.1677,
"loss/crossentropy": 1.789808637648821,
"loss/hidden": 3.343359375,
"loss/jsd": 0.0,
"loss/logits": 0.17758243400603532,
"step": 3480
},
{
"epoch": 0.1745,
"grad_norm": 29.75,
"grad_norm_var": 58.84557291666667,
"learning_rate": 0.0001,
"loss": 6.8709,
"loss/crossentropy": 1.8069385841488839,
"loss/hidden": 3.403125,
"loss/jsd": 0.0,
"loss/logits": 0.19292932376265526,
"step": 3490
},
{
"epoch": 0.175,
"grad_norm": 28.25,
"grad_norm_var": 13.452018229166667,
"learning_rate": 0.0001,
"loss": 6.9084,
"loss/crossentropy": 1.6264689728617667,
"loss/hidden": 3.269140625,
"loss/jsd": 0.0,
"loss/logits": 0.16363061694428324,
"step": 3500
},
{
"epoch": 0.1755,
"grad_norm": 32.75,
"grad_norm_var": 1.459166261163747e+18,
"learning_rate": 0.0001,
"loss": 6.9837,
"loss/crossentropy": 1.7061957284808158,
"loss/hidden": 3.411328125,
"loss/jsd": 0.0,
"loss/logits": 0.18923843959346415,
"step": 3510
},
{
"epoch": 0.176,
"grad_norm": 29.75,
"grad_norm_var": 1.459166260217512e+18,
"learning_rate": 0.0001,
"loss": 6.9459,
"loss/crossentropy": 1.6986562974750996,
"loss/hidden": 3.453515625,
"loss/jsd": 0.0,
"loss/logits": 0.18663678420707583,
"step": 3520
},
{
"epoch": 0.1765,
"grad_norm": 31.0,
"grad_norm_var": 1.8478515625,
"learning_rate": 0.0001,
"loss": 6.9793,
"loss/crossentropy": 1.7609238177537918,
"loss/hidden": 3.38515625,
"loss/jsd": 0.0,
"loss/logits": 0.19038589783012866,
"step": 3530
},
{
"epoch": 0.177,
"grad_norm": 31.25,
"grad_norm_var": 3.1770833333333335,
"learning_rate": 0.0001,
"loss": 6.9966,
"loss/crossentropy": 1.9084905117750168,
"loss/hidden": 3.33984375,
"loss/jsd": 0.0,
"loss/logits": 0.1776235220953822,
"step": 3540
},
{
"epoch": 0.1775,
"grad_norm": 30.25,
"grad_norm_var": 2.051497395833333,
"learning_rate": 0.0001,
"loss": 6.9292,
"loss/crossentropy": 1.6809238217771054,
"loss/hidden": 3.355078125,
"loss/jsd": 0.0,
"loss/logits": 0.19617705075070263,
"step": 3550
},
{
"epoch": 0.178,
"grad_norm": 33.75,
"grad_norm_var": 1.9955729166666667,
"learning_rate": 0.0001,
"loss": 6.972,
"loss/crossentropy": 1.6389021024107933,
"loss/hidden": 3.3796875,
"loss/jsd": 0.0,
"loss/logits": 0.18174178060144186,
"step": 3560
},
{
"epoch": 0.1785,
"grad_norm": 36.5,
"grad_norm_var": 7.553059895833333,
"learning_rate": 0.0001,
"loss": 7.0848,
"loss/crossentropy": 1.7566796734929084,
"loss/hidden": 3.465234375,
"loss/jsd": 0.0,
"loss/logits": 0.1923373954370618,
"step": 3570
},
{
"epoch": 0.179,
"grad_norm": 28.125,
"grad_norm_var": 5.9603515625,
"learning_rate": 0.0001,
"loss": 6.956,
"loss/crossentropy": 1.7154954925179482,
"loss/hidden": 3.380859375,
"loss/jsd": 0.0,
"loss/logits": 0.17990761240944267,
"step": 3580
},
{
"epoch": 0.1795,
"grad_norm": 29.875,
"grad_norm_var": 4.399934895833334,
"learning_rate": 0.0001,
"loss": 7.0142,
"loss/crossentropy": 1.8327077120542525,
"loss/hidden": 3.35234375,
"loss/jsd": 0.0,
"loss/logits": 0.1800425429828465,
"step": 3590
},
{
"epoch": 0.18,
"grad_norm": 28.875,
"grad_norm_var": 3.3247395833333333,
"learning_rate": 0.0001,
"loss": 6.9351,
"loss/crossentropy": 1.8267195105552674,
"loss/hidden": 3.394140625,
"loss/jsd": 0.0,
"loss/logits": 0.19433746309950947,
"step": 3600
},
{
"epoch": 0.1805,
"grad_norm": 32.25,
"grad_norm_var": 24.673372395833333,
"learning_rate": 0.0001,
"loss": 6.8892,
"loss/crossentropy": 1.737992748618126,
"loss/hidden": 3.40390625,
"loss/jsd": 0.0,
"loss/logits": 0.20098126940429212,
"step": 3610
},
{
"epoch": 0.181,
"grad_norm": 30.25,
"grad_norm_var": 33.395572916666666,
"learning_rate": 0.0001,
"loss": 7.0103,
"loss/crossentropy": 1.8915371976792812,
"loss/hidden": 3.305078125,
"loss/jsd": 0.0,
"loss/logits": 0.1876732436940074,
"step": 3620
},
{
"epoch": 0.1815,
"grad_norm": 26.5,
"grad_norm_var": 38.799739583333334,
"learning_rate": 0.0001,
"loss": 6.981,
"loss/crossentropy": 1.7780213125050068,
"loss/hidden": 3.421875,
"loss/jsd": 0.0,
"loss/logits": 0.18925584964454173,
"step": 3630
},
{
"epoch": 0.182,
"grad_norm": 32.0,
"grad_norm_var": 1.0995116106143062e+18,
"learning_rate": 0.0001,
"loss": 7.039,
"loss/crossentropy": 1.7201772332191467,
"loss/hidden": 3.292578125,
"loss/jsd": 0.0,
"loss/logits": 0.1817839713767171,
"step": 3640
},
{
"epoch": 0.1825,
"grad_norm": 29.125,
"grad_norm_var": 1.0995116110905345e+18,
"learning_rate": 0.0001,
"loss": 6.7937,
"loss/crossentropy": 1.824095284193754,
"loss/hidden": 3.3203125,
"loss/jsd": 0.0,
"loss/logits": 0.16389566464349628,
"step": 3650
},
{
"epoch": 0.183,
"grad_norm": 28.625,
"grad_norm_var": 14.382291666666667,
"learning_rate": 0.0001,
"loss": 6.918,
"loss/crossentropy": 1.7039800986647606,
"loss/hidden": 3.34921875,
"loss/jsd": 0.0,
"loss/logits": 0.17251853737980127,
"step": 3660
},
{
"epoch": 0.1835,
"grad_norm": 29.375,
"grad_norm_var": 0.82265625,
"learning_rate": 0.0001,
"loss": 6.8614,
"loss/crossentropy": 1.670785766094923,
"loss/hidden": 3.466015625,
"loss/jsd": 0.0,
"loss/logits": 0.1893833376467228,
"step": 3670
},
{
"epoch": 0.184,
"grad_norm": 28.375,
"grad_norm_var": 8.297916666666667,
"learning_rate": 0.0001,
"loss": 6.8747,
"loss/crossentropy": 1.7371518418192864,
"loss/hidden": 3.329296875,
"loss/jsd": 0.0,
"loss/logits": 0.17423492725938558,
"step": 3680
},
{
"epoch": 0.1845,
"grad_norm": 30.5,
"grad_norm_var": 11.51640625,
"learning_rate": 0.0001,
"loss": 7.1482,
"loss/crossentropy": 2.011937528848648,
"loss/hidden": 3.376171875,
"loss/jsd": 0.0,
"loss/logits": 0.19120746664702892,
"step": 3690
},
{
"epoch": 0.185,
"grad_norm": 29.75,
"grad_norm_var": 114.8119140625,
"learning_rate": 0.0001,
"loss": 6.9318,
"loss/crossentropy": 1.9779032841324806,
"loss/hidden": 3.508203125,
"loss/jsd": 0.0,
"loss/logits": 0.19792085662484168,
"step": 3700
},
{
"epoch": 0.1855,
"grad_norm": 29.5,
"grad_norm_var": 3.1666015625,
"learning_rate": 0.0001,
"loss": 6.9801,
"loss/crossentropy": 1.8196966513991355,
"loss/hidden": 3.364453125,
"loss/jsd": 0.0,
"loss/logits": 0.17692473586648702,
"step": 3710
},
{
"epoch": 0.186,
"grad_norm": 31.625,
"grad_norm_var": 7.036874289840129e+17,
"learning_rate": 0.0001,
"loss": 6.9754,
"loss/crossentropy": 1.7481721505522727,
"loss/hidden": 3.37734375,
"loss/jsd": 0.0,
"loss/logits": 0.1970391605515033,
"step": 3720
},
{
"epoch": 0.1865,
"grad_norm": 28.75,
"grad_norm_var": 7.036874289385746e+17,
"learning_rate": 0.0001,
"loss": 6.8514,
"loss/crossentropy": 1.609993650764227,
"loss/hidden": 3.378125,
"loss/jsd": 0.0,
"loss/logits": 0.19023605762049556,
"step": 3730
},
{
"epoch": 0.187,
"grad_norm": 32.75,
"grad_norm_var": 2.0978515625,
"learning_rate": 0.0001,
"loss": 7.0947,
"loss/crossentropy": 2.0539694875478745,
"loss/hidden": 3.399609375,
"loss/jsd": 0.0,
"loss/logits": 0.20270574633032085,
"step": 3740
},
{
"epoch": 0.1875,
"grad_norm": 29.0,
"grad_norm_var": 1.9103515625,
"learning_rate": 0.0001,
"loss": 6.9278,
"loss/crossentropy": 1.8215243116021156,
"loss/hidden": 3.2640625,
"loss/jsd": 0.0,
"loss/logits": 0.16428390927612782,
"step": 3750
},
{
"epoch": 0.188,
"grad_norm": 30.0,
"grad_norm_var": 2.0541666666666667,
"learning_rate": 0.0001,
"loss": 7.0503,
"loss/crossentropy": 1.8183038413524628,
"loss/hidden": 3.3234375,
"loss/jsd": 0.0,
"loss/logits": 0.19697826653718947,
"step": 3760
},
{
"epoch": 0.1885,
"grad_norm": 32.75,
"grad_norm_var": 1.0989583333333333,
"learning_rate": 0.0001,
"loss": 7.1034,
"loss/crossentropy": 1.7321583658456803,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.19197138799354435,
"step": 3770
},
{
"epoch": 0.189,
"grad_norm": 28.25,
"grad_norm_var": 2.314322916666667,
"learning_rate": 0.0001,
"loss": 6.8113,
"loss/crossentropy": 1.8538015499711036,
"loss/hidden": 3.369140625,
"loss/jsd": 0.0,
"loss/logits": 0.18398043606430292,
"step": 3780
},
{
"epoch": 0.1895,
"grad_norm": 29.625,
"grad_norm_var": 5.827018229166667,
"learning_rate": 0.0001,
"loss": 7.0817,
"loss/crossentropy": 1.8768661253154277,
"loss/hidden": 3.369140625,
"loss/jsd": 0.0,
"loss/logits": 0.20710380356758834,
"step": 3790
},
{
"epoch": 0.19,
"grad_norm": 33.25,
"grad_norm_var": 4.195572916666666,
"learning_rate": 0.0001,
"loss": 7.0374,
"loss/crossentropy": 1.7977422267198562,
"loss/hidden": 3.28828125,
"loss/jsd": 0.0,
"loss/logits": 0.1821097361855209,
"step": 3800
},
{
"epoch": 0.1905,
"grad_norm": 36.0,
"grad_norm_var": 5.763997395833333,
"learning_rate": 0.0001,
"loss": 7.0815,
"loss/crossentropy": 1.743187139183283,
"loss/hidden": 3.371875,
"loss/jsd": 0.0,
"loss/logits": 0.18471882613375784,
"step": 3810
},
{
"epoch": 0.191,
"grad_norm": 30.25,
"grad_norm_var": 6.658333333333333,
"learning_rate": 0.0001,
"loss": 6.8304,
"loss/crossentropy": 1.7315315805375575,
"loss/hidden": 3.3015625,
"loss/jsd": 0.0,
"loss/logits": 0.16926794005557894,
"step": 3820
},
{
"epoch": 0.1915,
"grad_norm": 29.875,
"grad_norm_var": 7.5380756628017e+17,
"learning_rate": 0.0001,
"loss": 7.091,
"loss/crossentropy": 1.8176006272435188,
"loss/hidden": 3.350390625,
"loss/jsd": 0.0,
"loss/logits": 0.1842843361198902,
"step": 3830
},
{
"epoch": 0.192,
"grad_norm": 29.375,
"grad_norm_var": 7.563541666666667,
"learning_rate": 0.0001,
"loss": 6.9694,
"loss/crossentropy": 1.777810937166214,
"loss/hidden": 3.32578125,
"loss/jsd": 0.0,
"loss/logits": 0.1739983822219074,
"step": 3840
},
{
"epoch": 0.1925,
"grad_norm": 28.375,
"grad_norm_var": 5.01875,
"learning_rate": 0.0001,
"loss": 6.8715,
"loss/crossentropy": 1.9018649347126484,
"loss/hidden": 3.32421875,
"loss/jsd": 0.0,
"loss/logits": 0.18006115844473242,
"step": 3850
},
{
"epoch": 0.193,
"grad_norm": 30.0,
"grad_norm_var": 1.2455729166666667,
"learning_rate": 0.0001,
"loss": 6.881,
"loss/crossentropy": 1.8844246573746204,
"loss/hidden": 3.3390625,
"loss/jsd": 0.0,
"loss/logits": 0.19470994817093015,
"step": 3860
},
{
"epoch": 0.1935,
"grad_norm": 28.75,
"grad_norm_var": 2.6809895833333335,
"learning_rate": 0.0001,
"loss": 6.9021,
"loss/crossentropy": 1.7199362799525262,
"loss/hidden": 3.36328125,
"loss/jsd": 0.0,
"loss/logits": 0.18913396131247281,
"step": 3870
},
{
"epoch": 0.194,
"grad_norm": 32.25,
"grad_norm_var": 2.5197265625,
"learning_rate": 0.0001,
"loss": 6.9324,
"loss/crossentropy": 1.755439005047083,
"loss/hidden": 3.343359375,
"loss/jsd": 0.0,
"loss/logits": 0.1858789509162307,
"step": 3880
},
{
"epoch": 0.1945,
"grad_norm": 31.125,
"grad_norm_var": 3.384375,
"learning_rate": 0.0001,
"loss": 6.9477,
"loss/crossentropy": 1.7906312070786954,
"loss/hidden": 3.334765625,
"loss/jsd": 0.0,
"loss/logits": 0.19127205722033977,
"step": 3890
},
{
"epoch": 0.195,
"grad_norm": 32.25,
"grad_norm_var": 2.1504557291666666,
"learning_rate": 0.0001,
"loss": 7.1196,
"loss/crossentropy": 1.9957764573395251,
"loss/hidden": 3.36015625,
"loss/jsd": 0.0,
"loss/logits": 0.18469135276973248,
"step": 3900
},
{
"epoch": 0.1955,
"grad_norm": 31.875,
"grad_norm_var": 9.387366238726391e+17,
"learning_rate": 0.0001,
"loss": 6.9857,
"loss/crossentropy": 1.7901725992560387,
"loss/hidden": 3.300390625,
"loss/jsd": 0.0,
"loss/logits": 0.18710751123726369,
"step": 3910
},
{
"epoch": 0.196,
"grad_norm": 30.625,
"grad_norm_var": 27.239322916666666,
"learning_rate": 0.0001,
"loss": 6.9815,
"loss/crossentropy": 1.7652528271079064,
"loss/hidden": 3.234375,
"loss/jsd": 0.0,
"loss/logits": 0.16305868746712804,
"step": 3920
},
{
"epoch": 0.1965,
"grad_norm": 29.75,
"grad_norm_var": 21.822916666666668,
"learning_rate": 0.0001,
"loss": 6.873,
"loss/crossentropy": 1.7368287414312362,
"loss/hidden": 3.433984375,
"loss/jsd": 0.0,
"loss/logits": 0.19844600670039653,
"step": 3930
},
{
"epoch": 0.197,
"grad_norm": 29.625,
"grad_norm_var": 2.035416666666667,
"learning_rate": 0.0001,
"loss": 7.0347,
"loss/crossentropy": 1.9710937917232514,
"loss/hidden": 3.265625,
"loss/jsd": 0.0,
"loss/logits": 0.17006599269807338,
"step": 3940
},
{
"epoch": 0.1975,
"grad_norm": 27.875,
"grad_norm_var": 55.904622395833336,
"learning_rate": 0.0001,
"loss": 6.9427,
"loss/crossentropy": 1.6834511645138264,
"loss/hidden": 3.315234375,
"loss/jsd": 0.0,
"loss/logits": 0.18321871096268297,
"step": 3950
},
{
"epoch": 0.198,
"grad_norm": 28.25,
"grad_norm_var": 4.515559895833333,
"learning_rate": 0.0001,
"loss": 6.8793,
"loss/crossentropy": 1.8481898710131646,
"loss/hidden": 3.219921875,
"loss/jsd": 0.0,
"loss/logits": 0.16018803734332324,
"step": 3960
},
{
"epoch": 0.1985,
"grad_norm": 31.375,
"grad_norm_var": 4.105208333333334,
"learning_rate": 0.0001,
"loss": 6.9158,
"loss/crossentropy": 1.7632210277020932,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.17569016199558973,
"step": 3970
},
{
"epoch": 0.199,
"grad_norm": 31.5,
"grad_norm_var": 3.6056640625,
"learning_rate": 0.0001,
"loss": 7.0644,
"loss/crossentropy": 1.8658879399299622,
"loss/hidden": 3.445703125,
"loss/jsd": 0.0,
"loss/logits": 0.2075220150873065,
"step": 3980
},
{
"epoch": 0.1995,
"grad_norm": 30.0,
"grad_norm_var": 3.3301432291666666,
"learning_rate": 0.0001,
"loss": 7.1057,
"loss/crossentropy": 1.915429985523224,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.19412722568958998,
"step": 3990
},
{
"epoch": 0.2,
"grad_norm": 29.75,
"grad_norm_var": 139.13170572916667,
"learning_rate": 0.0001,
"loss": 6.9355,
"loss/crossentropy": 1.8257215216755867,
"loss/hidden": 3.39375,
"loss/jsd": 0.0,
"loss/logits": 0.18498760322108865,
"step": 4000
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1430040128035226e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}