stable32-5000 / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
a443057 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.125,
"eval_steps": 2000,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00025,
"grad_norm": 32.5,
"learning_rate": 0.0001,
"loss": 7.9852,
"loss/crossentropy": 2.2558943748474123,
"loss/hidden": 3.53671875,
"loss/jsd": 0.0,
"loss/logits": 0.22032691352069378,
"step": 10
},
{
"epoch": 0.0005,
"grad_norm": 39.0,
"grad_norm_var": 6.1306640625,
"learning_rate": 0.0001,
"loss": 8.0827,
"loss/crossentropy": 2.219619666039944,
"loss/hidden": 3.397265625,
"loss/jsd": 0.0,
"loss/logits": 0.20763051956892015,
"step": 20
},
{
"epoch": 0.00075,
"grad_norm": 30.375,
"grad_norm_var": 7.383072916666666,
"learning_rate": 0.0001,
"loss": 7.8479,
"loss/crossentropy": 2.185364603996277,
"loss/hidden": 3.59296875,
"loss/jsd": 0.0,
"loss/logits": 0.23391152992844583,
"step": 30
},
{
"epoch": 0.001,
"grad_norm": 32.75,
"grad_norm_var": 187.0322265625,
"learning_rate": 0.0001,
"loss": 7.8341,
"loss/crossentropy": 2.083733668923378,
"loss/hidden": 3.375390625,
"loss/jsd": 0.0,
"loss/logits": 0.2040597340092063,
"step": 40
},
{
"epoch": 0.00125,
"grad_norm": 32.5,
"grad_norm_var": 5.111393229166667,
"learning_rate": 0.0001,
"loss": 7.6815,
"loss/crossentropy": 2.182037726044655,
"loss/hidden": 3.439453125,
"loss/jsd": 0.0,
"loss/logits": 0.20423058047890663,
"step": 50
},
{
"epoch": 0.0015,
"grad_norm": 33.0,
"grad_norm_var": 1.3853515625,
"learning_rate": 0.0001,
"loss": 7.6919,
"loss/crossentropy": 2.1419573068618774,
"loss/hidden": 3.375,
"loss/jsd": 0.0,
"loss/logits": 0.19463330563157796,
"step": 60
},
{
"epoch": 0.00175,
"grad_norm": 33.25,
"grad_norm_var": 2.228125,
"learning_rate": 0.0001,
"loss": 7.838,
"loss/crossentropy": 2.242894622683525,
"loss/hidden": 3.425390625,
"loss/jsd": 0.0,
"loss/logits": 0.22062199115753173,
"step": 70
},
{
"epoch": 0.002,
"grad_norm": 32.5,
"grad_norm_var": 8.371809895833334,
"learning_rate": 0.0001,
"loss": 8.018,
"loss/crossentropy": 2.0408543169498445,
"loss/hidden": 3.506640625,
"loss/jsd": 0.0,
"loss/logits": 0.20080858804285526,
"step": 80
},
{
"epoch": 0.00225,
"grad_norm": 32.75,
"grad_norm_var": 7.322916666666667,
"learning_rate": 0.0001,
"loss": 7.8807,
"loss/crossentropy": 2.0654082030057905,
"loss/hidden": 3.41640625,
"loss/jsd": 0.0,
"loss/logits": 0.2109438929706812,
"step": 90
},
{
"epoch": 0.0025,
"grad_norm": 34.5,
"grad_norm_var": 2805.3603515625,
"learning_rate": 0.0001,
"loss": 8.0497,
"loss/crossentropy": 2.0930048365145923,
"loss/hidden": 3.410546875,
"loss/jsd": 0.0,
"loss/logits": 0.2103888330049813,
"step": 100
},
{
"epoch": 0.00275,
"grad_norm": 33.0,
"grad_norm_var": 2798.430143229167,
"learning_rate": 0.0001,
"loss": 7.8583,
"loss/crossentropy": 2.3308374524116515,
"loss/hidden": 3.373046875,
"loss/jsd": 0.0,
"loss/logits": 0.2045755073428154,
"step": 110
},
{
"epoch": 0.003,
"grad_norm": 34.0,
"grad_norm_var": 1.6999348958333333,
"learning_rate": 0.0001,
"loss": 7.7088,
"loss/crossentropy": 2.120428466796875,
"loss/hidden": 3.56796875,
"loss/jsd": 0.0,
"loss/logits": 0.21561774536967276,
"step": 120
},
{
"epoch": 0.00325,
"grad_norm": 31.0,
"grad_norm_var": 40.73899739583333,
"learning_rate": 0.0001,
"loss": 7.762,
"loss/crossentropy": 2.1464689180254934,
"loss/hidden": 3.4203125,
"loss/jsd": 0.0,
"loss/logits": 0.2084670951589942,
"step": 130
},
{
"epoch": 0.0035,
"grad_norm": 33.0,
"grad_norm_var": 42.67265625,
"learning_rate": 0.0001,
"loss": 7.711,
"loss/crossentropy": 2.1301105961203577,
"loss/hidden": 3.512890625,
"loss/jsd": 0.0,
"loss/logits": 0.21644905991852284,
"step": 140
},
{
"epoch": 0.00375,
"grad_norm": 32.75,
"grad_norm_var": 8.066080729166666,
"learning_rate": 0.0001,
"loss": 7.7295,
"loss/crossentropy": 2.115240353345871,
"loss/hidden": 3.491796875,
"loss/jsd": 0.0,
"loss/logits": 0.2064087452366948,
"step": 150
},
{
"epoch": 0.004,
"grad_norm": 32.5,
"grad_norm_var": 422.72057291666664,
"learning_rate": 0.0001,
"loss": 7.9367,
"loss/crossentropy": 2.141779786348343,
"loss/hidden": 3.457421875,
"loss/jsd": 0.0,
"loss/logits": 0.21747791785746812,
"step": 160
},
{
"epoch": 0.00425,
"grad_norm": 32.0,
"grad_norm_var": 6.705208333333333,
"learning_rate": 0.0001,
"loss": 7.756,
"loss/crossentropy": 2.103468084335327,
"loss/hidden": 3.449609375,
"loss/jsd": 0.0,
"loss/logits": 0.2027540819719434,
"step": 170
},
{
"epoch": 0.0045,
"grad_norm": 32.25,
"grad_norm_var": 17.8556640625,
"learning_rate": 0.0001,
"loss": 7.7008,
"loss/crossentropy": 2.1950977832078933,
"loss/hidden": 3.453125,
"loss/jsd": 0.0,
"loss/logits": 0.2186649737879634,
"step": 180
},
{
"epoch": 0.00475,
"grad_norm": 30.5,
"grad_norm_var": 66.61608072916667,
"learning_rate": 0.0001,
"loss": 7.6552,
"loss/crossentropy": 2.029752139747143,
"loss/hidden": 3.585546875,
"loss/jsd": 0.0,
"loss/logits": 0.22110425475984813,
"step": 190
},
{
"epoch": 0.005,
"grad_norm": 36.0,
"grad_norm_var": 13.160872395833334,
"learning_rate": 0.0001,
"loss": 7.6964,
"loss/crossentropy": 2.169684535264969,
"loss/hidden": 3.52265625,
"loss/jsd": 0.0,
"loss/logits": 0.22122678495943546,
"step": 200
},
{
"epoch": 0.00525,
"grad_norm": 33.25,
"grad_norm_var": 3.442643229166667,
"learning_rate": 0.0001,
"loss": 7.7135,
"loss/crossentropy": 2.1366169154644012,
"loss/hidden": 3.449609375,
"loss/jsd": 0.0,
"loss/logits": 0.19148335698992014,
"step": 210
},
{
"epoch": 0.0055,
"grad_norm": 35.25,
"grad_norm_var": 4.151822916666666,
"learning_rate": 0.0001,
"loss": 7.654,
"loss/crossentropy": 2.196457767486572,
"loss/hidden": 3.573828125,
"loss/jsd": 0.0,
"loss/logits": 0.22240160517394542,
"step": 220
},
{
"epoch": 0.00575,
"grad_norm": 34.5,
"grad_norm_var": 61.07057291666667,
"learning_rate": 0.0001,
"loss": 7.7286,
"loss/crossentropy": 2.099469523131847,
"loss/hidden": 3.41796875,
"loss/jsd": 0.0,
"loss/logits": 0.19720815271139144,
"step": 230
},
{
"epoch": 0.006,
"grad_norm": 33.75,
"grad_norm_var": 5.479166666666667,
"learning_rate": 0.0001,
"loss": 7.6841,
"loss/crossentropy": 2.0730464071035386,
"loss/hidden": 3.480078125,
"loss/jsd": 0.0,
"loss/logits": 0.20290055498480797,
"step": 240
},
{
"epoch": 0.00625,
"grad_norm": 29.5,
"grad_norm_var": 4.9197265625,
"learning_rate": 0.0001,
"loss": 7.6999,
"loss/crossentropy": 2.1514100462198256,
"loss/hidden": 3.42109375,
"loss/jsd": 0.0,
"loss/logits": 0.2013280361890793,
"step": 250
},
{
"epoch": 0.0065,
"grad_norm": 33.25,
"grad_norm_var": 150.86712239583332,
"learning_rate": 0.0001,
"loss": 7.7617,
"loss/crossentropy": 2.0071112543344496,
"loss/hidden": 3.39921875,
"loss/jsd": 0.0,
"loss/logits": 0.1923872010782361,
"step": 260
},
{
"epoch": 0.00675,
"grad_norm": 32.25,
"grad_norm_var": 41.373958333333334,
"learning_rate": 0.0001,
"loss": 7.7968,
"loss/crossentropy": 2.254330241680145,
"loss/hidden": 3.391015625,
"loss/jsd": 0.0,
"loss/logits": 0.20844143405556678,
"step": 270
},
{
"epoch": 0.007,
"grad_norm": 30.25,
"grad_norm_var": 5.9625,
"learning_rate": 0.0001,
"loss": 7.7437,
"loss/crossentropy": 2.2058747708797455,
"loss/hidden": 3.406640625,
"loss/jsd": 0.0,
"loss/logits": 0.19603454861789943,
"step": 280
},
{
"epoch": 0.00725,
"grad_norm": 32.0,
"grad_norm_var": 3.4306640625,
"learning_rate": 0.0001,
"loss": 7.7308,
"loss/crossentropy": 2.158563455939293,
"loss/hidden": 3.485546875,
"loss/jsd": 0.0,
"loss/logits": 0.20696840062737465,
"step": 290
},
{
"epoch": 0.0075,
"grad_norm": 35.25,
"grad_norm_var": 4.163997395833333,
"learning_rate": 0.0001,
"loss": 7.7781,
"loss/crossentropy": 2.1728759124875068,
"loss/hidden": 3.6984375,
"loss/jsd": 0.0,
"loss/logits": 0.2232737574726343,
"step": 300
},
{
"epoch": 0.00775,
"grad_norm": 32.25,
"grad_norm_var": 9.7212890625,
"learning_rate": 0.0001,
"loss": 7.7039,
"loss/crossentropy": 2.1304744452238085,
"loss/hidden": 3.510546875,
"loss/jsd": 0.0,
"loss/logits": 0.21460633352398872,
"step": 310
},
{
"epoch": 0.008,
"grad_norm": 34.25,
"grad_norm_var": 102.21979166666667,
"learning_rate": 0.0001,
"loss": 7.7386,
"loss/crossentropy": 2.2150494754314423,
"loss/hidden": 3.470703125,
"loss/jsd": 0.0,
"loss/logits": 0.2204894032329321,
"step": 320
},
{
"epoch": 0.00825,
"grad_norm": 34.5,
"grad_norm_var": 103.0806640625,
"learning_rate": 0.0001,
"loss": 7.6843,
"loss/crossentropy": 2.1483607694506643,
"loss/hidden": 3.525,
"loss/jsd": 0.0,
"loss/logits": 0.2168185070157051,
"step": 330
},
{
"epoch": 0.0085,
"grad_norm": 29.75,
"grad_norm_var": 4.245768229166667,
"learning_rate": 0.0001,
"loss": 7.6824,
"loss/crossentropy": 2.2668254554271696,
"loss/hidden": 3.36171875,
"loss/jsd": 0.0,
"loss/logits": 0.1978676740080118,
"step": 340
},
{
"epoch": 0.00875,
"grad_norm": 30.75,
"grad_norm_var": 7.785416666666666,
"learning_rate": 0.0001,
"loss": 7.6476,
"loss/crossentropy": 2.247351437807083,
"loss/hidden": 3.45078125,
"loss/jsd": 0.0,
"loss/logits": 0.2044550308957696,
"step": 350
},
{
"epoch": 0.009,
"grad_norm": 33.0,
"grad_norm_var": 36.1150390625,
"learning_rate": 0.0001,
"loss": 7.6707,
"loss/crossentropy": 2.2508403569459916,
"loss/hidden": 3.351171875,
"loss/jsd": 0.0,
"loss/logits": 0.20586735829710961,
"step": 360
},
{
"epoch": 0.00925,
"grad_norm": 27.875,
"grad_norm_var": 42.02805989583333,
"learning_rate": 0.0001,
"loss": 7.787,
"loss/crossentropy": 2.2524363905191422,
"loss/hidden": 3.39921875,
"loss/jsd": 0.0,
"loss/logits": 0.20639074668288232,
"step": 370
},
{
"epoch": 0.0095,
"grad_norm": 29.625,
"grad_norm_var": 19.65625,
"learning_rate": 0.0001,
"loss": 7.7258,
"loss/crossentropy": 2.2901588469743728,
"loss/hidden": 3.37890625,
"loss/jsd": 0.0,
"loss/logits": 0.20220000632107257,
"step": 380
},
{
"epoch": 0.00975,
"grad_norm": 29.875,
"grad_norm_var": 15.653059895833334,
"learning_rate": 0.0001,
"loss": 7.6445,
"loss/crossentropy": 2.1256834477186204,
"loss/hidden": 3.337109375,
"loss/jsd": 0.0,
"loss/logits": 0.190180828794837,
"step": 390
},
{
"epoch": 0.01,
"grad_norm": 33.5,
"grad_norm_var": 4.7712890625,
"learning_rate": 0.0001,
"loss": 7.7719,
"loss/crossentropy": 2.2237626880407335,
"loss/hidden": 3.484765625,
"loss/jsd": 0.0,
"loss/logits": 0.2264560218900442,
"step": 400
},
{
"epoch": 0.01025,
"grad_norm": 33.0,
"grad_norm_var": 2.9247395833333334,
"learning_rate": 0.0001,
"loss": 7.7631,
"loss/crossentropy": 2.060544753074646,
"loss/hidden": 3.470703125,
"loss/jsd": 0.0,
"loss/logits": 0.2202487275004387,
"step": 410
},
{
"epoch": 0.0105,
"grad_norm": 32.75,
"grad_norm_var": 2.6712890625,
"learning_rate": 0.0001,
"loss": 7.8193,
"loss/crossentropy": 2.270045906305313,
"loss/hidden": 3.49921875,
"loss/jsd": 0.0,
"loss/logits": 0.22449300419539214,
"step": 420
},
{
"epoch": 0.01075,
"grad_norm": 40.0,
"grad_norm_var": 5.551822916666667,
"learning_rate": 0.0001,
"loss": 7.7502,
"loss/crossentropy": 2.220390594005585,
"loss/hidden": 3.421875,
"loss/jsd": 0.0,
"loss/logits": 0.20708895958960055,
"step": 430
},
{
"epoch": 0.011,
"grad_norm": 33.75,
"grad_norm_var": 6.554622395833333,
"learning_rate": 0.0001,
"loss": 7.6386,
"loss/crossentropy": 2.0412577211856844,
"loss/hidden": 3.52109375,
"loss/jsd": 0.0,
"loss/logits": 0.20780573673546315,
"step": 440
},
{
"epoch": 0.01125,
"grad_norm": 39.5,
"grad_norm_var": 9.018684895833333,
"learning_rate": 0.0001,
"loss": 7.7522,
"loss/crossentropy": 2.085190561413765,
"loss/hidden": 3.434375,
"loss/jsd": 0.0,
"loss/logits": 0.22059339918196202,
"step": 450
},
{
"epoch": 0.0115,
"grad_norm": 31.75,
"grad_norm_var": 5.703580729166666,
"learning_rate": 0.0001,
"loss": 7.604,
"loss/crossentropy": 1.9451062515378,
"loss/hidden": 3.527734375,
"loss/jsd": 0.0,
"loss/logits": 0.2082566052675247,
"step": 460
},
{
"epoch": 0.01175,
"grad_norm": 34.25,
"grad_norm_var": 1.4416666666666667,
"learning_rate": 0.0001,
"loss": 7.7286,
"loss/crossentropy": 2.23127267062664,
"loss/hidden": 3.560546875,
"loss/jsd": 0.0,
"loss/logits": 0.22645943984389305,
"step": 470
},
{
"epoch": 0.012,
"grad_norm": 43.5,
"grad_norm_var": 61.358333333333334,
"learning_rate": 0.0001,
"loss": 7.5857,
"loss/crossentropy": 2.0275631666183473,
"loss/hidden": 3.341015625,
"loss/jsd": 0.0,
"loss/logits": 0.19867698503658177,
"step": 480
},
{
"epoch": 0.01225,
"grad_norm": 30.5,
"grad_norm_var": 62.425455729166664,
"learning_rate": 0.0001,
"loss": 7.7023,
"loss/crossentropy": 2.1564531326293945,
"loss/hidden": 3.422265625,
"loss/jsd": 0.0,
"loss/logits": 0.21295207217335702,
"step": 490
},
{
"epoch": 0.0125,
"grad_norm": 39.0,
"grad_norm_var": 7.580208333333333,
"learning_rate": 0.0001,
"loss": 7.8035,
"loss/crossentropy": 2.1716607972979545,
"loss/hidden": 3.489453125,
"loss/jsd": 0.0,
"loss/logits": 0.21046865545213223,
"step": 500
},
{
"epoch": 0.01275,
"grad_norm": 35.25,
"grad_norm_var": 5.121809895833334,
"learning_rate": 0.0001,
"loss": 7.764,
"loss/crossentropy": 2.181091034412384,
"loss/hidden": 3.410546875,
"loss/jsd": 0.0,
"loss/logits": 0.20339491367340087,
"step": 510
},
{
"epoch": 0.013,
"grad_norm": 31.5,
"grad_norm_var": 3.820572916666667,
"learning_rate": 0.0001,
"loss": 7.6112,
"loss/crossentropy": 2.012446442246437,
"loss/hidden": 3.454296875,
"loss/jsd": 0.0,
"loss/logits": 0.20432959645986556,
"step": 520
},
{
"epoch": 0.01325,
"grad_norm": 33.0,
"grad_norm_var": 2.92890625,
"learning_rate": 0.0001,
"loss": 7.73,
"loss/crossentropy": 2.0826023176312445,
"loss/hidden": 3.51171875,
"loss/jsd": 0.0,
"loss/logits": 0.21190985422581435,
"step": 530
},
{
"epoch": 0.0135,
"grad_norm": 31.375,
"grad_norm_var": 2.0229166666666667,
"learning_rate": 0.0001,
"loss": 7.7527,
"loss/crossentropy": 2.191486455500126,
"loss/hidden": 3.378125,
"loss/jsd": 0.0,
"loss/logits": 0.1943425141274929,
"step": 540
},
{
"epoch": 0.01375,
"grad_norm": 29.75,
"grad_norm_var": 4.983268229166667,
"learning_rate": 0.0001,
"loss": 7.7035,
"loss/crossentropy": 2.0664332896471023,
"loss/hidden": 3.475,
"loss/jsd": 0.0,
"loss/logits": 0.20404404532164336,
"step": 550
},
{
"epoch": 0.014,
"grad_norm": 31.875,
"grad_norm_var": 3.381705729166667,
"learning_rate": 0.0001,
"loss": 7.5679,
"loss/crossentropy": 2.0470397621393204,
"loss/hidden": 3.430859375,
"loss/jsd": 0.0,
"loss/logits": 0.21556914187967777,
"step": 560
},
{
"epoch": 0.01425,
"grad_norm": 33.0,
"grad_norm_var": 2.508072916666667,
"learning_rate": 0.0001,
"loss": 7.7909,
"loss/crossentropy": 2.21784293949604,
"loss/hidden": 3.4671875,
"loss/jsd": 0.0,
"loss/logits": 0.20632803943008185,
"step": 570
},
{
"epoch": 0.0145,
"grad_norm": 32.25,
"grad_norm_var": 2.6837890625,
"learning_rate": 0.0001,
"loss": 7.6449,
"loss/crossentropy": 2.291385439038277,
"loss/hidden": 3.432421875,
"loss/jsd": 0.0,
"loss/logits": 0.19818231668323277,
"step": 580
},
{
"epoch": 0.01475,
"grad_norm": 30.875,
"grad_norm_var": 106.909375,
"learning_rate": 0.0001,
"loss": 7.7009,
"loss/crossentropy": 2.102944087982178,
"loss/hidden": 3.532421875,
"loss/jsd": 0.0,
"loss/logits": 0.20003035496920346,
"step": 590
},
{
"epoch": 0.015,
"grad_norm": 33.0,
"grad_norm_var": 11.816080729166666,
"learning_rate": 0.0001,
"loss": 7.5907,
"loss/crossentropy": 2.239013722538948,
"loss/hidden": 3.471484375,
"loss/jsd": 0.0,
"loss/logits": 0.22811597101390363,
"step": 600
},
{
"epoch": 0.01525,
"grad_norm": 34.75,
"grad_norm_var": 3.2811848958333334,
"learning_rate": 0.0001,
"loss": 7.6578,
"loss/crossentropy": 2.1826944231986998,
"loss/hidden": 3.36796875,
"loss/jsd": 0.0,
"loss/logits": 0.19526719450950622,
"step": 610
},
{
"epoch": 0.0155,
"grad_norm": 32.0,
"grad_norm_var": 29.54140625,
"learning_rate": 0.0001,
"loss": 7.6592,
"loss/crossentropy": 2.0491475805640222,
"loss/hidden": 3.47109375,
"loss/jsd": 0.0,
"loss/logits": 0.19470291025936604,
"step": 620
},
{
"epoch": 0.01575,
"grad_norm": 29.625,
"grad_norm_var": 3.458268229166667,
"learning_rate": 0.0001,
"loss": 7.7237,
"loss/crossentropy": 2.171899539232254,
"loss/hidden": 3.417578125,
"loss/jsd": 0.0,
"loss/logits": 0.19583625346422195,
"step": 630
},
{
"epoch": 0.016,
"grad_norm": 41.5,
"grad_norm_var": 835.4395182291667,
"learning_rate": 0.0001,
"loss": 7.6493,
"loss/crossentropy": 2.019927790760994,
"loss/hidden": 3.40546875,
"loss/jsd": 0.0,
"loss/logits": 0.1889604590833187,
"step": 640
},
{
"epoch": 0.01625,
"grad_norm": 33.25,
"grad_norm_var": 816.1582682291667,
"learning_rate": 0.0001,
"loss": 7.7628,
"loss/crossentropy": 2.146556834876537,
"loss/hidden": 3.53046875,
"loss/jsd": 0.0,
"loss/logits": 0.21917179077863694,
"step": 650
},
{
"epoch": 0.0165,
"grad_norm": 34.0,
"grad_norm_var": 13.574739583333333,
"learning_rate": 0.0001,
"loss": 7.6199,
"loss/crossentropy": 2.2131199680268763,
"loss/hidden": 3.393359375,
"loss/jsd": 0.0,
"loss/logits": 0.20602547163143753,
"step": 660
},
{
"epoch": 0.01675,
"grad_norm": 34.75,
"grad_norm_var": 28.6125,
"learning_rate": 0.0001,
"loss": 7.6113,
"loss/crossentropy": 2.0343705236911775,
"loss/hidden": 3.37734375,
"loss/jsd": 0.0,
"loss/logits": 0.18553176671266555,
"step": 670
},
{
"epoch": 0.017,
"grad_norm": 31.625,
"grad_norm_var": 19.8369140625,
"learning_rate": 0.0001,
"loss": 7.6443,
"loss/crossentropy": 2.1528817296028135,
"loss/hidden": 3.456640625,
"loss/jsd": 0.0,
"loss/logits": 0.20593271851539613,
"step": 680
},
{
"epoch": 0.01725,
"grad_norm": 36.25,
"grad_norm_var": 5.97265625,
"learning_rate": 0.0001,
"loss": 7.7476,
"loss/crossentropy": 2.2202903479337692,
"loss/hidden": 3.419921875,
"loss/jsd": 0.0,
"loss/logits": 0.2035485502332449,
"step": 690
},
{
"epoch": 0.0175,
"grad_norm": 31.5,
"grad_norm_var": 7.5431640625,
"learning_rate": 0.0001,
"loss": 7.621,
"loss/crossentropy": 2.0744683638215067,
"loss/hidden": 3.359765625,
"loss/jsd": 0.0,
"loss/logits": 0.2023961789906025,
"step": 700
},
{
"epoch": 0.01775,
"grad_norm": 29.875,
"grad_norm_var": 26.912239583333335,
"learning_rate": 0.0001,
"loss": 7.7058,
"loss/crossentropy": 2.1098077327013014,
"loss/hidden": 3.48359375,
"loss/jsd": 0.0,
"loss/logits": 0.20867060720920563,
"step": 710
},
{
"epoch": 0.018,
"grad_norm": 34.0,
"grad_norm_var": 5.291080729166667,
"learning_rate": 0.0001,
"loss": 7.5224,
"loss/crossentropy": 2.0663078971207143,
"loss/hidden": 3.439453125,
"loss/jsd": 0.0,
"loss/logits": 0.1999094202183187,
"step": 720
},
{
"epoch": 0.01825,
"grad_norm": 36.75,
"grad_norm_var": 34.112239583333334,
"learning_rate": 0.0001,
"loss": 7.5679,
"loss/crossentropy": 2.264209559559822,
"loss/hidden": 3.438671875,
"loss/jsd": 0.0,
"loss/logits": 0.213060562312603,
"step": 730
},
{
"epoch": 0.0185,
"grad_norm": 31.875,
"grad_norm_var": 31.756705729166665,
"learning_rate": 0.0001,
"loss": 7.6714,
"loss/crossentropy": 2.128282290697098,
"loss/hidden": 3.494921875,
"loss/jsd": 0.0,
"loss/logits": 0.21533375550061465,
"step": 740
},
{
"epoch": 0.01875,
"grad_norm": 29.125,
"grad_norm_var": 4.314518229166667,
"learning_rate": 0.0001,
"loss": 7.6825,
"loss/crossentropy": 2.0806978911161425,
"loss/hidden": 3.446875,
"loss/jsd": 0.0,
"loss/logits": 0.19196727648377418,
"step": 750
},
{
"epoch": 0.019,
"grad_norm": 31.625,
"grad_norm_var": 9.520833333333334,
"learning_rate": 0.0001,
"loss": 7.6058,
"loss/crossentropy": 2.2315777271986006,
"loss/hidden": 3.484765625,
"loss/jsd": 0.0,
"loss/logits": 0.21225934717804193,
"step": 760
},
{
"epoch": 0.01925,
"grad_norm": 33.25,
"grad_norm_var": 29.381705729166665,
"learning_rate": 0.0001,
"loss": 7.8377,
"loss/crossentropy": 2.200978134572506,
"loss/hidden": 3.52890625,
"loss/jsd": 0.0,
"loss/logits": 0.22721426151692867,
"step": 770
},
{
"epoch": 0.0195,
"grad_norm": 32.75,
"grad_norm_var": 11.095247395833333,
"learning_rate": 0.0001,
"loss": 7.7305,
"loss/crossentropy": 2.2799030035734176,
"loss/hidden": 3.512890625,
"loss/jsd": 0.0,
"loss/logits": 0.2418980894610286,
"step": 780
},
{
"epoch": 0.01975,
"grad_norm": 46.5,
"grad_norm_var": 22.12265625,
"learning_rate": 0.0001,
"loss": 7.663,
"loss/crossentropy": 2.0916543275117876,
"loss/hidden": 3.4765625,
"loss/jsd": 0.0,
"loss/logits": 0.20638740565627814,
"step": 790
},
{
"epoch": 0.02,
"grad_norm": 53.75,
"grad_norm_var": 2.3053503983968586e+18,
"learning_rate": 0.0001,
"loss": 7.6743,
"loss/crossentropy": 2.2010276943445204,
"loss/hidden": 3.33671875,
"loss/jsd": 0.0,
"loss/logits": 0.2011238183826208,
"step": 800
},
{
"epoch": 0.02025,
"grad_norm": 34.75,
"grad_norm_var": 2.3053503962269005e+18,
"learning_rate": 0.0001,
"loss": 7.7365,
"loss/crossentropy": 2.2651585280895232,
"loss/hidden": 3.425390625,
"loss/jsd": 0.0,
"loss/logits": 0.22051467839628458,
"step": 810
},
{
"epoch": 0.0205,
"grad_norm": 32.25,
"grad_norm_var": 69.03854166666666,
"learning_rate": 0.0001,
"loss": 7.5788,
"loss/crossentropy": 2.1270318403840065,
"loss/hidden": 3.47109375,
"loss/jsd": 0.0,
"loss/logits": 0.20940354652702808,
"step": 820
},
{
"epoch": 0.02075,
"grad_norm": 33.0,
"grad_norm_var": 8.0603515625,
"learning_rate": 0.0001,
"loss": 7.6237,
"loss/crossentropy": 2.1726820170879364,
"loss/hidden": 3.428125,
"loss/jsd": 0.0,
"loss/logits": 0.20644128862768413,
"step": 830
},
{
"epoch": 0.021,
"grad_norm": 33.5,
"grad_norm_var": 9.085416666666667,
"learning_rate": 0.0001,
"loss": 7.455,
"loss/crossentropy": 2.166350546479225,
"loss/hidden": 3.412109375,
"loss/jsd": 0.0,
"loss/logits": 0.20701032150536774,
"step": 840
},
{
"epoch": 0.02125,
"grad_norm": 37.75,
"grad_norm_var": 6.362239583333333,
"learning_rate": 0.0001,
"loss": 7.5725,
"loss/crossentropy": 2.095318245887756,
"loss/hidden": 3.429296875,
"loss/jsd": 0.0,
"loss/logits": 0.20427223704755307,
"step": 850
},
{
"epoch": 0.0215,
"grad_norm": 29.625,
"grad_norm_var": 15.934830729166666,
"learning_rate": 0.0001,
"loss": 7.5728,
"loss/crossentropy": 2.1999073296785356,
"loss/hidden": 3.400390625,
"loss/jsd": 0.0,
"loss/logits": 0.2012161746621132,
"step": 860
},
{
"epoch": 0.02175,
"grad_norm": 30.5,
"grad_norm_var": 5.681184895833334,
"learning_rate": 0.0001,
"loss": 7.6168,
"loss/crossentropy": 2.2957967817783356,
"loss/hidden": 3.395703125,
"loss/jsd": 0.0,
"loss/logits": 0.20650502648204566,
"step": 870
},
{
"epoch": 0.022,
"grad_norm": 30.0,
"grad_norm_var": 10.9181640625,
"learning_rate": 0.0001,
"loss": 7.6929,
"loss/crossentropy": 2.2093090921640397,
"loss/hidden": 3.350390625,
"loss/jsd": 0.0,
"loss/logits": 0.18699637930840254,
"step": 880
},
{
"epoch": 0.02225,
"grad_norm": 35.0,
"grad_norm_var": 11.192122395833334,
"learning_rate": 0.0001,
"loss": 7.5316,
"loss/crossentropy": 2.0586251467466354,
"loss/hidden": 3.428125,
"loss/jsd": 0.0,
"loss/logits": 0.21464286223053933,
"step": 890
},
{
"epoch": 0.0225,
"grad_norm": 48.0,
"grad_norm_var": 22.92265625,
"learning_rate": 0.0001,
"loss": 7.5857,
"loss/crossentropy": 2.1724458605051042,
"loss/hidden": 3.3828125,
"loss/jsd": 0.0,
"loss/logits": 0.19797445200383662,
"step": 900
},
{
"epoch": 0.02275,
"grad_norm": 33.0,
"grad_norm_var": 38.4837890625,
"learning_rate": 0.0001,
"loss": 7.6184,
"loss/crossentropy": 2.15934486836195,
"loss/hidden": 3.357421875,
"loss/jsd": 0.0,
"loss/logits": 0.19419073052704333,
"step": 910
},
{
"epoch": 0.023,
"grad_norm": 31.0,
"grad_norm_var": 29.8869140625,
"learning_rate": 0.0001,
"loss": 7.6209,
"loss/crossentropy": 2.1601561695337295,
"loss/hidden": 3.384375,
"loss/jsd": 0.0,
"loss/logits": 0.20290262177586554,
"step": 920
},
{
"epoch": 0.02325,
"grad_norm": 33.0,
"grad_norm_var": 2.5143229166666665,
"learning_rate": 0.0001,
"loss": 7.5969,
"loss/crossentropy": 2.184823766350746,
"loss/hidden": 3.392578125,
"loss/jsd": 0.0,
"loss/logits": 0.2016730338335037,
"step": 930
},
{
"epoch": 0.0235,
"grad_norm": 30.125,
"grad_norm_var": 1.5764973958333333,
"learning_rate": 0.0001,
"loss": 7.5772,
"loss/crossentropy": 2.2380657255649568,
"loss/hidden": 3.319140625,
"loss/jsd": 0.0,
"loss/logits": 0.19638443663716315,
"step": 940
},
{
"epoch": 0.02375,
"grad_norm": 28.875,
"grad_norm_var": 5.199934895833334,
"learning_rate": 0.0001,
"loss": 7.6348,
"loss/crossentropy": 2.183762513846159,
"loss/hidden": 3.4171875,
"loss/jsd": 0.0,
"loss/logits": 0.19700763542205096,
"step": 950
},
{
"epoch": 0.024,
"grad_norm": 33.0,
"grad_norm_var": 20.1666015625,
"learning_rate": 0.0001,
"loss": 7.6351,
"loss/crossentropy": 2.1142914414405825,
"loss/hidden": 3.46953125,
"loss/jsd": 0.0,
"loss/logits": 0.2082229983061552,
"step": 960
},
{
"epoch": 0.02425,
"grad_norm": 31.5,
"grad_norm_var": 18.738541666666666,
"learning_rate": 0.0001,
"loss": 7.6302,
"loss/crossentropy": 2.315229868888855,
"loss/hidden": 3.32734375,
"loss/jsd": 0.0,
"loss/logits": 0.20684304945170878,
"step": 970
},
{
"epoch": 0.0245,
"grad_norm": 33.75,
"grad_norm_var": 2.501822916666667,
"learning_rate": 0.0001,
"loss": 7.6816,
"loss/crossentropy": 2.094369947910309,
"loss/hidden": 3.59921875,
"loss/jsd": 0.0,
"loss/logits": 0.2428071454167366,
"step": 980
},
{
"epoch": 0.02475,
"grad_norm": 29.25,
"grad_norm_var": 4.06015625,
"learning_rate": 0.0001,
"loss": 7.6289,
"loss/crossentropy": 2.183109185099602,
"loss/hidden": 3.5,
"loss/jsd": 0.0,
"loss/logits": 0.21173047311604024,
"step": 990
},
{
"epoch": 0.025,
"grad_norm": 33.5,
"grad_norm_var": 2.9593098958333335,
"learning_rate": 0.0001,
"loss": 7.6417,
"loss/crossentropy": 2.1311034083366396,
"loss/hidden": 3.44765625,
"loss/jsd": 0.0,
"loss/logits": 0.1998819222673774,
"step": 1000
},
{
"epoch": 0.02525,
"grad_norm": 31.0,
"grad_norm_var": 4.510872395833333,
"learning_rate": 0.0001,
"loss": 7.6299,
"loss/crossentropy": 2.1297010451555254,
"loss/hidden": 3.475390625,
"loss/jsd": 0.0,
"loss/logits": 0.20842864252626897,
"step": 1010
},
{
"epoch": 0.0255,
"grad_norm": 30.375,
"grad_norm_var": 3.9893229166666666,
"learning_rate": 0.0001,
"loss": 7.6593,
"loss/crossentropy": 2.2224902719259263,
"loss/hidden": 3.509375,
"loss/jsd": 0.0,
"loss/logits": 0.21827217563986778,
"step": 1020
},
{
"epoch": 0.02575,
"grad_norm": 38.25,
"grad_norm_var": 6.010416666666667,
"learning_rate": 0.0001,
"loss": 7.6712,
"loss/crossentropy": 2.1976612359285355,
"loss/hidden": 3.506640625,
"loss/jsd": 0.0,
"loss/logits": 0.210753770545125,
"step": 1030
},
{
"epoch": 0.026,
"grad_norm": 33.5,
"grad_norm_var": 9.435872395833334,
"learning_rate": 0.0001,
"loss": 7.748,
"loss/crossentropy": 2.1889317661523817,
"loss/hidden": 3.398828125,
"loss/jsd": 0.0,
"loss/logits": 0.2081079863011837,
"step": 1040
},
{
"epoch": 0.02625,
"grad_norm": 31.0,
"grad_norm_var": 6.206184895833333,
"learning_rate": 0.0001,
"loss": 7.5803,
"loss/crossentropy": 2.0802227064967154,
"loss/hidden": 3.46796875,
"loss/jsd": 0.0,
"loss/logits": 0.19740718584507705,
"step": 1050
},
{
"epoch": 0.0265,
"grad_norm": 31.625,
"grad_norm_var": 1.3770182291666666,
"learning_rate": 0.0001,
"loss": 7.6098,
"loss/crossentropy": 1.969551184773445,
"loss/hidden": 3.4546875,
"loss/jsd": 0.0,
"loss/logits": 0.1954148853197694,
"step": 1060
},
{
"epoch": 0.02675,
"grad_norm": 30.75,
"grad_norm_var": 2.09375,
"learning_rate": 0.0001,
"loss": 7.7826,
"loss/crossentropy": 2.160974936187267,
"loss/hidden": 3.581640625,
"loss/jsd": 0.0,
"loss/logits": 0.2183740811422467,
"step": 1070
},
{
"epoch": 0.027,
"grad_norm": 30.625,
"grad_norm_var": 2.6520182291666665,
"learning_rate": 0.0001,
"loss": 7.6186,
"loss/crossentropy": 2.179084411263466,
"loss/hidden": 3.4109375,
"loss/jsd": 0.0,
"loss/logits": 0.19020346291363238,
"step": 1080
},
{
"epoch": 0.02725,
"grad_norm": 56.5,
"grad_norm_var": 48.91868489583333,
"learning_rate": 0.0001,
"loss": 7.7756,
"loss/crossentropy": 2.1456103891134264,
"loss/hidden": 3.5,
"loss/jsd": 0.0,
"loss/logits": 0.2178689869120717,
"step": 1090
},
{
"epoch": 0.0275,
"grad_norm": 32.0,
"grad_norm_var": 42.19557291666667,
"learning_rate": 0.0001,
"loss": 7.6714,
"loss/crossentropy": 2.2156156271696092,
"loss/hidden": 3.43515625,
"loss/jsd": 0.0,
"loss/logits": 0.2013509316369891,
"step": 1100
},
{
"epoch": 0.02775,
"grad_norm": 31.625,
"grad_norm_var": 29.3775390625,
"learning_rate": 0.0001,
"loss": 7.6833,
"loss/crossentropy": 2.0683740943670275,
"loss/hidden": 3.5046875,
"loss/jsd": 0.0,
"loss/logits": 0.21320818569511174,
"step": 1110
},
{
"epoch": 0.028,
"grad_norm": 27.5,
"grad_norm_var": 35.483333333333334,
"learning_rate": 0.0001,
"loss": 7.7302,
"loss/crossentropy": 2.098052313923836,
"loss/hidden": 3.46484375,
"loss/jsd": 0.0,
"loss/logits": 0.19890500828623772,
"step": 1120
},
{
"epoch": 0.02825,
"grad_norm": 29.0,
"grad_norm_var": 10.8775390625,
"learning_rate": 0.0001,
"loss": 7.6716,
"loss/crossentropy": 2.0999813921749593,
"loss/hidden": 3.373046875,
"loss/jsd": 0.0,
"loss/logits": 0.18992104195058346,
"step": 1130
},
{
"epoch": 0.0285,
"grad_norm": 30.875,
"grad_norm_var": 3.67890625,
"learning_rate": 0.0001,
"loss": 7.5401,
"loss/crossentropy": 2.07411085665226,
"loss/hidden": 3.433984375,
"loss/jsd": 0.0,
"loss/logits": 0.2018830729648471,
"step": 1140
},
{
"epoch": 0.02875,
"grad_norm": 31.125,
"grad_norm_var": 18.053580729166665,
"learning_rate": 0.0001,
"loss": 7.645,
"loss/crossentropy": 2.0945233553647995,
"loss/hidden": 3.4859375,
"loss/jsd": 0.0,
"loss/logits": 0.21366582233458759,
"step": 1150
},
{
"epoch": 0.029,
"grad_norm": 31.25,
"grad_norm_var": 16.978125,
"learning_rate": 0.0001,
"loss": 7.6514,
"loss/crossentropy": 2.0980678737163543,
"loss/hidden": 3.434375,
"loss/jsd": 0.0,
"loss/logits": 0.19850811325013637,
"step": 1160
},
{
"epoch": 0.02925,
"grad_norm": 32.5,
"grad_norm_var": 30.2556640625,
"learning_rate": 0.0001,
"loss": 7.6021,
"loss/crossentropy": 2.155895306169987,
"loss/hidden": 3.378125,
"loss/jsd": 0.0,
"loss/logits": 0.19983574748039246,
"step": 1170
},
{
"epoch": 0.0295,
"grad_norm": 30.5,
"grad_norm_var": 5.733072916666667,
"learning_rate": 0.0001,
"loss": 7.6343,
"loss/crossentropy": 2.1906268298625946,
"loss/hidden": 3.348046875,
"loss/jsd": 0.0,
"loss/logits": 0.19584416709840297,
"step": 1180
},
{
"epoch": 0.02975,
"grad_norm": 32.5,
"grad_norm_var": 3.89765625,
"learning_rate": 0.0001,
"loss": 7.7077,
"loss/crossentropy": 2.163237012922764,
"loss/hidden": 3.562109375,
"loss/jsd": 0.0,
"loss/logits": 0.21741114580072463,
"step": 1190
},
{
"epoch": 0.03,
"grad_norm": 27.875,
"grad_norm_var": 3.6639973958333334,
"learning_rate": 0.0001,
"loss": 7.6695,
"loss/crossentropy": 2.1346954315900804,
"loss/hidden": 3.394921875,
"loss/jsd": 0.0,
"loss/logits": 0.19178961254656315,
"step": 1200
},
{
"epoch": 0.03025,
"grad_norm": 30.5,
"grad_norm_var": 24.9125,
"learning_rate": 0.0001,
"loss": 7.6108,
"loss/crossentropy": 2.2493597716093063,
"loss/hidden": 3.433203125,
"loss/jsd": 0.0,
"loss/logits": 0.209975734166801,
"step": 1210
},
{
"epoch": 0.0305,
"grad_norm": 30.75,
"grad_norm_var": 5.566080729166667,
"learning_rate": 0.0001,
"loss": 7.5902,
"loss/crossentropy": 2.046968361735344,
"loss/hidden": 3.451171875,
"loss/jsd": 0.0,
"loss/logits": 0.1867401722818613,
"step": 1220
},
{
"epoch": 0.03075,
"grad_norm": 29.75,
"grad_norm_var": 5.396875,
"learning_rate": 0.0001,
"loss": 7.6659,
"loss/crossentropy": 2.0429708033800127,
"loss/hidden": 3.3984375,
"loss/jsd": 0.0,
"loss/logits": 0.18792454693466426,
"step": 1230
},
{
"epoch": 0.031,
"grad_norm": 30.375,
"grad_norm_var": 2.8827473958333334,
"learning_rate": 0.0001,
"loss": 7.5425,
"loss/crossentropy": 2.124686148762703,
"loss/hidden": 3.45625,
"loss/jsd": 0.0,
"loss/logits": 0.23028194047510625,
"step": 1240
},
{
"epoch": 0.03125,
"grad_norm": 33.0,
"grad_norm_var": 2.220833333333333,
"learning_rate": 0.0001,
"loss": 7.7766,
"loss/crossentropy": 2.1431034594774245,
"loss/hidden": 3.30390625,
"loss/jsd": 0.0,
"loss/logits": 0.1852023523300886,
"step": 1250
},
{
"epoch": 0.0315,
"grad_norm": 34.0,
"grad_norm_var": 3.3080729166666667,
"learning_rate": 0.0001,
"loss": 7.6547,
"loss/crossentropy": 2.208024913072586,
"loss/hidden": 3.32265625,
"loss/jsd": 0.0,
"loss/logits": 0.19009452145546674,
"step": 1260
},
{
"epoch": 0.03175,
"grad_norm": 31.375,
"grad_norm_var": 3.7249348958333335,
"learning_rate": 0.0001,
"loss": 7.6331,
"loss/crossentropy": 2.0774734795093535,
"loss/hidden": 3.378515625,
"loss/jsd": 0.0,
"loss/logits": 0.19277823474258185,
"step": 1270
},
{
"epoch": 0.032,
"grad_norm": 32.75,
"grad_norm_var": 2.6393229166666665,
"learning_rate": 0.0001,
"loss": 7.6147,
"loss/crossentropy": 2.244540962576866,
"loss/hidden": 3.444921875,
"loss/jsd": 0.0,
"loss/logits": 0.20150573179125786,
"step": 1280
},
{
"epoch": 0.03225,
"grad_norm": 31.75,
"grad_norm_var": 3.2280598958333333,
"learning_rate": 0.0001,
"loss": 7.6724,
"loss/crossentropy": 2.1218355029821394,
"loss/hidden": 3.41640625,
"loss/jsd": 0.0,
"loss/logits": 0.19727950319647788,
"step": 1290
},
{
"epoch": 0.0325,
"grad_norm": 33.25,
"grad_norm_var": 3.6113932291666666,
"learning_rate": 0.0001,
"loss": 7.5881,
"loss/crossentropy": 2.048227934539318,
"loss/hidden": 3.352734375,
"loss/jsd": 0.0,
"loss/logits": 0.19401397118344904,
"step": 1300
},
{
"epoch": 0.03275,
"grad_norm": 35.5,
"grad_norm_var": 2.8384765625,
"learning_rate": 0.0001,
"loss": 7.6481,
"loss/crossentropy": 2.0217724472284315,
"loss/hidden": 3.398828125,
"loss/jsd": 0.0,
"loss/logits": 0.19338970091193913,
"step": 1310
},
{
"epoch": 0.033,
"grad_norm": 32.75,
"grad_norm_var": 2.3671223958333334,
"learning_rate": 0.0001,
"loss": 7.5734,
"loss/crossentropy": 2.123840129375458,
"loss/hidden": 3.377734375,
"loss/jsd": 0.0,
"loss/logits": 0.19704403057694436,
"step": 1320
},
{
"epoch": 0.03325,
"grad_norm": 31.375,
"grad_norm_var": 1.8207682291666667,
"learning_rate": 0.0001,
"loss": 7.5379,
"loss/crossentropy": 2.1691371381282805,
"loss/hidden": 3.416796875,
"loss/jsd": 0.0,
"loss/logits": 0.20379403475672006,
"step": 1330
},
{
"epoch": 0.0335,
"grad_norm": 30.5,
"grad_norm_var": 2.162239583333333,
"learning_rate": 0.0001,
"loss": 7.5824,
"loss/crossentropy": 2.0320975854992867,
"loss/hidden": 3.490625,
"loss/jsd": 0.0,
"loss/logits": 0.204788769595325,
"step": 1340
},
{
"epoch": 0.03375,
"grad_norm": 29.875,
"grad_norm_var": 29.69140625,
"learning_rate": 0.0001,
"loss": 7.6835,
"loss/crossentropy": 2.1799038141965865,
"loss/hidden": 3.39765625,
"loss/jsd": 0.0,
"loss/logits": 0.21532316971570253,
"step": 1350
},
{
"epoch": 0.034,
"grad_norm": 30.625,
"grad_norm_var": 2.4837890625,
"learning_rate": 0.0001,
"loss": 7.6461,
"loss/crossentropy": 2.075017270445824,
"loss/hidden": 3.483984375,
"loss/jsd": 0.0,
"loss/logits": 0.21185822309926153,
"step": 1360
},
{
"epoch": 0.03425,
"grad_norm": 29.375,
"grad_norm_var": 2.6806640625,
"learning_rate": 0.0001,
"loss": 7.6084,
"loss/crossentropy": 2.2061389327049254,
"loss/hidden": 3.3671875,
"loss/jsd": 0.0,
"loss/logits": 0.19864549599587916,
"step": 1370
},
{
"epoch": 0.0345,
"grad_norm": 46.25,
"grad_norm_var": 16.428125,
"learning_rate": 0.0001,
"loss": 7.5773,
"loss/crossentropy": 2.1411263316869737,
"loss/hidden": 3.40703125,
"loss/jsd": 0.0,
"loss/logits": 0.18519967906177043,
"step": 1380
},
{
"epoch": 0.03475,
"grad_norm": 33.5,
"grad_norm_var": 312.2525390625,
"learning_rate": 0.0001,
"loss": 7.6651,
"loss/crossentropy": 2.1760765284299852,
"loss/hidden": 3.457421875,
"loss/jsd": 0.0,
"loss/logits": 0.19441114887595176,
"step": 1390
},
{
"epoch": 0.035,
"grad_norm": 30.375,
"grad_norm_var": 7.5697265625,
"learning_rate": 0.0001,
"loss": 7.6384,
"loss/crossentropy": 2.148920811712742,
"loss/hidden": 3.503515625,
"loss/jsd": 0.0,
"loss/logits": 0.20845398511737584,
"step": 1400
},
{
"epoch": 0.03525,
"grad_norm": 31.5,
"grad_norm_var": 3.2301432291666665,
"learning_rate": 0.0001,
"loss": 7.6503,
"loss/crossentropy": 2.2241507709026336,
"loss/hidden": 3.478515625,
"loss/jsd": 0.0,
"loss/logits": 0.2013774536550045,
"step": 1410
},
{
"epoch": 0.0355,
"grad_norm": 32.75,
"grad_norm_var": 3.67265625,
"learning_rate": 0.0001,
"loss": 7.6107,
"loss/crossentropy": 2.151585003733635,
"loss/hidden": 3.422265625,
"loss/jsd": 0.0,
"loss/logits": 0.19790932536125183,
"step": 1420
},
{
"epoch": 0.03575,
"grad_norm": 31.0,
"grad_norm_var": 2.192643229166667,
"learning_rate": 0.0001,
"loss": 7.6089,
"loss/crossentropy": 2.106749549508095,
"loss/hidden": 3.555078125,
"loss/jsd": 0.0,
"loss/logits": 0.20702882390469313,
"step": 1430
},
{
"epoch": 0.036,
"grad_norm": 34.0,
"grad_norm_var": 2.8622395833333334,
"learning_rate": 0.0001,
"loss": 7.6081,
"loss/crossentropy": 2.0727885022759436,
"loss/hidden": 3.425,
"loss/jsd": 0.0,
"loss/logits": 0.21057205237448215,
"step": 1440
},
{
"epoch": 0.03625,
"grad_norm": 31.25,
"grad_norm_var": 3.32890625,
"learning_rate": 0.0001,
"loss": 7.7005,
"loss/crossentropy": 2.3014074742794035,
"loss/hidden": 3.35859375,
"loss/jsd": 0.0,
"loss/logits": 0.20723759960383176,
"step": 1450
},
{
"epoch": 0.0365,
"grad_norm": 28.625,
"grad_norm_var": 4.240625,
"learning_rate": 0.0001,
"loss": 7.5677,
"loss/crossentropy": 2.12650800794363,
"loss/hidden": 3.44609375,
"loss/jsd": 0.0,
"loss/logits": 0.20236929692327976,
"step": 1460
},
{
"epoch": 0.03675,
"grad_norm": 31.0,
"grad_norm_var": 8.195768229166667,
"learning_rate": 0.0001,
"loss": 7.7299,
"loss/crossentropy": 2.187662351131439,
"loss/hidden": 3.421875,
"loss/jsd": 0.0,
"loss/logits": 0.20965678989887238,
"step": 1470
},
{
"epoch": 0.037,
"grad_norm": 33.0,
"grad_norm_var": 6.1666015625,
"learning_rate": 0.0001,
"loss": 7.5942,
"loss/crossentropy": 2.174471014738083,
"loss/hidden": 3.534765625,
"loss/jsd": 0.0,
"loss/logits": 0.21107099391520023,
"step": 1480
},
{
"epoch": 0.03725,
"grad_norm": 40.5,
"grad_norm_var": 7.112239583333333,
"learning_rate": 0.0001,
"loss": 7.5362,
"loss/crossentropy": 2.069592148065567,
"loss/hidden": 3.373046875,
"loss/jsd": 0.0,
"loss/logits": 0.19253854881972074,
"step": 1490
},
{
"epoch": 0.0375,
"grad_norm": 33.25,
"grad_norm_var": 6.4869140625,
"learning_rate": 0.0001,
"loss": 7.5941,
"loss/crossentropy": 2.0679068714380264,
"loss/hidden": 3.453125,
"loss/jsd": 0.0,
"loss/logits": 0.20371587071567773,
"step": 1500
},
{
"epoch": 0.03775,
"grad_norm": 30.875,
"grad_norm_var": 2.5686848958333335,
"learning_rate": 0.0001,
"loss": 7.6557,
"loss/crossentropy": 2.1364961892366408,
"loss/hidden": 3.4546875,
"loss/jsd": 0.0,
"loss/logits": 0.20447015166282653,
"step": 1510
},
{
"epoch": 0.038,
"grad_norm": 32.5,
"grad_norm_var": 2.6738932291666666,
"learning_rate": 0.0001,
"loss": 7.622,
"loss/crossentropy": 2.1275009989738463,
"loss/hidden": 3.431640625,
"loss/jsd": 0.0,
"loss/logits": 0.18973923586308955,
"step": 1520
},
{
"epoch": 0.03825,
"grad_norm": 32.5,
"grad_norm_var": 1.3177083333333333,
"learning_rate": 0.0001,
"loss": 7.7366,
"loss/crossentropy": 2.212426933646202,
"loss/hidden": 3.44296875,
"loss/jsd": 0.0,
"loss/logits": 0.21735910680145026,
"step": 1530
},
{
"epoch": 0.0385,
"grad_norm": 27.75,
"grad_norm_var": 2.7348307291666667,
"learning_rate": 0.0001,
"loss": 7.6356,
"loss/crossentropy": 2.1074824020266534,
"loss/hidden": 3.39609375,
"loss/jsd": 0.0,
"loss/logits": 0.19886015299707652,
"step": 1540
},
{
"epoch": 0.03875,
"grad_norm": 31.75,
"grad_norm_var": 4.234830729166666,
"learning_rate": 0.0001,
"loss": 7.7567,
"loss/crossentropy": 2.177932971715927,
"loss/hidden": 3.5609375,
"loss/jsd": 0.0,
"loss/logits": 0.2221821215003729,
"step": 1550
},
{
"epoch": 0.039,
"grad_norm": 36.25,
"grad_norm_var": 2.8791015625,
"learning_rate": 0.0001,
"loss": 7.5766,
"loss/crossentropy": 2.0759232968091963,
"loss/hidden": 3.386328125,
"loss/jsd": 0.0,
"loss/logits": 0.19086231291294098,
"step": 1560
},
{
"epoch": 0.03925,
"grad_norm": 32.0,
"grad_norm_var": 3.7046223958333333,
"learning_rate": 0.0001,
"loss": 7.537,
"loss/crossentropy": 2.2487298890948297,
"loss/hidden": 3.3515625,
"loss/jsd": 0.0,
"loss/logits": 0.19235554365441204,
"step": 1570
},
{
"epoch": 0.0395,
"grad_norm": 33.0,
"grad_norm_var": 2.5927083333333334,
"learning_rate": 0.0001,
"loss": 7.5167,
"loss/crossentropy": 2.2023983120918276,
"loss/hidden": 3.405078125,
"loss/jsd": 0.0,
"loss/logits": 0.20318181458860635,
"step": 1580
},
{
"epoch": 0.03975,
"grad_norm": 30.0,
"grad_norm_var": 3.4436848958333335,
"learning_rate": 0.0001,
"loss": 7.5528,
"loss/crossentropy": 1.9812082558870316,
"loss/hidden": 3.426171875,
"loss/jsd": 0.0,
"loss/logits": 0.19151438660919667,
"step": 1590
},
{
"epoch": 0.04,
"grad_norm": 30.0,
"grad_norm_var": 2.6098307291666667,
"learning_rate": 0.0001,
"loss": 7.5232,
"loss/crossentropy": 2.0910235196352005,
"loss/hidden": 3.4640625,
"loss/jsd": 0.0,
"loss/logits": 0.20488944984972476,
"step": 1600
},
{
"epoch": 0.04025,
"grad_norm": 30.0,
"grad_norm_var": 1.5854166666666667,
"learning_rate": 0.0001,
"loss": 7.7078,
"loss/crossentropy": 2.079045358300209,
"loss/hidden": 3.483203125,
"loss/jsd": 0.0,
"loss/logits": 0.20172932054847478,
"step": 1610
},
{
"epoch": 0.0405,
"grad_norm": 33.5,
"grad_norm_var": 3.4368798010809257e+18,
"learning_rate": 0.0001,
"loss": 7.5441,
"loss/crossentropy": 2.146761792898178,
"loss/hidden": 3.562890625,
"loss/jsd": 0.0,
"loss/logits": 0.20237026009708642,
"step": 1620
},
{
"epoch": 0.04075,
"grad_norm": 31.625,
"grad_norm_var": 3.4368798024404393e+18,
"learning_rate": 0.0001,
"loss": 7.7162,
"loss/crossentropy": 2.1575208425521852,
"loss/hidden": 3.35703125,
"loss/jsd": 0.0,
"loss/logits": 0.1914537126198411,
"step": 1630
},
{
"epoch": 0.041,
"grad_norm": 29.0,
"grad_norm_var": 37.940625,
"learning_rate": 0.0001,
"loss": 7.5673,
"loss/crossentropy": 2.2059059768915175,
"loss/hidden": 3.385546875,
"loss/jsd": 0.0,
"loss/logits": 0.1920375470072031,
"step": 1640
},
{
"epoch": 0.04125,
"grad_norm": 30.75,
"grad_norm_var": 61.631184895833336,
"learning_rate": 0.0001,
"loss": 7.5788,
"loss/crossentropy": 2.1531882882118225,
"loss/hidden": 3.3765625,
"loss/jsd": 0.0,
"loss/logits": 0.192771671153605,
"step": 1650
},
{
"epoch": 0.0415,
"grad_norm": 35.25,
"grad_norm_var": 47.353125,
"learning_rate": 0.0001,
"loss": 7.5891,
"loss/crossentropy": 2.1217811673879625,
"loss/hidden": 3.4046875,
"loss/jsd": 0.0,
"loss/logits": 0.19420051630586385,
"step": 1660
},
{
"epoch": 0.04175,
"grad_norm": 29.875,
"grad_norm_var": 21.580989583333334,
"learning_rate": 0.0001,
"loss": 7.6361,
"loss/crossentropy": 2.0970900297164916,
"loss/hidden": 3.48828125,
"loss/jsd": 0.0,
"loss/logits": 0.21125762071460485,
"step": 1670
},
{
"epoch": 0.042,
"grad_norm": 36.5,
"grad_norm_var": 72.77916666666667,
"learning_rate": 0.0001,
"loss": 7.7045,
"loss/crossentropy": 2.1662445843219755,
"loss/hidden": 3.441015625,
"loss/jsd": 0.0,
"loss/logits": 0.21249269619584082,
"step": 1680
},
{
"epoch": 0.04225,
"grad_norm": 32.75,
"grad_norm_var": 72.42473958333333,
"learning_rate": 0.0001,
"loss": 7.5754,
"loss/crossentropy": 2.1745404630899428,
"loss/hidden": 3.478125,
"loss/jsd": 0.0,
"loss/logits": 0.20168567337095739,
"step": 1690
},
{
"epoch": 0.0425,
"grad_norm": 32.5,
"grad_norm_var": 9.149739583333334,
"learning_rate": 0.0001,
"loss": 7.5555,
"loss/crossentropy": 2.126581160724163,
"loss/hidden": 3.34296875,
"loss/jsd": 0.0,
"loss/logits": 0.1933064443990588,
"step": 1700
},
{
"epoch": 0.04275,
"grad_norm": 32.0,
"grad_norm_var": 10.1369140625,
"learning_rate": 0.0001,
"loss": 7.5377,
"loss/crossentropy": 2.1726800590753554,
"loss/hidden": 3.441796875,
"loss/jsd": 0.0,
"loss/logits": 0.20911512654274703,
"step": 1710
},
{
"epoch": 0.043,
"grad_norm": 31.0,
"grad_norm_var": 3.314322916666667,
"learning_rate": 0.0001,
"loss": 7.6106,
"loss/crossentropy": 1.9999253153800964,
"loss/hidden": 3.43046875,
"loss/jsd": 0.0,
"loss/logits": 0.18493321686983108,
"step": 1720
},
{
"epoch": 0.04325,
"grad_norm": 38.0,
"grad_norm_var": 6.673958333333333,
"learning_rate": 0.0001,
"loss": 7.7146,
"loss/crossentropy": 2.1180618047714233,
"loss/hidden": 3.46953125,
"loss/jsd": 0.0,
"loss/logits": 0.2076649811118841,
"step": 1730
},
{
"epoch": 0.0435,
"grad_norm": 34.0,
"grad_norm_var": 7.339322916666666,
"learning_rate": 0.0001,
"loss": 7.5464,
"loss/crossentropy": 2.0583778262138366,
"loss/hidden": 3.322265625,
"loss/jsd": 0.0,
"loss/logits": 0.18418019600212573,
"step": 1740
},
{
"epoch": 0.04375,
"grad_norm": 31.0,
"grad_norm_var": 4.749739583333334,
"learning_rate": 0.0001,
"loss": 7.5528,
"loss/crossentropy": 2.2259044647216797,
"loss/hidden": 3.394921875,
"loss/jsd": 0.0,
"loss/logits": 0.2031107559800148,
"step": 1750
},
{
"epoch": 0.044,
"grad_norm": 30.75,
"grad_norm_var": 3.1389973958333335,
"learning_rate": 0.0001,
"loss": 7.5321,
"loss/crossentropy": 2.124616578221321,
"loss/hidden": 3.46484375,
"loss/jsd": 0.0,
"loss/logits": 0.2149519257247448,
"step": 1760
},
{
"epoch": 0.04425,
"grad_norm": 28.625,
"grad_norm_var": 6.442708333333333,
"learning_rate": 0.0001,
"loss": 7.63,
"loss/crossentropy": 2.1418310686945916,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.19125681575387715,
"step": 1770
},
{
"epoch": 0.0445,
"grad_norm": 32.25,
"grad_norm_var": 3.011458333333333,
"learning_rate": 0.0001,
"loss": 7.5307,
"loss/crossentropy": 2.144363935291767,
"loss/hidden": 3.440625,
"loss/jsd": 0.0,
"loss/logits": 0.19334549438208343,
"step": 1780
},
{
"epoch": 0.04475,
"grad_norm": 32.75,
"grad_norm_var": 1.6559895833333333,
"learning_rate": 0.0001,
"loss": 7.627,
"loss/crossentropy": 2.2538854971528055,
"loss/hidden": 3.385546875,
"loss/jsd": 0.0,
"loss/logits": 0.2010333575308323,
"step": 1790
},
{
"epoch": 0.045,
"grad_norm": 31.5,
"grad_norm_var": 1.9416015625,
"learning_rate": 0.0001,
"loss": 7.6471,
"loss/crossentropy": 2.091226762533188,
"loss/hidden": 3.450390625,
"loss/jsd": 0.0,
"loss/logits": 0.2152847982943058,
"step": 1800
},
{
"epoch": 0.04525,
"grad_norm": 28.875,
"grad_norm_var": 18.03515625,
"learning_rate": 0.0001,
"loss": 7.5626,
"loss/crossentropy": 2.295862782001495,
"loss/hidden": 3.3984375,
"loss/jsd": 0.0,
"loss/logits": 0.20952691733837128,
"step": 1810
},
{
"epoch": 0.0455,
"grad_norm": 30.75,
"grad_norm_var": 18.4541015625,
"learning_rate": 0.0001,
"loss": 7.6872,
"loss/crossentropy": 2.0833657890558244,
"loss/hidden": 3.4875,
"loss/jsd": 0.0,
"loss/logits": 0.2070673793554306,
"step": 1820
},
{
"epoch": 0.04575,
"grad_norm": 30.25,
"grad_norm_var": 2.812239583333333,
"learning_rate": 0.0001,
"loss": 7.647,
"loss/crossentropy": 2.1021256439387797,
"loss/hidden": 3.387890625,
"loss/jsd": 0.0,
"loss/logits": 0.18996517434716226,
"step": 1830
},
{
"epoch": 0.046,
"grad_norm": 33.75,
"grad_norm_var": 2.7009765625,
"learning_rate": 0.0001,
"loss": 7.5203,
"loss/crossentropy": 2.2004275381565095,
"loss/hidden": 3.314453125,
"loss/jsd": 0.0,
"loss/logits": 0.187726416811347,
"step": 1840
},
{
"epoch": 0.04625,
"grad_norm": 30.0,
"grad_norm_var": 4.067643229166666,
"learning_rate": 0.0001,
"loss": 7.6201,
"loss/crossentropy": 2.1705893486738206,
"loss/hidden": 3.522265625,
"loss/jsd": 0.0,
"loss/logits": 0.21116435080766677,
"step": 1850
},
{
"epoch": 0.0465,
"grad_norm": 32.5,
"grad_norm_var": 5.030989583333334,
"learning_rate": 0.0001,
"loss": 7.6235,
"loss/crossentropy": 2.1204057022929192,
"loss/hidden": 3.351953125,
"loss/jsd": 0.0,
"loss/logits": 0.19528388790786266,
"step": 1860
},
{
"epoch": 0.04675,
"grad_norm": 45.0,
"grad_norm_var": 15.6603515625,
"learning_rate": 0.0001,
"loss": 7.4562,
"loss/crossentropy": 2.154220977425575,
"loss/hidden": 3.4125,
"loss/jsd": 0.0,
"loss/logits": 0.19214881088119,
"step": 1870
},
{
"epoch": 0.047,
"grad_norm": 32.5,
"grad_norm_var": 15.614518229166666,
"learning_rate": 0.0001,
"loss": 7.6258,
"loss/crossentropy": 2.1000912792980673,
"loss/hidden": 3.459375,
"loss/jsd": 0.0,
"loss/logits": 0.20185065623372794,
"step": 1880
},
{
"epoch": 0.04725,
"grad_norm": 33.25,
"grad_norm_var": 5.931184895833334,
"learning_rate": 0.0001,
"loss": 7.4863,
"loss/crossentropy": 2.2270338363945483,
"loss/hidden": 3.337890625,
"loss/jsd": 0.0,
"loss/logits": 0.19286383930593728,
"step": 1890
},
{
"epoch": 0.0475,
"grad_norm": 32.5,
"grad_norm_var": 3.311393229166667,
"learning_rate": 0.0001,
"loss": 7.6001,
"loss/crossentropy": 2.1998794853687285,
"loss/hidden": 3.382421875,
"loss/jsd": 0.0,
"loss/logits": 0.19646506551653148,
"step": 1900
},
{
"epoch": 0.04775,
"grad_norm": 31.375,
"grad_norm_var": 3.577018229166667,
"learning_rate": 0.0001,
"loss": 7.653,
"loss/crossentropy": 2.104053999483585,
"loss/hidden": 3.48203125,
"loss/jsd": 0.0,
"loss/logits": 0.201510801166296,
"step": 1910
},
{
"epoch": 0.048,
"grad_norm": 30.5,
"grad_norm_var": 17.68125,
"learning_rate": 0.0001,
"loss": 7.5273,
"loss/crossentropy": 2.1184800997376443,
"loss/hidden": 3.358984375,
"loss/jsd": 0.0,
"loss/logits": 0.19248204957693815,
"step": 1920
},
{
"epoch": 0.04825,
"grad_norm": 33.5,
"grad_norm_var": 1.3,
"learning_rate": 0.0001,
"loss": 7.655,
"loss/crossentropy": 2.1734499007463457,
"loss/hidden": 3.455859375,
"loss/jsd": 0.0,
"loss/logits": 0.1961166137829423,
"step": 1930
},
{
"epoch": 0.0485,
"grad_norm": 30.125,
"grad_norm_var": 2.5205729166666666,
"learning_rate": 0.0001,
"loss": 7.6186,
"loss/crossentropy": 2.237542712688446,
"loss/hidden": 3.400390625,
"loss/jsd": 0.0,
"loss/logits": 0.20616979897022247,
"step": 1940
},
{
"epoch": 0.04875,
"grad_norm": 30.75,
"grad_norm_var": 2.810416666666667,
"learning_rate": 0.0001,
"loss": 7.5436,
"loss/crossentropy": 2.1811093270778654,
"loss/hidden": 3.40234375,
"loss/jsd": 0.0,
"loss/logits": 0.1928004425019026,
"step": 1950
},
{
"epoch": 0.049,
"grad_norm": 34.25,
"grad_norm_var": 1.5181640625,
"learning_rate": 0.0001,
"loss": 7.6175,
"loss/crossentropy": 2.045266591012478,
"loss/hidden": 3.45078125,
"loss/jsd": 0.0,
"loss/logits": 0.21615136358886958,
"step": 1960
},
{
"epoch": 0.04925,
"grad_norm": 33.5,
"grad_norm_var": 26.225455729166665,
"learning_rate": 0.0001,
"loss": 7.7083,
"loss/crossentropy": 2.086462992429733,
"loss/hidden": 3.469921875,
"loss/jsd": 0.0,
"loss/logits": 0.2192224683240056,
"step": 1970
},
{
"epoch": 0.0495,
"grad_norm": 29.75,
"grad_norm_var": 8.340625,
"learning_rate": 0.0001,
"loss": 7.5032,
"loss/crossentropy": 2.0052025958895685,
"loss/hidden": 3.407421875,
"loss/jsd": 0.0,
"loss/logits": 0.18473264537751674,
"step": 1980
},
{
"epoch": 0.04975,
"grad_norm": 44.5,
"grad_norm_var": 4635.33515625,
"learning_rate": 0.0001,
"loss": 7.5567,
"loss/crossentropy": 2.04089385792613,
"loss/hidden": 3.394921875,
"loss/jsd": 0.0,
"loss/logits": 0.19560968028381467,
"step": 1990
},
{
"epoch": 0.05,
"grad_norm": 51.0,
"grad_norm_var": 4555.746875,
"learning_rate": 0.0001,
"loss": 7.6029,
"loss/crossentropy": 2.2241372987627983,
"loss/hidden": 3.387109375,
"loss/jsd": 0.0,
"loss/logits": 0.2209097046405077,
"step": 2000
},
{
"epoch": 0.05025,
"grad_norm": 32.5,
"grad_norm_var": 49.11608072916667,
"learning_rate": 0.0001,
"loss": 7.5781,
"loss/crossentropy": 2.2160742044448853,
"loss/hidden": 3.4171875,
"loss/jsd": 0.0,
"loss/logits": 0.19539013858884574,
"step": 2010
},
{
"epoch": 0.0505,
"grad_norm": 28.625,
"grad_norm_var": 73.70201822916667,
"learning_rate": 0.0001,
"loss": 7.6375,
"loss/crossentropy": 2.094863271713257,
"loss/hidden": 3.413671875,
"loss/jsd": 0.0,
"loss/logits": 0.20763578601181507,
"step": 2020
},
{
"epoch": 0.05075,
"grad_norm": 30.125,
"grad_norm_var": 22.494791666666668,
"learning_rate": 0.0001,
"loss": 7.6164,
"loss/crossentropy": 2.121490868926048,
"loss/hidden": 3.402734375,
"loss/jsd": 0.0,
"loss/logits": 0.19190637897700072,
"step": 2030
},
{
"epoch": 0.051,
"grad_norm": 34.5,
"grad_norm_var": 20.549739583333334,
"learning_rate": 0.0001,
"loss": 7.5717,
"loss/crossentropy": 2.0418698236346247,
"loss/hidden": 3.390625,
"loss/jsd": 0.0,
"loss/logits": 0.18076165094971658,
"step": 2040
},
{
"epoch": 0.05125,
"grad_norm": 29.125,
"grad_norm_var": 14.407291666666667,
"learning_rate": 0.0001,
"loss": 7.5867,
"loss/crossentropy": 2.1989556729793547,
"loss/hidden": 3.385546875,
"loss/jsd": 0.0,
"loss/logits": 0.2011225748807192,
"step": 2050
},
{
"epoch": 0.0515,
"grad_norm": 32.75,
"grad_norm_var": 17.120572916666667,
"learning_rate": 0.0001,
"loss": 7.6813,
"loss/crossentropy": 2.073545518517494,
"loss/hidden": 3.546484375,
"loss/jsd": 0.0,
"loss/logits": 0.22323863469064237,
"step": 2060
},
{
"epoch": 0.05175,
"grad_norm": 42.25,
"grad_norm_var": 76.27291666666666,
"learning_rate": 0.0001,
"loss": 7.6323,
"loss/crossentropy": 2.1424515694379807,
"loss/hidden": 3.44140625,
"loss/jsd": 0.0,
"loss/logits": 0.19750071745365858,
"step": 2070
},
{
"epoch": 0.052,
"grad_norm": 30.625,
"grad_norm_var": 74.925,
"learning_rate": 0.0001,
"loss": 7.6198,
"loss/crossentropy": 2.006666135787964,
"loss/hidden": 3.463671875,
"loss/jsd": 0.0,
"loss/logits": 0.19839615989476442,
"step": 2080
},
{
"epoch": 0.05225,
"grad_norm": 41.5,
"grad_norm_var": 30.869791666666668,
"learning_rate": 0.0001,
"loss": 7.5671,
"loss/crossentropy": 2.2227450221776963,
"loss/hidden": 3.451171875,
"loss/jsd": 0.0,
"loss/logits": 0.20443473970517517,
"step": 2090
},
{
"epoch": 0.0525,
"grad_norm": 34.75,
"grad_norm_var": 24.609309895833334,
"learning_rate": 0.0001,
"loss": 7.5928,
"loss/crossentropy": 2.186553081870079,
"loss/hidden": 3.378125,
"loss/jsd": 0.0,
"loss/logits": 0.18928833175450563,
"step": 2100
},
{
"epoch": 0.05275,
"grad_norm": 29.125,
"grad_norm_var": 18.37265625,
"learning_rate": 0.0001,
"loss": 7.7389,
"loss/crossentropy": 2.0680640071630476,
"loss/hidden": 3.4515625,
"loss/jsd": 0.0,
"loss/logits": 0.201955908536911,
"step": 2110
},
{
"epoch": 0.053,
"grad_norm": 33.5,
"grad_norm_var": 9.276497395833333,
"learning_rate": 0.0001,
"loss": 7.6148,
"loss/crossentropy": 2.1981600403785704,
"loss/hidden": 3.352734375,
"loss/jsd": 0.0,
"loss/logits": 0.1901057105511427,
"step": 2120
},
{
"epoch": 0.05325,
"grad_norm": 31.75,
"grad_norm_var": 5.308072916666666,
"learning_rate": 0.0001,
"loss": 7.48,
"loss/crossentropy": 2.2574460208415985,
"loss/hidden": 3.362109375,
"loss/jsd": 0.0,
"loss/logits": 0.19361322987824678,
"step": 2130
},
{
"epoch": 0.0535,
"grad_norm": 29.625,
"grad_norm_var": 8.5900390625,
"learning_rate": 0.0001,
"loss": 7.6644,
"loss/crossentropy": 2.154197073727846,
"loss/hidden": 3.50625,
"loss/jsd": 0.0,
"loss/logits": 0.2086074635386467,
"step": 2140
},
{
"epoch": 0.05375,
"grad_norm": 30.5,
"grad_norm_var": 8.333072916666667,
"learning_rate": 0.0001,
"loss": 7.5279,
"loss/crossentropy": 2.1245498836040495,
"loss/hidden": 3.37265625,
"loss/jsd": 0.0,
"loss/logits": 0.19000006280839443,
"step": 2150
},
{
"epoch": 0.054,
"grad_norm": 31.375,
"grad_norm_var": 10.856184895833334,
"learning_rate": 0.0001,
"loss": 7.7098,
"loss/crossentropy": 2.1968549311161043,
"loss/hidden": 3.469140625,
"loss/jsd": 0.0,
"loss/logits": 0.21285793352872134,
"step": 2160
},
{
"epoch": 0.05425,
"grad_norm": 34.25,
"grad_norm_var": 17.820572916666666,
"learning_rate": 0.0001,
"loss": 7.7095,
"loss/crossentropy": 2.2533976465463637,
"loss/hidden": 3.434375,
"loss/jsd": 0.0,
"loss/logits": 0.20727334953844548,
"step": 2170
},
{
"epoch": 0.0545,
"grad_norm": 36.0,
"grad_norm_var": 22.128580729166668,
"learning_rate": 0.0001,
"loss": 7.5933,
"loss/crossentropy": 2.227231651544571,
"loss/hidden": 3.380078125,
"loss/jsd": 0.0,
"loss/logits": 0.19693543761968613,
"step": 2180
},
{
"epoch": 0.05475,
"grad_norm": 29.125,
"grad_norm_var": 11.466666666666667,
"learning_rate": 0.0001,
"loss": 7.5348,
"loss/crossentropy": 2.1581582985818386,
"loss/hidden": 3.397265625,
"loss/jsd": 0.0,
"loss/logits": 0.19432583590969443,
"step": 2190
},
{
"epoch": 0.055,
"grad_norm": 37.25,
"grad_norm_var": 10.514518229166667,
"learning_rate": 0.0001,
"loss": 7.6264,
"loss/crossentropy": 2.0908095851540565,
"loss/hidden": 3.39296875,
"loss/jsd": 0.0,
"loss/logits": 0.20568534098565577,
"step": 2200
},
{
"epoch": 0.05525,
"grad_norm": 33.25,
"grad_norm_var": 5.6494140625,
"learning_rate": 0.0001,
"loss": 7.671,
"loss/crossentropy": 2.0827252097427844,
"loss/hidden": 3.54921875,
"loss/jsd": 0.0,
"loss/logits": 0.20747530292719601,
"step": 2210
},
{
"epoch": 0.0555,
"grad_norm": 31.875,
"grad_norm_var": 47.234375,
"learning_rate": 0.0001,
"loss": 7.5823,
"loss/crossentropy": 2.2815075665712357,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.20698099359869956,
"step": 2220
},
{
"epoch": 0.05575,
"grad_norm": 37.5,
"grad_norm_var": 30.277018229166668,
"learning_rate": 0.0001,
"loss": 7.5812,
"loss/crossentropy": 2.2282338082790374,
"loss/hidden": 3.365234375,
"loss/jsd": 0.0,
"loss/logits": 0.19244133178144693,
"step": 2230
},
{
"epoch": 0.056,
"grad_norm": 31.0,
"grad_norm_var": 27.68515625,
"learning_rate": 0.0001,
"loss": 7.6485,
"loss/crossentropy": 2.184544026851654,
"loss/hidden": 3.39375,
"loss/jsd": 0.0,
"loss/logits": 0.1996760057285428,
"step": 2240
},
{
"epoch": 0.05625,
"grad_norm": 30.125,
"grad_norm_var": 2.223372395833333,
"learning_rate": 0.0001,
"loss": 7.6003,
"loss/crossentropy": 2.1572179198265076,
"loss/hidden": 3.38984375,
"loss/jsd": 0.0,
"loss/logits": 0.20082181300967933,
"step": 2250
},
{
"epoch": 0.0565,
"grad_norm": 63.5,
"grad_norm_var": 240.61295572916666,
"learning_rate": 0.0001,
"loss": 7.6129,
"loss/crossentropy": 2.1616927281022074,
"loss/hidden": 3.494140625,
"loss/jsd": 0.0,
"loss/logits": 0.21196384327486156,
"step": 2260
},
{
"epoch": 0.05675,
"grad_norm": 29.875,
"grad_norm_var": 276.98932291666665,
"learning_rate": 0.0001,
"loss": 7.5683,
"loss/crossentropy": 2.1882350742816925,
"loss/hidden": 3.43671875,
"loss/jsd": 0.0,
"loss/logits": 0.2052389807999134,
"step": 2270
},
{
"epoch": 0.057,
"grad_norm": 33.5,
"grad_norm_var": 11.6681640625,
"learning_rate": 0.0001,
"loss": 7.6076,
"loss/crossentropy": 2.1808354407548904,
"loss/hidden": 3.540625,
"loss/jsd": 0.0,
"loss/logits": 0.21027475781738758,
"step": 2280
},
{
"epoch": 0.05725,
"grad_norm": 30.5,
"grad_norm_var": 1.6872395833333333,
"learning_rate": 0.0001,
"loss": 7.5295,
"loss/crossentropy": 2.1488232225179673,
"loss/hidden": 3.496875,
"loss/jsd": 0.0,
"loss/logits": 0.2111368477344513,
"step": 2290
},
{
"epoch": 0.0575,
"grad_norm": 30.125,
"grad_norm_var": 2.0603515625,
"learning_rate": 0.0001,
"loss": 7.6875,
"loss/crossentropy": 2.1594634115695954,
"loss/hidden": 3.384765625,
"loss/jsd": 0.0,
"loss/logits": 0.20478933807462454,
"step": 2300
},
{
"epoch": 0.05775,
"grad_norm": 31.625,
"grad_norm_var": 3.408072916666667,
"learning_rate": 0.0001,
"loss": 7.6636,
"loss/crossentropy": 2.0932443618774412,
"loss/hidden": 3.484375,
"loss/jsd": 0.0,
"loss/logits": 0.21733616031706332,
"step": 2310
},
{
"epoch": 0.058,
"grad_norm": 30.5,
"grad_norm_var": 3.5416015625,
"learning_rate": 0.0001,
"loss": 7.5737,
"loss/crossentropy": 2.1831247925758364,
"loss/hidden": 3.409765625,
"loss/jsd": 0.0,
"loss/logits": 0.1989585768431425,
"step": 2320
},
{
"epoch": 0.05825,
"grad_norm": 30.25,
"grad_norm_var": 2.2393229166666666,
"learning_rate": 0.0001,
"loss": 7.5009,
"loss/crossentropy": 2.2847615987062455,
"loss/hidden": 3.424609375,
"loss/jsd": 0.0,
"loss/logits": 0.20479252003133297,
"step": 2330
},
{
"epoch": 0.0585,
"grad_norm": 33.0,
"grad_norm_var": 6.374934895833333,
"learning_rate": 0.0001,
"loss": 7.7313,
"loss/crossentropy": 2.1928456306457518,
"loss/hidden": 3.401953125,
"loss/jsd": 0.0,
"loss/logits": 0.20380632691085337,
"step": 2340
},
{
"epoch": 0.05875,
"grad_norm": 29.125,
"grad_norm_var": 4.7697265625,
"learning_rate": 0.0001,
"loss": 7.5476,
"loss/crossentropy": 2.1438629984855653,
"loss/hidden": 3.321875,
"loss/jsd": 0.0,
"loss/logits": 0.19849517289549112,
"step": 2350
},
{
"epoch": 0.059,
"grad_norm": 33.5,
"grad_norm_var": 3.8893229166666665,
"learning_rate": 0.0001,
"loss": 7.6545,
"loss/crossentropy": 2.1132273241877555,
"loss/hidden": 3.419140625,
"loss/jsd": 0.0,
"loss/logits": 0.21593583207577466,
"step": 2360
},
{
"epoch": 0.05925,
"grad_norm": 38.0,
"grad_norm_var": 3.9934895833333335,
"learning_rate": 0.0001,
"loss": 7.6438,
"loss/crossentropy": 2.1729103833436967,
"loss/hidden": 3.4515625,
"loss/jsd": 0.0,
"loss/logits": 0.204007020406425,
"step": 2370
},
{
"epoch": 0.0595,
"grad_norm": 28.875,
"grad_norm_var": 4.180989583333333,
"learning_rate": 0.0001,
"loss": 7.5261,
"loss/crossentropy": 2.1972236961126326,
"loss/hidden": 3.37421875,
"loss/jsd": 0.0,
"loss/logits": 0.21200564429163932,
"step": 2380
},
{
"epoch": 0.05975,
"grad_norm": 30.375,
"grad_norm_var": 2.9854166666666666,
"learning_rate": 0.0001,
"loss": 7.6233,
"loss/crossentropy": 2.130857673287392,
"loss/hidden": 3.398828125,
"loss/jsd": 0.0,
"loss/logits": 0.2043182048946619,
"step": 2390
},
{
"epoch": 0.06,
"grad_norm": 33.0,
"grad_norm_var": 2.552018229166667,
"learning_rate": 0.0001,
"loss": 7.5626,
"loss/crossentropy": 2.1780240714550017,
"loss/hidden": 3.4515625,
"loss/jsd": 0.0,
"loss/logits": 0.19501971434801818,
"step": 2400
},
{
"epoch": 0.06025,
"grad_norm": 33.25,
"grad_norm_var": 1.4978515625,
"learning_rate": 0.0001,
"loss": 7.5495,
"loss/crossentropy": 2.075847564637661,
"loss/hidden": 3.377734375,
"loss/jsd": 0.0,
"loss/logits": 0.19856880297884344,
"step": 2410
},
{
"epoch": 0.0605,
"grad_norm": 30.375,
"grad_norm_var": 8.763541666666667,
"learning_rate": 0.0001,
"loss": 7.5161,
"loss/crossentropy": 1.960635770857334,
"loss/hidden": 3.521484375,
"loss/jsd": 0.0,
"loss/logits": 0.2014876109547913,
"step": 2420
},
{
"epoch": 0.06075,
"grad_norm": 30.375,
"grad_norm_var": 5.85390625,
"learning_rate": 0.0001,
"loss": 7.6453,
"loss/crossentropy": 2.235329329967499,
"loss/hidden": 3.438671875,
"loss/jsd": 0.0,
"loss/logits": 0.22043778821825982,
"step": 2430
},
{
"epoch": 0.061,
"grad_norm": 35.0,
"grad_norm_var": 2.928465629875169e+18,
"learning_rate": 0.0001,
"loss": 7.5153,
"loss/crossentropy": 2.0534446865320204,
"loss/hidden": 3.469140625,
"loss/jsd": 0.0,
"loss/logits": 0.20287865065038205,
"step": 2440
},
{
"epoch": 0.06125,
"grad_norm": 31.0,
"grad_norm_var": 2.928465629810996e+18,
"learning_rate": 0.0001,
"loss": 7.5862,
"loss/crossentropy": 2.1208725392818453,
"loss/hidden": 3.534765625,
"loss/jsd": 0.0,
"loss/logits": 0.22640016246587039,
"step": 2450
},
{
"epoch": 0.0615,
"grad_norm": 31.125,
"grad_norm_var": 1.5614583333333334,
"learning_rate": 0.0001,
"loss": 7.5751,
"loss/crossentropy": 2.206133508682251,
"loss/hidden": 3.387109375,
"loss/jsd": 0.0,
"loss/logits": 0.19071156904101372,
"step": 2460
},
{
"epoch": 0.06175,
"grad_norm": 28.625,
"grad_norm_var": 7.667643229166667,
"learning_rate": 0.0001,
"loss": 7.5604,
"loss/crossentropy": 2.227986590564251,
"loss/hidden": 3.30859375,
"loss/jsd": 0.0,
"loss/logits": 0.19392532519996167,
"step": 2470
},
{
"epoch": 0.062,
"grad_norm": 34.0,
"grad_norm_var": 5.81015625,
"learning_rate": 0.0001,
"loss": 7.6077,
"loss/crossentropy": 2.1948474526405333,
"loss/hidden": 3.427734375,
"loss/jsd": 0.0,
"loss/logits": 0.2092638686299324,
"step": 2480
},
{
"epoch": 0.06225,
"grad_norm": 34.5,
"grad_norm_var": 7.656184895833333,
"learning_rate": 0.0001,
"loss": 7.4338,
"loss/crossentropy": 1.935844713449478,
"loss/hidden": 3.413671875,
"loss/jsd": 0.0,
"loss/logits": 0.184383431263268,
"step": 2490
},
{
"epoch": 0.0625,
"grad_norm": 42.75,
"grad_norm_var": 17.27890625,
"learning_rate": 0.0001,
"loss": 7.5645,
"loss/crossentropy": 2.2957077413797378,
"loss/hidden": 3.409765625,
"loss/jsd": 0.0,
"loss/logits": 0.2223764518275857,
"step": 2500
},
{
"epoch": 0.06275,
"grad_norm": 30.125,
"grad_norm_var": 31.021809895833332,
"learning_rate": 0.0001,
"loss": 7.5539,
"loss/crossentropy": 2.2163919866085053,
"loss/hidden": 3.365234375,
"loss/jsd": 0.0,
"loss/logits": 0.2129704337567091,
"step": 2510
},
{
"epoch": 0.063,
"grad_norm": 33.25,
"grad_norm_var": 40.36764322916667,
"learning_rate": 0.0001,
"loss": 7.5664,
"loss/crossentropy": 2.08260739967227,
"loss/hidden": 3.4765625,
"loss/jsd": 0.0,
"loss/logits": 0.20597089193761348,
"step": 2520
},
{
"epoch": 0.06325,
"grad_norm": 35.5,
"grad_norm_var": 34.153125,
"learning_rate": 0.0001,
"loss": 7.559,
"loss/crossentropy": 2.0747475802898405,
"loss/hidden": 3.383203125,
"loss/jsd": 0.0,
"loss/logits": 0.1930603832937777,
"step": 2530
},
{
"epoch": 0.0635,
"grad_norm": 31.25,
"grad_norm_var": 12.302018229166666,
"learning_rate": 0.0001,
"loss": 7.609,
"loss/crossentropy": 2.1542711734771727,
"loss/hidden": 3.391796875,
"loss/jsd": 0.0,
"loss/logits": 0.19414089974015952,
"step": 2540
},
{
"epoch": 0.06375,
"grad_norm": 29.125,
"grad_norm_var": 34.333072916666666,
"learning_rate": 0.0001,
"loss": 7.5103,
"loss/crossentropy": 2.1697633042931557,
"loss/hidden": 3.325390625,
"loss/jsd": 0.0,
"loss/logits": 0.18199986461549997,
"step": 2550
},
{
"epoch": 0.064,
"grad_norm": 30.75,
"grad_norm_var": 21.8322265625,
"learning_rate": 0.0001,
"loss": 7.6335,
"loss/crossentropy": 2.20294488966465,
"loss/hidden": 3.35703125,
"loss/jsd": 0.0,
"loss/logits": 0.20988359525799752,
"step": 2560
},
{
"epoch": 0.06425,
"grad_norm": 36.5,
"grad_norm_var": 21.339322916666667,
"learning_rate": 0.0001,
"loss": 7.665,
"loss/crossentropy": 2.1367219746112824,
"loss/hidden": 3.4421875,
"loss/jsd": 0.0,
"loss/logits": 0.19768773801624775,
"step": 2570
},
{
"epoch": 0.0645,
"grad_norm": 42.5,
"grad_norm_var": 30.690625,
"learning_rate": 0.0001,
"loss": 7.6476,
"loss/crossentropy": 2.2177338540554046,
"loss/hidden": 3.605078125,
"loss/jsd": 0.0,
"loss/logits": 0.22041462864726782,
"step": 2580
},
{
"epoch": 0.06475,
"grad_norm": 34.0,
"grad_norm_var": 31.164583333333333,
"learning_rate": 0.0001,
"loss": 7.6158,
"loss/crossentropy": 2.2423059731721877,
"loss/hidden": 3.358984375,
"loss/jsd": 0.0,
"loss/logits": 0.21196699403226377,
"step": 2590
},
{
"epoch": 0.065,
"grad_norm": 34.25,
"grad_norm_var": 23.593684895833334,
"learning_rate": 0.0001,
"loss": 7.5107,
"loss/crossentropy": 2.241020438075066,
"loss/hidden": 3.25859375,
"loss/jsd": 0.0,
"loss/logits": 0.18191274981945754,
"step": 2600
},
{
"epoch": 0.06525,
"grad_norm": 28.75,
"grad_norm_var": 30.975455729166665,
"learning_rate": 0.0001,
"loss": 7.5492,
"loss/crossentropy": 2.052956056594849,
"loss/hidden": 3.447265625,
"loss/jsd": 0.0,
"loss/logits": 0.2021762602031231,
"step": 2610
},
{
"epoch": 0.0655,
"grad_norm": 32.5,
"grad_norm_var": 31.032291666666666,
"learning_rate": 0.0001,
"loss": 7.503,
"loss/crossentropy": 2.17630957365036,
"loss/hidden": 3.364453125,
"loss/jsd": 0.0,
"loss/logits": 0.19023955501616002,
"step": 2620
},
{
"epoch": 0.06575,
"grad_norm": 34.5,
"grad_norm_var": 6.083072916666667,
"learning_rate": 0.0001,
"loss": 7.5728,
"loss/crossentropy": 2.178831994533539,
"loss/hidden": 3.38046875,
"loss/jsd": 0.0,
"loss/logits": 0.1946109678596258,
"step": 2630
},
{
"epoch": 0.066,
"grad_norm": 33.5,
"grad_norm_var": 4.01640625,
"learning_rate": 0.0001,
"loss": 7.5928,
"loss/crossentropy": 2.0746659457683565,
"loss/hidden": 3.374609375,
"loss/jsd": 0.0,
"loss/logits": 0.19469854161143302,
"step": 2640
},
{
"epoch": 0.06625,
"grad_norm": 31.0,
"grad_norm_var": 21.718489583333334,
"learning_rate": 0.0001,
"loss": 7.6008,
"loss/crossentropy": 2.202178010344505,
"loss/hidden": 3.44609375,
"loss/jsd": 0.0,
"loss/logits": 0.2072868559509516,
"step": 2650
},
{
"epoch": 0.0665,
"grad_norm": 31.375,
"grad_norm_var": 24.743489583333332,
"learning_rate": 0.0001,
"loss": 7.5532,
"loss/crossentropy": 2.0579031884670256,
"loss/hidden": 3.33671875,
"loss/jsd": 0.0,
"loss/logits": 0.18248077742755414,
"step": 2660
},
{
"epoch": 0.06675,
"grad_norm": 29.5,
"grad_norm_var": 6.5181640625,
"learning_rate": 0.0001,
"loss": 7.6173,
"loss/crossentropy": 2.0323975652456285,
"loss/hidden": 3.47265625,
"loss/jsd": 0.0,
"loss/logits": 0.20279558952897786,
"step": 2670
},
{
"epoch": 0.067,
"grad_norm": 30.125,
"grad_norm_var": 9.7931640625,
"learning_rate": 0.0001,
"loss": 7.6202,
"loss/crossentropy": 2.170803511887789,
"loss/hidden": 3.448828125,
"loss/jsd": 0.0,
"loss/logits": 0.21887149531394243,
"step": 2680
},
{
"epoch": 0.06725,
"grad_norm": 28.75,
"grad_norm_var": 6.42890625,
"learning_rate": 0.0001,
"loss": 7.6042,
"loss/crossentropy": 2.129136848449707,
"loss/hidden": 3.401171875,
"loss/jsd": 0.0,
"loss/logits": 0.21714741103351115,
"step": 2690
},
{
"epoch": 0.0675,
"grad_norm": 28.5,
"grad_norm_var": 5.76640625,
"learning_rate": 0.0001,
"loss": 7.5307,
"loss/crossentropy": 2.2333458453416823,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.19759367052465676,
"step": 2700
},
{
"epoch": 0.06775,
"grad_norm": 31.25,
"grad_norm_var": 4.925,
"learning_rate": 0.0001,
"loss": 7.5684,
"loss/crossentropy": 2.1930347234010696,
"loss/hidden": 3.3265625,
"loss/jsd": 0.0,
"loss/logits": 0.19194095116108656,
"step": 2710
},
{
"epoch": 0.068,
"grad_norm": 29.875,
"grad_norm_var": 1.0291666666666666,
"learning_rate": 0.0001,
"loss": 7.6177,
"loss/crossentropy": 2.1858380883932114,
"loss/hidden": 3.350390625,
"loss/jsd": 0.0,
"loss/logits": 0.18830202352255582,
"step": 2720
},
{
"epoch": 0.06825,
"grad_norm": 33.25,
"grad_norm_var": 5.19765625,
"learning_rate": 0.0001,
"loss": 7.6231,
"loss/crossentropy": 2.1380992412567137,
"loss/hidden": 3.40703125,
"loss/jsd": 0.0,
"loss/logits": 0.19919480197131634,
"step": 2730
},
{
"epoch": 0.0685,
"grad_norm": 34.75,
"grad_norm_var": 2.5872395833333335,
"learning_rate": 0.0001,
"loss": 7.7166,
"loss/crossentropy": 2.299161267280579,
"loss/hidden": 3.403125,
"loss/jsd": 0.0,
"loss/logits": 0.21277474984526634,
"step": 2740
},
{
"epoch": 0.06875,
"grad_norm": 28.25,
"grad_norm_var": 10.6525390625,
"learning_rate": 0.0001,
"loss": 7.6385,
"loss/crossentropy": 2.0595856219530106,
"loss/hidden": 3.397265625,
"loss/jsd": 0.0,
"loss/logits": 0.20196862574666738,
"step": 2750
},
{
"epoch": 0.069,
"grad_norm": 30.0,
"grad_norm_var": 3.1885416666666666,
"learning_rate": 0.0001,
"loss": 7.5306,
"loss/crossentropy": 2.1047334015369414,
"loss/hidden": 3.527734375,
"loss/jsd": 0.0,
"loss/logits": 0.20494798701256514,
"step": 2760
},
{
"epoch": 0.06925,
"grad_norm": 35.5,
"grad_norm_var": 2.5136418842603423e+18,
"learning_rate": 0.0001,
"loss": 7.5758,
"loss/crossentropy": 1.9750292956829072,
"loss/hidden": 3.49453125,
"loss/jsd": 0.0,
"loss/logits": 0.1999529790133238,
"step": 2770
},
{
"epoch": 0.0695,
"grad_norm": 30.875,
"grad_norm_var": 2.513641880435452e+18,
"learning_rate": 0.0001,
"loss": 7.5901,
"loss/crossentropy": 2.1417267471551895,
"loss/hidden": 3.412109375,
"loss/jsd": 0.0,
"loss/logits": 0.20058641098439695,
"step": 2780
},
{
"epoch": 0.06975,
"grad_norm": 28.25,
"grad_norm_var": 155.97024739583333,
"learning_rate": 0.0001,
"loss": 7.5485,
"loss/crossentropy": 2.1048891723155974,
"loss/hidden": 3.33515625,
"loss/jsd": 0.0,
"loss/logits": 0.19070767909288405,
"step": 2790
},
{
"epoch": 0.07,
"grad_norm": 32.25,
"grad_norm_var": 6.499739583333334,
"learning_rate": 0.0001,
"loss": 7.5996,
"loss/crossentropy": 2.176823168247938,
"loss/hidden": 3.344921875,
"loss/jsd": 0.0,
"loss/logits": 0.20276176873594523,
"step": 2800
},
{
"epoch": 0.07025,
"grad_norm": 29.5,
"grad_norm_var": 4.684309895833334,
"learning_rate": 0.0001,
"loss": 7.4426,
"loss/crossentropy": 2.041793665289879,
"loss/hidden": 3.3265625,
"loss/jsd": 0.0,
"loss/logits": 0.17560106106102466,
"step": 2810
},
{
"epoch": 0.0705,
"grad_norm": 31.75,
"grad_norm_var": 3.005989583333333,
"learning_rate": 0.0001,
"loss": 7.5811,
"loss/crossentropy": 2.1094699330627917,
"loss/hidden": 3.424609375,
"loss/jsd": 0.0,
"loss/logits": 0.19798500649631023,
"step": 2820
},
{
"epoch": 0.07075,
"grad_norm": 33.75,
"grad_norm_var": 4.320572916666666,
"learning_rate": 0.0001,
"loss": 7.5475,
"loss/crossentropy": 2.1311514347791674,
"loss/hidden": 3.344921875,
"loss/jsd": 0.0,
"loss/logits": 0.21825175136327743,
"step": 2830
},
{
"epoch": 0.071,
"grad_norm": 34.75,
"grad_norm_var": 3.7549465266724797e+18,
"learning_rate": 0.0001,
"loss": 7.5823,
"loss/crossentropy": 2.120433983206749,
"loss/hidden": 3.412109375,
"loss/jsd": 0.0,
"loss/logits": 0.20115374326705932,
"step": 2840
},
{
"epoch": 0.07125,
"grad_norm": 29.25,
"grad_norm_var": 3.7549465268743306e+18,
"learning_rate": 0.0001,
"loss": 7.5444,
"loss/crossentropy": 2.1518867701292037,
"loss/hidden": 3.398046875,
"loss/jsd": 0.0,
"loss/logits": 0.19730570819228888,
"step": 2850
},
{
"epoch": 0.0715,
"grad_norm": 31.0,
"grad_norm_var": 2.317122395833333,
"learning_rate": 0.0001,
"loss": 7.6362,
"loss/crossentropy": 2.137280356884003,
"loss/hidden": 3.422265625,
"loss/jsd": 0.0,
"loss/logits": 0.20695031639188527,
"step": 2860
},
{
"epoch": 0.07175,
"grad_norm": 34.0,
"grad_norm_var": 157.31015625,
"learning_rate": 0.0001,
"loss": 7.6643,
"loss/crossentropy": 2.067914080619812,
"loss/hidden": 3.5015625,
"loss/jsd": 0.0,
"loss/logits": 0.22938031535595654,
"step": 2870
},
{
"epoch": 0.072,
"grad_norm": 32.5,
"grad_norm_var": 151.77057291666668,
"learning_rate": 0.0001,
"loss": 7.5762,
"loss/crossentropy": 2.0962916165590286,
"loss/hidden": 3.456640625,
"loss/jsd": 0.0,
"loss/logits": 0.19928023852407933,
"step": 2880
},
{
"epoch": 0.07225,
"grad_norm": 35.75,
"grad_norm_var": 104.17962239583333,
"learning_rate": 0.0001,
"loss": 7.7066,
"loss/crossentropy": 2.1184468276798727,
"loss/hidden": 3.376953125,
"loss/jsd": 0.0,
"loss/logits": 0.1974171632900834,
"step": 2890
},
{
"epoch": 0.0725,
"grad_norm": 32.25,
"grad_norm_var": 116.02493489583334,
"learning_rate": 0.0001,
"loss": 7.6075,
"loss/crossentropy": 2.1381339877843857,
"loss/hidden": 3.434765625,
"loss/jsd": 0.0,
"loss/logits": 0.19655965138226747,
"step": 2900
},
{
"epoch": 0.07275,
"grad_norm": 31.5,
"grad_norm_var": 1.9551432291666666,
"learning_rate": 0.0001,
"loss": 7.5247,
"loss/crossentropy": 2.141963595151901,
"loss/hidden": 3.476953125,
"loss/jsd": 0.0,
"loss/logits": 0.19582534320652484,
"step": 2910
},
{
"epoch": 0.073,
"grad_norm": 31.875,
"grad_norm_var": 1.3014973958333333,
"learning_rate": 0.0001,
"loss": 7.5588,
"loss/crossentropy": 2.077046422660351,
"loss/hidden": 3.465234375,
"loss/jsd": 0.0,
"loss/logits": 0.20551967658102513,
"step": 2920
},
{
"epoch": 0.07325,
"grad_norm": 32.25,
"grad_norm_var": 8.734375,
"learning_rate": 0.0001,
"loss": 7.6195,
"loss/crossentropy": 2.0110410653054713,
"loss/hidden": 3.3984375,
"loss/jsd": 0.0,
"loss/logits": 0.1805876674130559,
"step": 2930
},
{
"epoch": 0.0735,
"grad_norm": 32.25,
"grad_norm_var": 4838.416666666667,
"learning_rate": 0.0001,
"loss": 7.6623,
"loss/crossentropy": 2.1130379527807235,
"loss/hidden": 3.478125,
"loss/jsd": 0.0,
"loss/logits": 0.2008265011012554,
"step": 2940
},
{
"epoch": 0.07375,
"grad_norm": 39.0,
"grad_norm_var": 57.90807291666667,
"learning_rate": 0.0001,
"loss": 7.59,
"loss/crossentropy": 2.1667733818292616,
"loss/hidden": 3.3109375,
"loss/jsd": 0.0,
"loss/logits": 0.19806477334350348,
"step": 2950
},
{
"epoch": 0.074,
"grad_norm": 31.875,
"grad_norm_var": 31.228059895833333,
"learning_rate": 0.0001,
"loss": 7.5131,
"loss/crossentropy": 2.2757086992263793,
"loss/hidden": 3.445703125,
"loss/jsd": 0.0,
"loss/logits": 0.197940625064075,
"step": 2960
},
{
"epoch": 0.07425,
"grad_norm": 34.25,
"grad_norm_var": 2.996809895833333,
"learning_rate": 0.0001,
"loss": 7.5841,
"loss/crossentropy": 2.143115535378456,
"loss/hidden": 3.45625,
"loss/jsd": 0.0,
"loss/logits": 0.20177703239023687,
"step": 2970
},
{
"epoch": 0.0745,
"grad_norm": 31.0,
"grad_norm_var": 3.1546223958333335,
"learning_rate": 0.0001,
"loss": 7.5363,
"loss/crossentropy": 2.2850147604942324,
"loss/hidden": 3.283984375,
"loss/jsd": 0.0,
"loss/logits": 0.19140432458370923,
"step": 2980
},
{
"epoch": 0.07475,
"grad_norm": 32.25,
"grad_norm_var": 7.310416666666667,
"learning_rate": 0.0001,
"loss": 7.5339,
"loss/crossentropy": 2.104042625427246,
"loss/hidden": 3.490625,
"loss/jsd": 0.0,
"loss/logits": 0.211691821180284,
"step": 2990
},
{
"epoch": 0.075,
"grad_norm": 32.5,
"grad_norm_var": 8.506705729166667,
"learning_rate": 0.0001,
"loss": 7.5448,
"loss/crossentropy": 2.2270253866910936,
"loss/hidden": 3.4390625,
"loss/jsd": 0.0,
"loss/logits": 0.20328052509576083,
"step": 3000
},
{
"epoch": 0.07525,
"grad_norm": 46.75,
"grad_norm_var": 17.10390625,
"learning_rate": 0.0001,
"loss": 7.6114,
"loss/crossentropy": 2.2623429775238035,
"loss/hidden": 3.32265625,
"loss/jsd": 0.0,
"loss/logits": 0.19479246698319913,
"step": 3010
},
{
"epoch": 0.0755,
"grad_norm": 31.75,
"grad_norm_var": 18.981184895833334,
"learning_rate": 0.0001,
"loss": 7.5739,
"loss/crossentropy": 2.1012531995773314,
"loss/hidden": 3.454296875,
"loss/jsd": 0.0,
"loss/logits": 0.20767469964921476,
"step": 3020
},
{
"epoch": 0.07575,
"grad_norm": 45.25,
"grad_norm_var": 15.55,
"learning_rate": 0.0001,
"loss": 7.6153,
"loss/crossentropy": 2.090057593584061,
"loss/hidden": 3.468359375,
"loss/jsd": 0.0,
"loss/logits": 0.1894347405061126,
"step": 3030
},
{
"epoch": 0.076,
"grad_norm": 32.0,
"grad_norm_var": 29.878125,
"learning_rate": 0.0001,
"loss": 7.6585,
"loss/crossentropy": 2.200800988078117,
"loss/hidden": 3.4234375,
"loss/jsd": 0.0,
"loss/logits": 0.19270651414990425,
"step": 3040
},
{
"epoch": 0.07625,
"grad_norm": 31.0,
"grad_norm_var": 10.167708333333334,
"learning_rate": 0.0001,
"loss": 7.5696,
"loss/crossentropy": 2.0458697080612183,
"loss/hidden": 3.53046875,
"loss/jsd": 0.0,
"loss/logits": 0.20314501021057368,
"step": 3050
},
{
"epoch": 0.0765,
"grad_norm": 30.375,
"grad_norm_var": 10.0181640625,
"learning_rate": 0.0001,
"loss": 7.5747,
"loss/crossentropy": 2.2118399769067763,
"loss/hidden": 3.482421875,
"loss/jsd": 0.0,
"loss/logits": 0.22158339023590087,
"step": 3060
},
{
"epoch": 0.07675,
"grad_norm": 32.25,
"grad_norm_var": 0.9650390625,
"learning_rate": 0.0001,
"loss": 7.5631,
"loss/crossentropy": 2.1812553733587263,
"loss/hidden": 3.4375,
"loss/jsd": 0.0,
"loss/logits": 0.217013025470078,
"step": 3070
},
{
"epoch": 0.077,
"grad_norm": 32.75,
"grad_norm_var": 0.9171223958333333,
"learning_rate": 0.0001,
"loss": 7.578,
"loss/crossentropy": 2.181536224484444,
"loss/hidden": 3.36953125,
"loss/jsd": 0.0,
"loss/logits": 0.19952490609139203,
"step": 3080
},
{
"epoch": 0.07725,
"grad_norm": 31.5,
"grad_norm_var": 19.015559895833334,
"learning_rate": 0.0001,
"loss": 7.4878,
"loss/crossentropy": 2.122265163064003,
"loss/hidden": 3.4171875,
"loss/jsd": 0.0,
"loss/logits": 0.19191155321896075,
"step": 3090
},
{
"epoch": 0.0775,
"grad_norm": 32.75,
"grad_norm_var": 5.199739583333334,
"learning_rate": 0.0001,
"loss": 7.7471,
"loss/crossentropy": 2.210584083199501,
"loss/hidden": 3.42890625,
"loss/jsd": 0.0,
"loss/logits": 0.21269479542970657,
"step": 3100
},
{
"epoch": 0.07775,
"grad_norm": 33.25,
"grad_norm_var": 4.490625,
"learning_rate": 0.0001,
"loss": 7.6586,
"loss/crossentropy": 2.090894425660372,
"loss/hidden": 3.46328125,
"loss/jsd": 0.0,
"loss/logits": 0.19853799045085907,
"step": 3110
},
{
"epoch": 0.078,
"grad_norm": 31.375,
"grad_norm_var": 27.370833333333334,
"learning_rate": 0.0001,
"loss": 7.5818,
"loss/crossentropy": 2.044330509006977,
"loss/hidden": 3.50078125,
"loss/jsd": 0.0,
"loss/logits": 0.20974230151623488,
"step": 3120
},
{
"epoch": 0.07825,
"grad_norm": 31.75,
"grad_norm_var": 6.688997395833334,
"learning_rate": 0.0001,
"loss": 7.654,
"loss/crossentropy": 2.1380004197359086,
"loss/hidden": 3.3875,
"loss/jsd": 0.0,
"loss/logits": 0.20455153118818997,
"step": 3130
},
{
"epoch": 0.0785,
"grad_norm": 29.875,
"grad_norm_var": 7.715625,
"learning_rate": 0.0001,
"loss": 7.6551,
"loss/crossentropy": 2.1576705113053323,
"loss/hidden": 3.4484375,
"loss/jsd": 0.0,
"loss/logits": 0.20728315506130457,
"step": 3140
},
{
"epoch": 0.07875,
"grad_norm": 30.875,
"grad_norm_var": 3.2864583333333335,
"learning_rate": 0.0001,
"loss": 7.5478,
"loss/crossentropy": 2.233546493947506,
"loss/hidden": 3.424609375,
"loss/jsd": 0.0,
"loss/logits": 0.20845879297703504,
"step": 3150
},
{
"epoch": 0.079,
"grad_norm": 34.25,
"grad_norm_var": 15.4728515625,
"learning_rate": 0.0001,
"loss": 7.6264,
"loss/crossentropy": 2.0873878210783006,
"loss/hidden": 3.355859375,
"loss/jsd": 0.0,
"loss/logits": 0.19576258175075054,
"step": 3160
},
{
"epoch": 0.07925,
"grad_norm": 29.625,
"grad_norm_var": 3.12265625,
"learning_rate": 0.0001,
"loss": 7.5192,
"loss/crossentropy": 2.132347696274519,
"loss/hidden": 3.345703125,
"loss/jsd": 0.0,
"loss/logits": 0.1958464809693396,
"step": 3170
},
{
"epoch": 0.0795,
"grad_norm": 33.0,
"grad_norm_var": 3.370247395833333,
"learning_rate": 0.0001,
"loss": 7.5169,
"loss/crossentropy": 2.2280534476041796,
"loss/hidden": 3.35546875,
"loss/jsd": 0.0,
"loss/logits": 0.19412651136517525,
"step": 3180
},
{
"epoch": 0.07975,
"grad_norm": 28.25,
"grad_norm_var": 4.989518229166666,
"learning_rate": 0.0001,
"loss": 7.6576,
"loss/crossentropy": 2.138143754005432,
"loss/hidden": 3.58828125,
"loss/jsd": 0.0,
"loss/logits": 0.23302078600972892,
"step": 3190
},
{
"epoch": 0.08,
"grad_norm": 30.375,
"grad_norm_var": 6.858072916666667,
"learning_rate": 0.0001,
"loss": 7.5707,
"loss/crossentropy": 2.1512865126132965,
"loss/hidden": 3.424609375,
"loss/jsd": 0.0,
"loss/logits": 0.1957532402127981,
"step": 3200
},
{
"epoch": 0.08025,
"grad_norm": 39.0,
"grad_norm_var": 34.83515625,
"learning_rate": 0.0001,
"loss": 7.6814,
"loss/crossentropy": 2.113873428106308,
"loss/hidden": 3.52890625,
"loss/jsd": 0.0,
"loss/logits": 0.22486987188458443,
"step": 3210
},
{
"epoch": 0.0805,
"grad_norm": 30.625,
"grad_norm_var": 33.244205729166666,
"learning_rate": 0.0001,
"loss": 7.6542,
"loss/crossentropy": 2.153007471561432,
"loss/hidden": 3.5078125,
"loss/jsd": 0.0,
"loss/logits": 0.20893471594899893,
"step": 3220
},
{
"epoch": 0.08075,
"grad_norm": 31.25,
"grad_norm_var": 619.4145182291667,
"learning_rate": 0.0001,
"loss": 7.5091,
"loss/crossentropy": 2.169908273220062,
"loss/hidden": 3.456640625,
"loss/jsd": 0.0,
"loss/logits": 0.20535510070621968,
"step": 3230
},
{
"epoch": 0.081,
"grad_norm": 30.0,
"grad_norm_var": 599.9247395833333,
"learning_rate": 0.0001,
"loss": 7.5568,
"loss/crossentropy": 2.048283484578133,
"loss/hidden": 3.593359375,
"loss/jsd": 0.0,
"loss/logits": 0.21250182073563337,
"step": 3240
},
{
"epoch": 0.08125,
"grad_norm": 30.5,
"grad_norm_var": 2.982291666666667,
"learning_rate": 0.0001,
"loss": 7.5569,
"loss/crossentropy": 2.0828478574752807,
"loss/hidden": 3.496484375,
"loss/jsd": 0.0,
"loss/logits": 0.20137296654284,
"step": 3250
},
{
"epoch": 0.0815,
"grad_norm": 28.25,
"grad_norm_var": 17.9900390625,
"learning_rate": 0.0001,
"loss": 7.5869,
"loss/crossentropy": 2.28429861664772,
"loss/hidden": 3.375,
"loss/jsd": 0.0,
"loss/logits": 0.21088685300201176,
"step": 3260
},
{
"epoch": 0.08175,
"grad_norm": 32.25,
"grad_norm_var": 2.5587890625,
"learning_rate": 0.0001,
"loss": 7.6037,
"loss/crossentropy": 2.1336950808763504,
"loss/hidden": 3.431640625,
"loss/jsd": 0.0,
"loss/logits": 0.20057348478585482,
"step": 3270
},
{
"epoch": 0.082,
"grad_norm": 33.75,
"grad_norm_var": 34.57076822916667,
"learning_rate": 0.0001,
"loss": 7.6129,
"loss/crossentropy": 2.0662791609764097,
"loss/hidden": 3.408203125,
"loss/jsd": 0.0,
"loss/logits": 0.22612145096063613,
"step": 3280
},
{
"epoch": 0.08225,
"grad_norm": 32.75,
"grad_norm_var": 18.8181640625,
"learning_rate": 0.0001,
"loss": 7.5469,
"loss/crossentropy": 2.069541847705841,
"loss/hidden": 3.376171875,
"loss/jsd": 0.0,
"loss/logits": 0.1844556663185358,
"step": 3290
},
{
"epoch": 0.0825,
"grad_norm": 29.375,
"grad_norm_var": 3.0712890625,
"learning_rate": 0.0001,
"loss": 7.6025,
"loss/crossentropy": 2.2499852567911147,
"loss/hidden": 3.451953125,
"loss/jsd": 0.0,
"loss/logits": 0.21941267363727093,
"step": 3300
},
{
"epoch": 0.08275,
"grad_norm": 33.0,
"grad_norm_var": 1.29140625,
"learning_rate": 0.0001,
"loss": 7.6334,
"loss/crossentropy": 2.1277933359146117,
"loss/hidden": 3.437109375,
"loss/jsd": 0.0,
"loss/logits": 0.19795072823762894,
"step": 3310
},
{
"epoch": 0.083,
"grad_norm": 34.0,
"grad_norm_var": 3.41015625,
"learning_rate": 0.0001,
"loss": 7.6326,
"loss/crossentropy": 2.214772176742554,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.20066177062690257,
"step": 3320
},
{
"epoch": 0.08325,
"grad_norm": 29.875,
"grad_norm_var": 18.768489583333334,
"learning_rate": 0.0001,
"loss": 7.569,
"loss/crossentropy": 2.093920087814331,
"loss/hidden": 3.4375,
"loss/jsd": 0.0,
"loss/logits": 0.2021244278177619,
"step": 3330
},
{
"epoch": 0.0835,
"grad_norm": 30.25,
"grad_norm_var": 18.695833333333333,
"learning_rate": 0.0001,
"loss": 7.5041,
"loss/crossentropy": 2.0556397035717966,
"loss/hidden": 3.34765625,
"loss/jsd": 0.0,
"loss/logits": 0.17018966227769852,
"step": 3340
},
{
"epoch": 0.08375,
"grad_norm": 28.875,
"grad_norm_var": 2.854622395833333,
"learning_rate": 0.0001,
"loss": 7.5837,
"loss/crossentropy": 2.0227382972836496,
"loss/hidden": 3.4890625,
"loss/jsd": 0.0,
"loss/logits": 0.21123163159936667,
"step": 3350
},
{
"epoch": 0.084,
"grad_norm": 33.5,
"grad_norm_var": 2.894791666666667,
"learning_rate": 0.0001,
"loss": 7.5374,
"loss/crossentropy": 2.146658593416214,
"loss/hidden": 3.3515625,
"loss/jsd": 0.0,
"loss/logits": 0.18999559991061687,
"step": 3360
},
{
"epoch": 0.08425,
"grad_norm": 32.5,
"grad_norm_var": 2.5759765625,
"learning_rate": 0.0001,
"loss": 7.6197,
"loss/crossentropy": 1.9798088841140271,
"loss/hidden": 3.569921875,
"loss/jsd": 0.0,
"loss/logits": 0.20031734639778734,
"step": 3370
},
{
"epoch": 0.0845,
"grad_norm": 30.5,
"grad_norm_var": 1.6436848958333334,
"learning_rate": 0.0001,
"loss": 7.5738,
"loss/crossentropy": 2.1442878276109694,
"loss/hidden": 3.458984375,
"loss/jsd": 0.0,
"loss/logits": 0.20664554908871652,
"step": 3380
},
{
"epoch": 0.08475,
"grad_norm": 29.25,
"grad_norm_var": 1.9875,
"learning_rate": 0.0001,
"loss": 7.4965,
"loss/crossentropy": 2.150037130713463,
"loss/hidden": 3.3625,
"loss/jsd": 0.0,
"loss/logits": 0.19511928483843805,
"step": 3390
},
{
"epoch": 0.085,
"grad_norm": 32.25,
"grad_norm_var": 2.2122395833333335,
"learning_rate": 0.0001,
"loss": 7.5151,
"loss/crossentropy": 2.0501646161079408,
"loss/hidden": 3.425390625,
"loss/jsd": 0.0,
"loss/logits": 0.19615819547325372,
"step": 3400
},
{
"epoch": 0.08525,
"grad_norm": 33.75,
"grad_norm_var": 2.720833333333333,
"learning_rate": 0.0001,
"loss": 7.6432,
"loss/crossentropy": 2.2328430742025374,
"loss/hidden": 3.41796875,
"loss/jsd": 0.0,
"loss/logits": 0.19816372059285642,
"step": 3410
},
{
"epoch": 0.0855,
"grad_norm": 31.375,
"grad_norm_var": 7.519791666666666,
"learning_rate": 0.0001,
"loss": 7.665,
"loss/crossentropy": 2.1567111521959306,
"loss/hidden": 3.500390625,
"loss/jsd": 0.0,
"loss/logits": 0.19839788805693387,
"step": 3420
},
{
"epoch": 0.08575,
"grad_norm": 40.25,
"grad_norm_var": 7.808072916666666,
"learning_rate": 0.0001,
"loss": 7.5919,
"loss/crossentropy": 2.2465982705354692,
"loss/hidden": 3.46328125,
"loss/jsd": 0.0,
"loss/logits": 0.21547308284789324,
"step": 3430
},
{
"epoch": 0.086,
"grad_norm": 33.0,
"grad_norm_var": 8.09765625,
"learning_rate": 0.0001,
"loss": 7.6239,
"loss/crossentropy": 2.1680932968854902,
"loss/hidden": 3.391015625,
"loss/jsd": 0.0,
"loss/logits": 0.21484404131770135,
"step": 3440
},
{
"epoch": 0.08625,
"grad_norm": 28.625,
"grad_norm_var": 3.070572916666667,
"learning_rate": 0.0001,
"loss": 7.4802,
"loss/crossentropy": 2.050510385632515,
"loss/hidden": 3.421484375,
"loss/jsd": 0.0,
"loss/logits": 0.18613745234906673,
"step": 3450
},
{
"epoch": 0.0865,
"grad_norm": 33.5,
"grad_norm_var": 2.2143229166666667,
"learning_rate": 0.0001,
"loss": 7.5311,
"loss/crossentropy": 2.0699805982410906,
"loss/hidden": 3.40390625,
"loss/jsd": 0.0,
"loss/logits": 0.1896037317812443,
"step": 3460
},
{
"epoch": 0.08675,
"grad_norm": 31.875,
"grad_norm_var": 5.651822916666666,
"learning_rate": 0.0001,
"loss": 7.596,
"loss/crossentropy": 2.0576356425881386,
"loss/hidden": 3.522265625,
"loss/jsd": 0.0,
"loss/logits": 0.20867941789329053,
"step": 3470
},
{
"epoch": 0.087,
"grad_norm": 29.625,
"grad_norm_var": 5.998372395833333,
"learning_rate": 0.0001,
"loss": 7.4738,
"loss/crossentropy": 2.2489717990159988,
"loss/hidden": 3.3203125,
"loss/jsd": 0.0,
"loss/logits": 0.18595699593424797,
"step": 3480
},
{
"epoch": 0.08725,
"grad_norm": 28.5,
"grad_norm_var": 5.2056640625,
"learning_rate": 0.0001,
"loss": 7.5246,
"loss/crossentropy": 2.178911143541336,
"loss/hidden": 3.36796875,
"loss/jsd": 0.0,
"loss/logits": 0.204429741948843,
"step": 3490
},
{
"epoch": 0.0875,
"grad_norm": 28.25,
"grad_norm_var": 1.8134765625,
"learning_rate": 0.0001,
"loss": 7.4901,
"loss/crossentropy": 2.1816289871931076,
"loss/hidden": 3.425390625,
"loss/jsd": 0.0,
"loss/logits": 0.20369651466608046,
"step": 3500
},
{
"epoch": 0.08775,
"grad_norm": 32.25,
"grad_norm_var": 59.895572916666666,
"learning_rate": 0.0001,
"loss": 7.6056,
"loss/crossentropy": 2.124045217037201,
"loss/hidden": 3.562890625,
"loss/jsd": 0.0,
"loss/logits": 0.2361205333843827,
"step": 3510
},
{
"epoch": 0.088,
"grad_norm": 31.375,
"grad_norm_var": 58.15358072916667,
"learning_rate": 0.0001,
"loss": 7.6185,
"loss/crossentropy": 2.0576027542352677,
"loss/hidden": 3.408984375,
"loss/jsd": 0.0,
"loss/logits": 0.19163726801052688,
"step": 3520
},
{
"epoch": 0.08825,
"grad_norm": 30.875,
"grad_norm_var": 19.622330729166666,
"learning_rate": 0.0001,
"loss": 7.608,
"loss/crossentropy": 2.2166375398635862,
"loss/hidden": 3.430078125,
"loss/jsd": 0.0,
"loss/logits": 0.20556784830987454,
"step": 3530
},
{
"epoch": 0.0885,
"grad_norm": 33.25,
"grad_norm_var": 3.76875,
"learning_rate": 0.0001,
"loss": 7.5655,
"loss/crossentropy": 2.215288892388344,
"loss/hidden": 3.43515625,
"loss/jsd": 0.0,
"loss/logits": 0.21560241151601076,
"step": 3540
},
{
"epoch": 0.08875,
"grad_norm": 30.0,
"grad_norm_var": 3.316666666666667,
"learning_rate": 0.0001,
"loss": 7.4909,
"loss/crossentropy": 2.0867604553699493,
"loss/hidden": 3.391015625,
"loss/jsd": 0.0,
"loss/logits": 0.1992955395951867,
"step": 3550
},
{
"epoch": 0.089,
"grad_norm": 33.25,
"grad_norm_var": 286.81555989583336,
"learning_rate": 0.0001,
"loss": 7.716,
"loss/crossentropy": 2.2209325939416886,
"loss/hidden": 3.46484375,
"loss/jsd": 0.0,
"loss/logits": 0.19783683270215988,
"step": 3560
},
{
"epoch": 0.08925,
"grad_norm": 34.75,
"grad_norm_var": 286.8197916666667,
"learning_rate": 0.0001,
"loss": 7.5905,
"loss/crossentropy": 2.098256954550743,
"loss/hidden": 3.309765625,
"loss/jsd": 0.0,
"loss/logits": 0.184653827175498,
"step": 3570
},
{
"epoch": 0.0895,
"grad_norm": 31.125,
"grad_norm_var": 5.868489583333333,
"learning_rate": 0.0001,
"loss": 7.625,
"loss/crossentropy": 2.0555992782115937,
"loss/hidden": 3.45546875,
"loss/jsd": 0.0,
"loss/logits": 0.19005871675908564,
"step": 3580
},
{
"epoch": 0.08975,
"grad_norm": 37.0,
"grad_norm_var": 7.585416666666666,
"learning_rate": 0.0001,
"loss": 7.542,
"loss/crossentropy": 2.0350914053618907,
"loss/hidden": 3.396484375,
"loss/jsd": 0.0,
"loss/logits": 0.192917075753212,
"step": 3590
},
{
"epoch": 0.09,
"grad_norm": 34.25,
"grad_norm_var": 26.8322265625,
"learning_rate": 0.0001,
"loss": 7.6028,
"loss/crossentropy": 2.249291920661926,
"loss/hidden": 3.33125,
"loss/jsd": 0.0,
"loss/logits": 0.20490463990718127,
"step": 3600
},
{
"epoch": 0.09025,
"grad_norm": 30.375,
"grad_norm_var": 2.9457682291666667,
"learning_rate": 0.0001,
"loss": 7.5738,
"loss/crossentropy": 2.201851597428322,
"loss/hidden": 3.434765625,
"loss/jsd": 0.0,
"loss/logits": 0.20158183835446836,
"step": 3610
},
{
"epoch": 0.0905,
"grad_norm": 30.125,
"grad_norm_var": 14.446809895833333,
"learning_rate": 0.0001,
"loss": 7.5562,
"loss/crossentropy": 2.188534340262413,
"loss/hidden": 3.491796875,
"loss/jsd": 0.0,
"loss/logits": 0.22296689171344042,
"step": 3620
},
{
"epoch": 0.09075,
"grad_norm": 32.0,
"grad_norm_var": 15.792643229166666,
"learning_rate": 0.0001,
"loss": 7.5509,
"loss/crossentropy": 2.1134337186813354,
"loss/hidden": 3.4078125,
"loss/jsd": 0.0,
"loss/logits": 0.19732800796627997,
"step": 3630
},
{
"epoch": 0.091,
"grad_norm": 29.875,
"grad_norm_var": 14.366080729166667,
"learning_rate": 0.0001,
"loss": 7.6349,
"loss/crossentropy": 2.0555363953113557,
"loss/hidden": 3.4328125,
"loss/jsd": 0.0,
"loss/logits": 0.19681458938866853,
"step": 3640
},
{
"epoch": 0.09125,
"grad_norm": 33.25,
"grad_norm_var": 13.740559895833334,
"learning_rate": 0.0001,
"loss": 7.6513,
"loss/crossentropy": 2.2402344048023224,
"loss/hidden": 3.46171875,
"loss/jsd": 0.0,
"loss/logits": 0.20878477580845356,
"step": 3650
},
{
"epoch": 0.0915,
"grad_norm": 32.5,
"grad_norm_var": 30.0150390625,
"learning_rate": 0.0001,
"loss": 7.5723,
"loss/crossentropy": 2.14640394449234,
"loss/hidden": 3.453515625,
"loss/jsd": 0.0,
"loss/logits": 0.20020358953624964,
"step": 3660
},
{
"epoch": 0.09175,
"grad_norm": 36.0,
"grad_norm_var": 49.2619140625,
"learning_rate": 0.0001,
"loss": 7.5782,
"loss/crossentropy": 2.2063133299350737,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.19907979741692544,
"step": 3670
},
{
"epoch": 0.092,
"grad_norm": 32.0,
"grad_norm_var": 15.4400390625,
"learning_rate": 0.0001,
"loss": 7.5501,
"loss/crossentropy": 2.1933626160025597,
"loss/hidden": 3.332421875,
"loss/jsd": 0.0,
"loss/logits": 0.19394716806709766,
"step": 3680
},
{
"epoch": 0.09225,
"grad_norm": 28.375,
"grad_norm_var": 13.159375,
"learning_rate": 0.0001,
"loss": 7.5258,
"loss/crossentropy": 2.0778674989938737,
"loss/hidden": 3.344921875,
"loss/jsd": 0.0,
"loss/logits": 0.19343881569802762,
"step": 3690
},
{
"epoch": 0.0925,
"grad_norm": 32.5,
"grad_norm_var": 15.096809895833333,
"learning_rate": 0.0001,
"loss": 7.4597,
"loss/crossentropy": 2.0565507017076015,
"loss/hidden": 3.389453125,
"loss/jsd": 0.0,
"loss/logits": 0.19122497290372847,
"step": 3700
},
{
"epoch": 0.09275,
"grad_norm": 29.625,
"grad_norm_var": 2.729622395833333,
"learning_rate": 0.0001,
"loss": 7.5714,
"loss/crossentropy": 2.082234078645706,
"loss/hidden": 3.437109375,
"loss/jsd": 0.0,
"loss/logits": 0.2102882768958807,
"step": 3710
},
{
"epoch": 0.093,
"grad_norm": 30.0,
"grad_norm_var": 2.6254557291666667,
"learning_rate": 0.0001,
"loss": 7.6155,
"loss/crossentropy": 2.2312920093536377,
"loss/hidden": 3.34609375,
"loss/jsd": 0.0,
"loss/logits": 0.18575988691300155,
"step": 3720
},
{
"epoch": 0.09325,
"grad_norm": 33.25,
"grad_norm_var": 1.6181640625,
"learning_rate": 0.0001,
"loss": 7.5344,
"loss/crossentropy": 2.2397233605384828,
"loss/hidden": 3.343359375,
"loss/jsd": 0.0,
"loss/logits": 0.20670964010059834,
"step": 3730
},
{
"epoch": 0.0935,
"grad_norm": 33.5,
"grad_norm_var": 3.3124348958333334,
"learning_rate": 0.0001,
"loss": 7.5839,
"loss/crossentropy": 2.2065307170152666,
"loss/hidden": 3.348046875,
"loss/jsd": 0.0,
"loss/logits": 0.20138480551540852,
"step": 3740
},
{
"epoch": 0.09375,
"grad_norm": 36.25,
"grad_norm_var": 9.07265625,
"learning_rate": 0.0001,
"loss": 7.6468,
"loss/crossentropy": 2.192644628882408,
"loss/hidden": 3.420703125,
"loss/jsd": 0.0,
"loss/logits": 0.20751077253371478,
"step": 3750
},
{
"epoch": 0.094,
"grad_norm": 32.25,
"grad_norm_var": 18.11015625,
"learning_rate": 0.0001,
"loss": 7.731,
"loss/crossentropy": 2.205154886841774,
"loss/hidden": 3.38984375,
"loss/jsd": 0.0,
"loss/logits": 0.20712865255773066,
"step": 3760
},
{
"epoch": 0.09425,
"grad_norm": 30.75,
"grad_norm_var": 30.259309895833333,
"learning_rate": 0.0001,
"loss": 7.4267,
"loss/crossentropy": 2.0174816213548183,
"loss/hidden": 3.363671875,
"loss/jsd": 0.0,
"loss/logits": 0.1771852731704712,
"step": 3770
},
{
"epoch": 0.0945,
"grad_norm": 32.75,
"grad_norm_var": 2.687239583333333,
"learning_rate": 0.0001,
"loss": 7.5078,
"loss/crossentropy": 2.142752841114998,
"loss/hidden": 3.384375,
"loss/jsd": 0.0,
"loss/logits": 0.2145843595266342,
"step": 3780
},
{
"epoch": 0.09475,
"grad_norm": 41.5,
"grad_norm_var": 28.512239583333333,
"learning_rate": 0.0001,
"loss": 7.5586,
"loss/crossentropy": 2.2184203058481216,
"loss/hidden": 3.433203125,
"loss/jsd": 0.0,
"loss/logits": 0.2169014524668455,
"step": 3790
},
{
"epoch": 0.095,
"grad_norm": 32.5,
"grad_norm_var": 29.637955729166666,
"learning_rate": 0.0001,
"loss": 7.5796,
"loss/crossentropy": 2.0424424298107624,
"loss/hidden": 3.440234375,
"loss/jsd": 0.0,
"loss/logits": 0.19471338465809823,
"step": 3800
},
{
"epoch": 0.09525,
"grad_norm": 31.25,
"grad_norm_var": 2.846809895833333,
"learning_rate": 0.0001,
"loss": 7.4697,
"loss/crossentropy": 2.2178969264030455,
"loss/hidden": 3.215625,
"loss/jsd": 0.0,
"loss/logits": 0.18599100317806005,
"step": 3810
},
{
"epoch": 0.0955,
"grad_norm": 35.75,
"grad_norm_var": 5.571809895833334,
"learning_rate": 0.0001,
"loss": 7.5899,
"loss/crossentropy": 2.147325333952904,
"loss/hidden": 3.373046875,
"loss/jsd": 0.0,
"loss/logits": 0.19797458127141,
"step": 3820
},
{
"epoch": 0.09575,
"grad_norm": 31.0,
"grad_norm_var": 19.242708333333333,
"learning_rate": 0.0001,
"loss": 7.4524,
"loss/crossentropy": 2.222689136862755,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.20618323031812907,
"step": 3830
},
{
"epoch": 0.096,
"grad_norm": 31.125,
"grad_norm_var": 6.659309895833333,
"learning_rate": 0.0001,
"loss": 7.5371,
"loss/crossentropy": 2.045217160880566,
"loss/hidden": 3.50703125,
"loss/jsd": 0.0,
"loss/logits": 0.21105091590434313,
"step": 3840
},
{
"epoch": 0.09625,
"grad_norm": 29.5,
"grad_norm_var": 3.6624348958333335,
"learning_rate": 0.0001,
"loss": 7.4832,
"loss/crossentropy": 2.1246922612190247,
"loss/hidden": 3.37109375,
"loss/jsd": 0.0,
"loss/logits": 0.19520843997597695,
"step": 3850
},
{
"epoch": 0.0965,
"grad_norm": 32.25,
"grad_norm_var": 3.840625,
"learning_rate": 0.0001,
"loss": 7.5247,
"loss/crossentropy": 2.1101455599069596,
"loss/hidden": 3.391796875,
"loss/jsd": 0.0,
"loss/logits": 0.19655006285756826,
"step": 3860
},
{
"epoch": 0.09675,
"grad_norm": 31.375,
"grad_norm_var": 4.0759765625,
"learning_rate": 0.0001,
"loss": 7.7123,
"loss/crossentropy": 2.1925098091363906,
"loss/hidden": 3.57265625,
"loss/jsd": 0.0,
"loss/logits": 0.21303071565926074,
"step": 3870
},
{
"epoch": 0.097,
"grad_norm": 32.25,
"grad_norm_var": 2.940559895833333,
"learning_rate": 0.0001,
"loss": 7.6194,
"loss/crossentropy": 2.0254321210086346,
"loss/hidden": 3.504296875,
"loss/jsd": 0.0,
"loss/logits": 0.21549067068845035,
"step": 3880
},
{
"epoch": 0.09725,
"grad_norm": 32.75,
"grad_norm_var": 1.6895833333333334,
"learning_rate": 0.0001,
"loss": 7.5341,
"loss/crossentropy": 2.18448192179203,
"loss/hidden": 3.331640625,
"loss/jsd": 0.0,
"loss/logits": 0.18761022239923478,
"step": 3890
},
{
"epoch": 0.0975,
"grad_norm": 32.5,
"grad_norm_var": 14.326822916666666,
"learning_rate": 0.0001,
"loss": 7.6378,
"loss/crossentropy": 2.0941652059555054,
"loss/hidden": 3.45703125,
"loss/jsd": 0.0,
"loss/logits": 0.20138136427849532,
"step": 3900
},
{
"epoch": 0.09775,
"grad_norm": 31.875,
"grad_norm_var": 2.1186848958333333,
"learning_rate": 0.0001,
"loss": 7.6588,
"loss/crossentropy": 2.09155390933156,
"loss/hidden": 3.487109375,
"loss/jsd": 0.0,
"loss/logits": 0.2071656842716038,
"step": 3910
},
{
"epoch": 0.098,
"grad_norm": 32.5,
"grad_norm_var": 3.729622395833333,
"learning_rate": 0.0001,
"loss": 7.6663,
"loss/crossentropy": 2.1858886659145353,
"loss/hidden": 3.41796875,
"loss/jsd": 0.0,
"loss/logits": 0.20006632767617702,
"step": 3920
},
{
"epoch": 0.09825,
"grad_norm": 40.25,
"grad_norm_var": 15.908072916666667,
"learning_rate": 0.0001,
"loss": 7.5567,
"loss/crossentropy": 2.2354005187749864,
"loss/hidden": 3.355859375,
"loss/jsd": 0.0,
"loss/logits": 0.20157534964382648,
"step": 3930
},
{
"epoch": 0.0985,
"grad_norm": 31.625,
"grad_norm_var": 17.745768229166668,
"learning_rate": 0.0001,
"loss": 7.5189,
"loss/crossentropy": 2.015377716720104,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.1868499366566539,
"step": 3940
},
{
"epoch": 0.09875,
"grad_norm": 33.0,
"grad_norm_var": 7.66640625,
"learning_rate": 0.0001,
"loss": 7.533,
"loss/crossentropy": 2.0591939449310304,
"loss/hidden": 3.385546875,
"loss/jsd": 0.0,
"loss/logits": 0.20258171651512386,
"step": 3950
},
{
"epoch": 0.099,
"grad_norm": 41.25,
"grad_norm_var": 3.3442041663272433e+18,
"learning_rate": 0.0001,
"loss": 7.4832,
"loss/crossentropy": 2.1117863088846205,
"loss/hidden": 3.559375,
"loss/jsd": 0.0,
"loss/logits": 0.2428264247253537,
"step": 3960
},
{
"epoch": 0.09925,
"grad_norm": 30.875,
"grad_norm_var": 3.3442041639803904e+18,
"learning_rate": 0.0001,
"loss": 7.6253,
"loss/crossentropy": 2.1176558315753935,
"loss/hidden": 3.55546875,
"loss/jsd": 0.0,
"loss/logits": 0.22726768516004087,
"step": 3970
},
{
"epoch": 0.0995,
"grad_norm": 31.25,
"grad_norm_var": 19.5384765625,
"learning_rate": 0.0001,
"loss": 7.4413,
"loss/crossentropy": 2.088257111608982,
"loss/hidden": 3.34609375,
"loss/jsd": 0.0,
"loss/logits": 0.19193989606574177,
"step": 3980
},
{
"epoch": 0.09975,
"grad_norm": 32.75,
"grad_norm_var": 2.28125,
"learning_rate": 0.0001,
"loss": 7.6017,
"loss/crossentropy": 2.134918417036533,
"loss/hidden": 3.411328125,
"loss/jsd": 0.0,
"loss/logits": 0.19602251183241606,
"step": 3990
},
{
"epoch": 0.1,
"grad_norm": 32.25,
"grad_norm_var": 6.172916666666667,
"learning_rate": 0.0001,
"loss": 7.5138,
"loss/crossentropy": 2.1410045489668845,
"loss/hidden": 3.4515625,
"loss/jsd": 0.0,
"loss/logits": 0.20588791109621524,
"step": 4000
},
{
"epoch": 0.10025,
"grad_norm": 30.0,
"grad_norm_var": 11.4306640625,
"learning_rate": 0.0001,
"loss": 7.5549,
"loss/crossentropy": 2.241489386558533,
"loss/hidden": 3.343359375,
"loss/jsd": 0.0,
"loss/logits": 0.20121449399739505,
"step": 4010
},
{
"epoch": 0.1005,
"grad_norm": 33.75,
"grad_norm_var": 5.220247395833334,
"learning_rate": 0.0001,
"loss": 7.6037,
"loss/crossentropy": 2.177129751443863,
"loss/hidden": 3.4484375,
"loss/jsd": 0.0,
"loss/logits": 0.195396139472723,
"step": 4020
},
{
"epoch": 0.10075,
"grad_norm": 31.375,
"grad_norm_var": 25.279622395833332,
"learning_rate": 0.0001,
"loss": 7.5495,
"loss/crossentropy": 2.1074128076434135,
"loss/hidden": 3.40859375,
"loss/jsd": 0.0,
"loss/logits": 0.18998505976051092,
"step": 4030
},
{
"epoch": 0.101,
"grad_norm": 30.375,
"grad_norm_var": 12.564518229166667,
"learning_rate": 0.0001,
"loss": 7.4628,
"loss/crossentropy": 2.031256873905659,
"loss/hidden": 3.369921875,
"loss/jsd": 0.0,
"loss/logits": 0.19107761420309544,
"step": 4040
},
{
"epoch": 0.10125,
"grad_norm": 29.25,
"grad_norm_var": 8.242643229166667,
"learning_rate": 0.0001,
"loss": 7.5906,
"loss/crossentropy": 2.2593255966901777,
"loss/hidden": 3.33515625,
"loss/jsd": 0.0,
"loss/logits": 0.20071447864174843,
"step": 4050
},
{
"epoch": 0.1015,
"grad_norm": 28.875,
"grad_norm_var": 7.72890625,
"learning_rate": 0.0001,
"loss": 7.5008,
"loss/crossentropy": 2.1623566120862963,
"loss/hidden": 3.442578125,
"loss/jsd": 0.0,
"loss/logits": 0.20114662442356349,
"step": 4060
},
{
"epoch": 0.10175,
"grad_norm": 28.75,
"grad_norm_var": 2.5291015625,
"learning_rate": 0.0001,
"loss": 7.4901,
"loss/crossentropy": 2.1303680926561355,
"loss/hidden": 3.458984375,
"loss/jsd": 0.0,
"loss/logits": 0.19380100946873427,
"step": 4070
},
{
"epoch": 0.102,
"grad_norm": 33.25,
"grad_norm_var": 1.9583333333333333,
"learning_rate": 0.0001,
"loss": 7.6067,
"loss/crossentropy": 2.180470046401024,
"loss/hidden": 3.40390625,
"loss/jsd": 0.0,
"loss/logits": 0.20080227889120578,
"step": 4080
},
{
"epoch": 0.10225,
"grad_norm": 31.0,
"grad_norm_var": 43.7791015625,
"learning_rate": 0.0001,
"loss": 7.3949,
"loss/crossentropy": 2.04951853454113,
"loss/hidden": 3.391796875,
"loss/jsd": 0.0,
"loss/logits": 0.1938589910045266,
"step": 4090
},
{
"epoch": 0.1025,
"grad_norm": 31.5,
"grad_norm_var": 40.233333333333334,
"learning_rate": 0.0001,
"loss": 7.5,
"loss/crossentropy": 1.9436089858412742,
"loss/hidden": 3.387890625,
"loss/jsd": 0.0,
"loss/logits": 0.1772780598141253,
"step": 4100
},
{
"epoch": 0.10275,
"grad_norm": 30.75,
"grad_norm_var": 6.14765625,
"learning_rate": 0.0001,
"loss": 7.5926,
"loss/crossentropy": 2.1081591993570328,
"loss/hidden": 3.436328125,
"loss/jsd": 0.0,
"loss/logits": 0.20672952029854058,
"step": 4110
},
{
"epoch": 0.103,
"grad_norm": 31.875,
"grad_norm_var": 11.225455729166667,
"learning_rate": 0.0001,
"loss": 7.6334,
"loss/crossentropy": 2.0973087579011915,
"loss/hidden": 3.393359375,
"loss/jsd": 0.0,
"loss/logits": 0.20208985283970832,
"step": 4120
},
{
"epoch": 0.10325,
"grad_norm": 29.375,
"grad_norm_var": 25.839583333333334,
"learning_rate": 0.0001,
"loss": 7.4644,
"loss/crossentropy": 2.205477836728096,
"loss/hidden": 3.3546875,
"loss/jsd": 0.0,
"loss/logits": 0.19515758529305458,
"step": 4130
},
{
"epoch": 0.1035,
"grad_norm": 31.25,
"grad_norm_var": 3.6567057291666667,
"learning_rate": 0.0001,
"loss": 7.4746,
"loss/crossentropy": 2.1042226657271383,
"loss/hidden": 3.378515625,
"loss/jsd": 0.0,
"loss/logits": 0.1850555408746004,
"step": 4140
},
{
"epoch": 0.10375,
"grad_norm": 29.875,
"grad_norm_var": 2.5952473958333333,
"learning_rate": 0.0001,
"loss": 7.5805,
"loss/crossentropy": 2.1080187141895292,
"loss/hidden": 3.481640625,
"loss/jsd": 0.0,
"loss/logits": 0.20642356667667627,
"step": 4150
},
{
"epoch": 0.104,
"grad_norm": 34.75,
"grad_norm_var": 12.11875,
"learning_rate": 0.0001,
"loss": 7.6864,
"loss/crossentropy": 2.1868012815713884,
"loss/hidden": 3.444140625,
"loss/jsd": 0.0,
"loss/logits": 0.19968394786119462,
"step": 4160
},
{
"epoch": 0.10425,
"grad_norm": 32.25,
"grad_norm_var": 11.406184895833333,
"learning_rate": 0.0001,
"loss": 7.6461,
"loss/crossentropy": 2.0963368862867355,
"loss/hidden": 3.4609375,
"loss/jsd": 0.0,
"loss/logits": 0.21419077794998884,
"step": 4170
},
{
"epoch": 0.1045,
"grad_norm": 31.0,
"grad_norm_var": 11.670833333333333,
"learning_rate": 0.0001,
"loss": 7.6029,
"loss/crossentropy": 2.100097879767418,
"loss/hidden": 3.35390625,
"loss/jsd": 0.0,
"loss/logits": 0.18958222791552543,
"step": 4180
},
{
"epoch": 0.10475,
"grad_norm": 29.375,
"grad_norm_var": 7.664518229166666,
"learning_rate": 0.0001,
"loss": 7.5954,
"loss/crossentropy": 2.2243005722761153,
"loss/hidden": 3.40625,
"loss/jsd": 0.0,
"loss/logits": 0.19990855641663074,
"step": 4190
},
{
"epoch": 0.105,
"grad_norm": 29.75,
"grad_norm_var": 4.333072916666667,
"learning_rate": 0.0001,
"loss": 7.6671,
"loss/crossentropy": 2.250749832391739,
"loss/hidden": 3.308203125,
"loss/jsd": 0.0,
"loss/logits": 0.19082491770386695,
"step": 4200
},
{
"epoch": 0.10525,
"grad_norm": 29.25,
"grad_norm_var": 10.568489583333333,
"learning_rate": 0.0001,
"loss": 7.6118,
"loss/crossentropy": 2.155411234498024,
"loss/hidden": 3.42421875,
"loss/jsd": 0.0,
"loss/logits": 0.2127680890262127,
"step": 4210
},
{
"epoch": 0.1055,
"grad_norm": 34.25,
"grad_norm_var": 8.6556640625,
"learning_rate": 0.0001,
"loss": 7.4391,
"loss/crossentropy": 2.037487879395485,
"loss/hidden": 3.31015625,
"loss/jsd": 0.0,
"loss/logits": 0.17465929109603168,
"step": 4220
},
{
"epoch": 0.10575,
"grad_norm": 30.625,
"grad_norm_var": 8.183072916666667,
"learning_rate": 0.0001,
"loss": 7.6537,
"loss/crossentropy": 2.067080709338188,
"loss/hidden": 3.316796875,
"loss/jsd": 0.0,
"loss/logits": 0.1857584908604622,
"step": 4230
},
{
"epoch": 0.106,
"grad_norm": 31.25,
"grad_norm_var": 1.4171223958333334,
"learning_rate": 0.0001,
"loss": 7.608,
"loss/crossentropy": 2.279679241776466,
"loss/hidden": 3.33984375,
"loss/jsd": 0.0,
"loss/logits": 0.206354571133852,
"step": 4240
},
{
"epoch": 0.10625,
"grad_norm": 31.75,
"grad_norm_var": 31.7681640625,
"learning_rate": 0.0001,
"loss": 7.5259,
"loss/crossentropy": 2.1007855504751207,
"loss/hidden": 3.37265625,
"loss/jsd": 0.0,
"loss/logits": 0.1838985349982977,
"step": 4250
},
{
"epoch": 0.1065,
"grad_norm": 28.625,
"grad_norm_var": 2.5104166666666665,
"learning_rate": 0.0001,
"loss": 7.5786,
"loss/crossentropy": 2.1117694169282912,
"loss/hidden": 3.465625,
"loss/jsd": 0.0,
"loss/logits": 0.2134232448413968,
"step": 4260
},
{
"epoch": 0.10675,
"grad_norm": 33.25,
"grad_norm_var": 5.585416666666666,
"learning_rate": 0.0001,
"loss": 7.5424,
"loss/crossentropy": 2.1906253546476364,
"loss/hidden": 3.380859375,
"loss/jsd": 0.0,
"loss/logits": 0.19627480674535036,
"step": 4270
},
{
"epoch": 0.107,
"grad_norm": 33.75,
"grad_norm_var": 16.780208333333334,
"learning_rate": 0.0001,
"loss": 7.7039,
"loss/crossentropy": 2.167680537700653,
"loss/hidden": 3.469921875,
"loss/jsd": 0.0,
"loss/logits": 0.21632006093859674,
"step": 4280
},
{
"epoch": 0.10725,
"grad_norm": 32.75,
"grad_norm_var": 14.226497395833333,
"learning_rate": 0.0001,
"loss": 7.6197,
"loss/crossentropy": 2.2265933483839033,
"loss/hidden": 3.403125,
"loss/jsd": 0.0,
"loss/logits": 0.20777842812240124,
"step": 4290
},
{
"epoch": 0.1075,
"grad_norm": 32.25,
"grad_norm_var": 5.06640625,
"learning_rate": 0.0001,
"loss": 7.5019,
"loss/crossentropy": 2.0171881064772608,
"loss/hidden": 3.36640625,
"loss/jsd": 0.0,
"loss/logits": 0.1733700342476368,
"step": 4300
},
{
"epoch": 0.10775,
"grad_norm": 32.75,
"grad_norm_var": 39.37473958333333,
"learning_rate": 0.0001,
"loss": 7.6624,
"loss/crossentropy": 2.052209459245205,
"loss/hidden": 3.487109375,
"loss/jsd": 0.0,
"loss/logits": 0.2088288875296712,
"step": 4310
},
{
"epoch": 0.108,
"grad_norm": 29.375,
"grad_norm_var": 2.460724588873515e+18,
"learning_rate": 0.0001,
"loss": 7.5801,
"loss/crossentropy": 2.096518099308014,
"loss/hidden": 3.43671875,
"loss/jsd": 0.0,
"loss/logits": 0.2093063434585929,
"step": 4320
},
{
"epoch": 0.10825,
"grad_norm": 28.25,
"grad_norm_var": 2.4607245888865874e+18,
"learning_rate": 0.0001,
"loss": 7.4793,
"loss/crossentropy": 2.1164163142442702,
"loss/hidden": 3.405078125,
"loss/jsd": 0.0,
"loss/logits": 0.19930963944643737,
"step": 4330
},
{
"epoch": 0.1085,
"grad_norm": 33.5,
"grad_norm_var": 46.587239583333336,
"learning_rate": 0.0001,
"loss": 7.5915,
"loss/crossentropy": 2.1831407219171526,
"loss/hidden": 3.446875,
"loss/jsd": 0.0,
"loss/logits": 0.21474861968308687,
"step": 4340
},
{
"epoch": 0.10875,
"grad_norm": 31.0,
"grad_norm_var": 17.534375,
"learning_rate": 0.0001,
"loss": 7.4743,
"loss/crossentropy": 2.1157304018735887,
"loss/hidden": 3.416015625,
"loss/jsd": 0.0,
"loss/logits": 0.2028682116419077,
"step": 4350
},
{
"epoch": 0.109,
"grad_norm": 28.625,
"grad_norm_var": 4.282291666666667,
"learning_rate": 0.0001,
"loss": 7.6076,
"loss/crossentropy": 1.980335572361946,
"loss/hidden": 3.541796875,
"loss/jsd": 0.0,
"loss/logits": 0.20984734632074833,
"step": 4360
},
{
"epoch": 0.10925,
"grad_norm": 31.25,
"grad_norm_var": 30.812955729166667,
"learning_rate": 0.0001,
"loss": 7.5349,
"loss/crossentropy": 2.107056123018265,
"loss/hidden": 3.25859375,
"loss/jsd": 0.0,
"loss/logits": 0.18440892472863196,
"step": 4370
},
{
"epoch": 0.1095,
"grad_norm": 30.375,
"grad_norm_var": 21.462239583333332,
"learning_rate": 0.0001,
"loss": 7.5459,
"loss/crossentropy": 2.2860846698284147,
"loss/hidden": 3.297265625,
"loss/jsd": 0.0,
"loss/logits": 0.1911198776215315,
"step": 4380
},
{
"epoch": 0.10975,
"grad_norm": 30.0,
"grad_norm_var": 5.008333333333334,
"learning_rate": 0.0001,
"loss": 7.59,
"loss/crossentropy": 2.150055022537708,
"loss/hidden": 3.396875,
"loss/jsd": 0.0,
"loss/logits": 0.20779079720377922,
"step": 4390
},
{
"epoch": 0.11,
"grad_norm": 37.25,
"grad_norm_var": 24.933268229166668,
"learning_rate": 0.0001,
"loss": 7.6256,
"loss/crossentropy": 2.225931641459465,
"loss/hidden": 3.569921875,
"loss/jsd": 0.0,
"loss/logits": 0.23082431070506573,
"step": 4400
},
{
"epoch": 0.11025,
"grad_norm": 29.375,
"grad_norm_var": 26.770572916666666,
"learning_rate": 0.0001,
"loss": 7.5585,
"loss/crossentropy": 2.1318808451294897,
"loss/hidden": 3.41484375,
"loss/jsd": 0.0,
"loss/logits": 0.205277425237,
"step": 4410
},
{
"epoch": 0.1105,
"grad_norm": 28.25,
"grad_norm_var": 6.009830729166667,
"learning_rate": 0.0001,
"loss": 7.5037,
"loss/crossentropy": 2.108688759803772,
"loss/hidden": 3.426171875,
"loss/jsd": 0.0,
"loss/logits": 0.19271691460162402,
"step": 4420
},
{
"epoch": 0.11075,
"grad_norm": 30.875,
"grad_norm_var": 2.85390625,
"learning_rate": 0.0001,
"loss": 7.5599,
"loss/crossentropy": 2.1696368783712385,
"loss/hidden": 3.391796875,
"loss/jsd": 0.0,
"loss/logits": 0.19752040579915048,
"step": 4430
},
{
"epoch": 0.111,
"grad_norm": 30.25,
"grad_norm_var": 18.476822916666666,
"learning_rate": 0.0001,
"loss": 7.5321,
"loss/crossentropy": 2.167906680703163,
"loss/hidden": 3.48828125,
"loss/jsd": 0.0,
"loss/logits": 0.2087454443797469,
"step": 4440
},
{
"epoch": 0.11125,
"grad_norm": 30.75,
"grad_norm_var": 4.6634765625,
"learning_rate": 0.0001,
"loss": 7.6168,
"loss/crossentropy": 2.0555444791913033,
"loss/hidden": 3.325,
"loss/jsd": 0.0,
"loss/logits": 0.17809300348162652,
"step": 4450
},
{
"epoch": 0.1115,
"grad_norm": 31.25,
"grad_norm_var": 5.618489583333333,
"learning_rate": 0.0001,
"loss": 7.669,
"loss/crossentropy": 2.2362961381673814,
"loss/hidden": 3.480078125,
"loss/jsd": 0.0,
"loss/logits": 0.21373681984841825,
"step": 4460
},
{
"epoch": 0.11175,
"grad_norm": 31.625,
"grad_norm_var": 1.8311848958333334,
"learning_rate": 0.0001,
"loss": 7.469,
"loss/crossentropy": 2.207303923368454,
"loss/hidden": 3.400390625,
"loss/jsd": 0.0,
"loss/logits": 0.21020539589226245,
"step": 4470
},
{
"epoch": 0.112,
"grad_norm": 66.0,
"grad_norm_var": 76.79973958333333,
"learning_rate": 0.0001,
"loss": 7.5099,
"loss/crossentropy": 2.170773930847645,
"loss/hidden": 3.4703125,
"loss/jsd": 0.0,
"loss/logits": 0.20324019938707352,
"step": 4480
},
{
"epoch": 0.11225,
"grad_norm": 35.0,
"grad_norm_var": 76.34895833333333,
"learning_rate": 0.0001,
"loss": 7.5149,
"loss/crossentropy": 2.047274041175842,
"loss/hidden": 3.337890625,
"loss/jsd": 0.0,
"loss/logits": 0.18544426914304496,
"step": 4490
},
{
"epoch": 0.1125,
"grad_norm": 28.25,
"grad_norm_var": 4.081705729166667,
"learning_rate": 0.0001,
"loss": 7.5745,
"loss/crossentropy": 2.1270942091941833,
"loss/hidden": 3.311328125,
"loss/jsd": 0.0,
"loss/logits": 0.19201683439314365,
"step": 4500
},
{
"epoch": 0.11275,
"grad_norm": 28.25,
"grad_norm_var": 2.6192057291666666,
"learning_rate": 0.0001,
"loss": 7.632,
"loss/crossentropy": 2.0815075978636743,
"loss/hidden": 3.438671875,
"loss/jsd": 0.0,
"loss/logits": 0.20613169986754656,
"step": 4510
},
{
"epoch": 0.113,
"grad_norm": 33.25,
"grad_norm_var": 166.34212239583334,
"learning_rate": 0.0001,
"loss": 7.5383,
"loss/crossentropy": 2.144553080201149,
"loss/hidden": 3.33125,
"loss/jsd": 0.0,
"loss/logits": 0.19253196399658917,
"step": 4520
},
{
"epoch": 0.11325,
"grad_norm": 35.75,
"grad_norm_var": 1.734519148252968e+18,
"learning_rate": 0.0001,
"loss": 7.5876,
"loss/crossentropy": 2.212298333644867,
"loss/hidden": 3.34765625,
"loss/jsd": 0.0,
"loss/logits": 0.19719784446060656,
"step": 4530
},
{
"epoch": 0.1135,
"grad_norm": 30.625,
"grad_norm_var": 32.881184895833336,
"learning_rate": 0.0001,
"loss": 7.4563,
"loss/crossentropy": 2.095240616798401,
"loss/hidden": 3.41796875,
"loss/jsd": 0.0,
"loss/logits": 0.19202221632003785,
"step": 4540
},
{
"epoch": 0.11375,
"grad_norm": 31.625,
"grad_norm_var": 87.4853515625,
"learning_rate": 0.0001,
"loss": 7.5184,
"loss/crossentropy": 2.134147650748491,
"loss/hidden": 3.340625,
"loss/jsd": 0.0,
"loss/logits": 0.19104634067043663,
"step": 4550
},
{
"epoch": 0.114,
"grad_norm": 32.25,
"grad_norm_var": 78.95618489583333,
"learning_rate": 0.0001,
"loss": 7.4668,
"loss/crossentropy": 2.1294006586074827,
"loss/hidden": 3.326953125,
"loss/jsd": 0.0,
"loss/logits": 0.1853051505982876,
"step": 4560
},
{
"epoch": 0.11425,
"grad_norm": 30.5,
"grad_norm_var": 4.706705729166667,
"learning_rate": 0.0001,
"loss": 7.5114,
"loss/crossentropy": 2.0886638939380644,
"loss/hidden": 3.452734375,
"loss/jsd": 0.0,
"loss/logits": 0.20859680250287055,
"step": 4570
},
{
"epoch": 0.1145,
"grad_norm": 35.5,
"grad_norm_var": 5.934830729166666,
"learning_rate": 0.0001,
"loss": 7.56,
"loss/crossentropy": 2.1498399868607523,
"loss/hidden": 3.50703125,
"loss/jsd": 0.0,
"loss/logits": 0.18966037435457112,
"step": 4580
},
{
"epoch": 0.11475,
"grad_norm": 32.75,
"grad_norm_var": 3.358268229166667,
"learning_rate": 0.0001,
"loss": 7.6013,
"loss/crossentropy": 2.199784816801548,
"loss/hidden": 3.347265625,
"loss/jsd": 0.0,
"loss/logits": 0.18477672804147005,
"step": 4590
},
{
"epoch": 0.115,
"grad_norm": 31.375,
"grad_norm_var": 17.598372395833334,
"learning_rate": 0.0001,
"loss": 7.5806,
"loss/crossentropy": 2.165570431947708,
"loss/hidden": 3.505859375,
"loss/jsd": 0.0,
"loss/logits": 0.2113142903894186,
"step": 4600
},
{
"epoch": 0.11525,
"grad_norm": 31.75,
"grad_norm_var": 11.908268229166667,
"learning_rate": 0.0001,
"loss": 7.6265,
"loss/crossentropy": 2.07996127307415,
"loss/hidden": 3.418359375,
"loss/jsd": 0.0,
"loss/logits": 0.21436248067766428,
"step": 4610
},
{
"epoch": 0.1155,
"grad_norm": 28.125,
"grad_norm_var": 4.520833333333333,
"learning_rate": 0.0001,
"loss": 7.57,
"loss/crossentropy": 2.1484976023435594,
"loss/hidden": 3.390234375,
"loss/jsd": 0.0,
"loss/logits": 0.1913412597030401,
"step": 4620
},
{
"epoch": 0.11575,
"grad_norm": 30.125,
"grad_norm_var": 4.1875,
"learning_rate": 0.0001,
"loss": 7.4701,
"loss/crossentropy": 2.0956287920475005,
"loss/hidden": 3.39140625,
"loss/jsd": 0.0,
"loss/logits": 0.19558896012604238,
"step": 4630
},
{
"epoch": 0.116,
"grad_norm": 29.875,
"grad_norm_var": 2.106184895833333,
"learning_rate": 0.0001,
"loss": 7.4538,
"loss/crossentropy": 2.1474092990159988,
"loss/hidden": 3.45234375,
"loss/jsd": 0.0,
"loss/logits": 0.2039597311988473,
"step": 4640
},
{
"epoch": 0.11625,
"grad_norm": 31.75,
"grad_norm_var": 3.520768229166667,
"learning_rate": 0.0001,
"loss": 7.6657,
"loss/crossentropy": 2.1128339886665346,
"loss/hidden": 3.361328125,
"loss/jsd": 0.0,
"loss/logits": 0.21120875477790832,
"step": 4650
},
{
"epoch": 0.1165,
"grad_norm": 31.5,
"grad_norm_var": 2.133072916666667,
"learning_rate": 0.0001,
"loss": 7.5441,
"loss/crossentropy": 2.1643586844205855,
"loss/hidden": 3.4765625,
"loss/jsd": 0.0,
"loss/logits": 0.20440610516816377,
"step": 4660
},
{
"epoch": 0.11675,
"grad_norm": 33.0,
"grad_norm_var": 3.504622395833333,
"learning_rate": 0.0001,
"loss": 7.6051,
"loss/crossentropy": 2.195269528031349,
"loss/hidden": 3.428125,
"loss/jsd": 0.0,
"loss/logits": 0.20059011913836003,
"step": 4670
},
{
"epoch": 0.117,
"grad_norm": 30.0,
"grad_norm_var": 31.235416666666666,
"learning_rate": 0.0001,
"loss": 7.4526,
"loss/crossentropy": 2.082709529995918,
"loss/hidden": 3.36640625,
"loss/jsd": 0.0,
"loss/logits": 0.19617473538964986,
"step": 4680
},
{
"epoch": 0.11725,
"grad_norm": 32.0,
"grad_norm_var": 3.45625,
"learning_rate": 0.0001,
"loss": 7.5593,
"loss/crossentropy": 2.1473275452852247,
"loss/hidden": 3.481640625,
"loss/jsd": 0.0,
"loss/logits": 0.20148523468524218,
"step": 4690
},
{
"epoch": 0.1175,
"grad_norm": 31.875,
"grad_norm_var": 2.145768229166667,
"learning_rate": 0.0001,
"loss": 7.4609,
"loss/crossentropy": 2.14631325006485,
"loss/hidden": 3.42890625,
"loss/jsd": 0.0,
"loss/logits": 0.20976583026349543,
"step": 4700
},
{
"epoch": 0.11775,
"grad_norm": 32.5,
"grad_norm_var": 33.20826822916667,
"learning_rate": 0.0001,
"loss": 7.5877,
"loss/crossentropy": 2.1093395471572878,
"loss/hidden": 3.46953125,
"loss/jsd": 0.0,
"loss/logits": 0.20207433141767978,
"step": 4710
},
{
"epoch": 0.118,
"grad_norm": 30.75,
"grad_norm_var": 10.86875,
"learning_rate": 0.0001,
"loss": 7.5437,
"loss/crossentropy": 2.1447531282901764,
"loss/hidden": 3.353515625,
"loss/jsd": 0.0,
"loss/logits": 0.1963239062577486,
"step": 4720
},
{
"epoch": 0.11825,
"grad_norm": 34.5,
"grad_norm_var": 4.473372395833334,
"learning_rate": 0.0001,
"loss": 7.6248,
"loss/crossentropy": 2.1142423778772352,
"loss/hidden": 3.283203125,
"loss/jsd": 0.0,
"loss/logits": 0.19059138614684343,
"step": 4730
},
{
"epoch": 0.1185,
"grad_norm": 27.875,
"grad_norm_var": 3.098372395833333,
"learning_rate": 0.0001,
"loss": 7.5631,
"loss/crossentropy": 2.07417613863945,
"loss/hidden": 3.455859375,
"loss/jsd": 0.0,
"loss/logits": 0.19420854579657315,
"step": 4740
},
{
"epoch": 0.11875,
"grad_norm": 29.875,
"grad_norm_var": 2.734375,
"learning_rate": 0.0001,
"loss": 7.6499,
"loss/crossentropy": 2.0995796024799347,
"loss/hidden": 3.518359375,
"loss/jsd": 0.0,
"loss/logits": 0.21285411342978477,
"step": 4750
},
{
"epoch": 0.119,
"grad_norm": 32.5,
"grad_norm_var": 2.7228515625,
"learning_rate": 0.0001,
"loss": 7.5854,
"loss/crossentropy": 2.169099047780037,
"loss/hidden": 3.462109375,
"loss/jsd": 0.0,
"loss/logits": 0.22147531677037477,
"step": 4760
},
{
"epoch": 0.11925,
"grad_norm": 32.5,
"grad_norm_var": 34.799739583333334,
"learning_rate": 0.0001,
"loss": 7.435,
"loss/crossentropy": 2.0821361050009726,
"loss/hidden": 3.2953125,
"loss/jsd": 0.0,
"loss/logits": 0.17594938166439533,
"step": 4770
},
{
"epoch": 0.1195,
"grad_norm": 33.75,
"grad_norm_var": 23.00390625,
"learning_rate": 0.0001,
"loss": 7.6152,
"loss/crossentropy": 2.183753404021263,
"loss/hidden": 3.364453125,
"loss/jsd": 0.0,
"loss/logits": 0.19579742290079594,
"step": 4780
},
{
"epoch": 0.11975,
"grad_norm": 29.25,
"grad_norm_var": 23.6541015625,
"learning_rate": 0.0001,
"loss": 7.5295,
"loss/crossentropy": 2.1669270396232605,
"loss/hidden": 3.505859375,
"loss/jsd": 0.0,
"loss/logits": 0.21605710163712502,
"step": 4790
},
{
"epoch": 0.12,
"grad_norm": 34.25,
"grad_norm_var": 4.624739583333334,
"learning_rate": 0.0001,
"loss": 7.6351,
"loss/crossentropy": 2.116096779704094,
"loss/hidden": 3.4625,
"loss/jsd": 0.0,
"loss/logits": 0.21523005720227956,
"step": 4800
},
{
"epoch": 0.12025,
"grad_norm": 29.5,
"grad_norm_var": 6.009375,
"learning_rate": 0.0001,
"loss": 7.5683,
"loss/crossentropy": 2.051844981312752,
"loss/hidden": 3.351953125,
"loss/jsd": 0.0,
"loss/logits": 0.17574754767119885,
"step": 4810
},
{
"epoch": 0.1205,
"grad_norm": 30.5,
"grad_norm_var": 2.0087890625,
"learning_rate": 0.0001,
"loss": 7.5892,
"loss/crossentropy": 2.2223299980163573,
"loss/hidden": 3.3765625,
"loss/jsd": 0.0,
"loss/logits": 0.19802077617496253,
"step": 4820
},
{
"epoch": 0.12075,
"grad_norm": 30.125,
"grad_norm_var": 3.1080729166666665,
"learning_rate": 0.0001,
"loss": 7.5951,
"loss/crossentropy": 2.1982655793428423,
"loss/hidden": 3.444140625,
"loss/jsd": 0.0,
"loss/logits": 0.2064062263816595,
"step": 4830
},
{
"epoch": 0.121,
"grad_norm": 30.0,
"grad_norm_var": 12.412434895833334,
"learning_rate": 0.0001,
"loss": 7.5254,
"loss/crossentropy": 1.9753010511398315,
"loss/hidden": 3.4484375,
"loss/jsd": 0.0,
"loss/logits": 0.19706314485520124,
"step": 4840
},
{
"epoch": 0.12125,
"grad_norm": 30.125,
"grad_norm_var": 13.1447265625,
"learning_rate": 0.0001,
"loss": 7.5351,
"loss/crossentropy": 2.0562877766788006,
"loss/hidden": 3.396875,
"loss/jsd": 0.0,
"loss/logits": 0.17667807769030333,
"step": 4850
},
{
"epoch": 0.1215,
"grad_norm": 33.0,
"grad_norm_var": 3.465625,
"learning_rate": 0.0001,
"loss": 7.6024,
"loss/crossentropy": 2.1578008987009527,
"loss/hidden": 3.344140625,
"loss/jsd": 0.0,
"loss/logits": 0.18234786652028562,
"step": 4860
},
{
"epoch": 0.12175,
"grad_norm": 30.375,
"grad_norm_var": 2.343489583333333,
"learning_rate": 0.0001,
"loss": 7.4726,
"loss/crossentropy": 2.1330083698034286,
"loss/hidden": 3.333203125,
"loss/jsd": 0.0,
"loss/logits": 0.18738476932048798,
"step": 4870
},
{
"epoch": 0.122,
"grad_norm": 31.625,
"grad_norm_var": 3.2322265625,
"learning_rate": 0.0001,
"loss": 7.605,
"loss/crossentropy": 2.1732089832425117,
"loss/hidden": 3.460546875,
"loss/jsd": 0.0,
"loss/logits": 0.19949225690215827,
"step": 4880
},
{
"epoch": 0.12225,
"grad_norm": 33.5,
"grad_norm_var": 2.238997395833333,
"learning_rate": 0.0001,
"loss": 7.5382,
"loss/crossentropy": 2.188927575945854,
"loss/hidden": 3.419921875,
"loss/jsd": 0.0,
"loss/logits": 0.19776681065559387,
"step": 4890
},
{
"epoch": 0.1225,
"grad_norm": 29.0,
"grad_norm_var": 15.819791666666667,
"learning_rate": 0.0001,
"loss": 7.5728,
"loss/crossentropy": 2.202793037891388,
"loss/hidden": 3.36328125,
"loss/jsd": 0.0,
"loss/logits": 0.1992744604125619,
"step": 4900
},
{
"epoch": 0.12275,
"grad_norm": 30.25,
"grad_norm_var": 16.1775390625,
"learning_rate": 0.0001,
"loss": 7.426,
"loss/crossentropy": 2.177696394920349,
"loss/hidden": 3.36796875,
"loss/jsd": 0.0,
"loss/logits": 0.19968770015984774,
"step": 4910
},
{
"epoch": 0.123,
"grad_norm": 35.5,
"grad_norm_var": 7.707291666666666,
"learning_rate": 0.0001,
"loss": 7.7485,
"loss/crossentropy": 2.040875867009163,
"loss/hidden": 3.45234375,
"loss/jsd": 0.0,
"loss/logits": 0.1929738214239478,
"step": 4920
},
{
"epoch": 0.12325,
"grad_norm": 32.25,
"grad_norm_var": 6.771809895833333,
"learning_rate": 0.0001,
"loss": 7.5517,
"loss/crossentropy": 2.199721184372902,
"loss/hidden": 3.406640625,
"loss/jsd": 0.0,
"loss/logits": 0.20861553251743317,
"step": 4930
},
{
"epoch": 0.1235,
"grad_norm": 28.75,
"grad_norm_var": 4.235416666666667,
"learning_rate": 0.0001,
"loss": 7.5576,
"loss/crossentropy": 2.133353302627802,
"loss/hidden": 3.45546875,
"loss/jsd": 0.0,
"loss/logits": 0.21631606128066777,
"step": 4940
},
{
"epoch": 0.12375,
"grad_norm": 40.0,
"grad_norm_var": 15.1650390625,
"learning_rate": 0.0001,
"loss": 7.5129,
"loss/crossentropy": 2.2380867928266523,
"loss/hidden": 3.3703125,
"loss/jsd": 0.0,
"loss/logits": 0.2078275766223669,
"step": 4950
},
{
"epoch": 0.124,
"grad_norm": 32.5,
"grad_norm_var": 9.342643229166667,
"learning_rate": 0.0001,
"loss": 7.4421,
"loss/crossentropy": 2.1563473463058473,
"loss/hidden": 3.374609375,
"loss/jsd": 0.0,
"loss/logits": 0.1852986102923751,
"step": 4960
},
{
"epoch": 0.12425,
"grad_norm": 32.5,
"grad_norm_var": 2.3916666666666666,
"learning_rate": 0.0001,
"loss": 7.5503,
"loss/crossentropy": 2.124380439519882,
"loss/hidden": 3.480859375,
"loss/jsd": 0.0,
"loss/logits": 0.20270956568419934,
"step": 4970
},
{
"epoch": 0.1245,
"grad_norm": 30.875,
"grad_norm_var": 5.837434895833334,
"learning_rate": 0.0001,
"loss": 7.4857,
"loss/crossentropy": 2.0482941284775733,
"loss/hidden": 3.373828125,
"loss/jsd": 0.0,
"loss/logits": 0.18650466352701187,
"step": 4980
},
{
"epoch": 0.12475,
"grad_norm": 31.0,
"grad_norm_var": 3.0858723958333334,
"learning_rate": 0.0001,
"loss": 7.5446,
"loss/crossentropy": 2.160063311457634,
"loss/hidden": 3.319140625,
"loss/jsd": 0.0,
"loss/logits": 0.18700905814766883,
"step": 4990
},
{
"epoch": 0.125,
"grad_norm": 28.75,
"grad_norm_var": 5.533333333333333,
"learning_rate": 0.0001,
"loss": 7.5333,
"loss/crossentropy": 2.056824396550655,
"loss/hidden": 3.4421875,
"loss/jsd": 0.0,
"loss/logits": 0.18965724110603333,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 40000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4287550160044032e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}