4b2-2k / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
309f1d9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 2000,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 10752.0,
"learning_rate": 1.9e-05,
"loss": 158.0638,
"loss/crossentropy": 14.456178283691406,
"loss/hidden": 18.91875,
"loss/jsd": 0.0,
"loss/logits": 12.539741969108581,
"step": 10
},
{
"epoch": 0.002,
"grad_norm": 3264.0,
"grad_norm_var": 13568954.666666666,
"learning_rate": 2.8000000000000003e-05,
"loss": 129.8883,
"loss/crossentropy": 11.943150734901428,
"loss/hidden": 19.128125,
"loss/jsd": 0.0,
"loss/logits": 10.032073307037354,
"step": 20
},
{
"epoch": 0.003,
"grad_norm": 1824.0,
"grad_norm_var": 3372859.7333333334,
"learning_rate": 3.7e-05,
"loss": 100.0245,
"loss/crossentropy": 9.159896969795227,
"loss/hidden": 18.609375,
"loss/jsd": 0.0,
"loss/logits": 7.277156031131744,
"step": 30
},
{
"epoch": 0.004,
"grad_norm": 604.0,
"grad_norm_var": 331110.3333333333,
"learning_rate": 4.600000000000001e-05,
"loss": 90.5579,
"loss/crossentropy": 8.28059525489807,
"loss/hidden": 18.39375,
"loss/jsd": 0.0,
"loss/logits": 6.247069478034973,
"step": 40
},
{
"epoch": 0.005,
"grad_norm": 1128.0,
"grad_norm_var": 60515.2,
"learning_rate": 5.500000000000001e-05,
"loss": 86.1966,
"loss/crossentropy": 8.01256047487259,
"loss/hidden": 18.175,
"loss/jsd": 0.0,
"loss/logits": 6.1038679599761965,
"step": 50
},
{
"epoch": 0.006,
"grad_norm": 1360.0,
"grad_norm_var": 67713.86666666667,
"learning_rate": 6.400000000000001e-05,
"loss": 82.9348,
"loss/crossentropy": 7.731317961215973,
"loss/hidden": 17.959375,
"loss/jsd": 0.0,
"loss/logits": 5.726186037063599,
"step": 60
},
{
"epoch": 0.007,
"grad_norm": 1016.0,
"grad_norm_var": 35902.933333333334,
"learning_rate": 7.3e-05,
"loss": 78.6625,
"loss/crossentropy": 7.318132603168488,
"loss/hidden": 17.8375,
"loss/jsd": 0.0,
"loss/logits": 5.322234338521957,
"step": 70
},
{
"epoch": 0.008,
"grad_norm": 836.0,
"grad_norm_var": 12856.466666666667,
"learning_rate": 8.200000000000001e-05,
"loss": 74.6,
"loss/crossentropy": 6.863537752628327,
"loss/hidden": 17.325,
"loss/jsd": 0.0,
"loss/logits": 4.851147556304932,
"step": 80
},
{
"epoch": 0.009,
"grad_norm": 1168.0,
"grad_norm_var": 38569.0,
"learning_rate": 9.1e-05,
"loss": 69.2648,
"loss/crossentropy": 6.536011290550232,
"loss/hidden": 16.871875,
"loss/jsd": 0.0,
"loss/logits": 4.729572284221649,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 956.0,
"grad_norm_var": 54132.26666666667,
"learning_rate": 0.0001,
"loss": 61.5492,
"loss/crossentropy": 5.978731215000153,
"loss/hidden": 15.9046875,
"loss/jsd": 0.0,
"loss/logits": 4.037681633234024,
"step": 100
},
{
"epoch": 0.011,
"grad_norm": 494.0,
"grad_norm_var": 60329.066666666666,
"learning_rate": 0.0001,
"loss": 50.5696,
"loss/crossentropy": 5.069290089607239,
"loss/hidden": 13.9625,
"loss/jsd": 0.0,
"loss/logits": 3.04628010392189,
"step": 110
},
{
"epoch": 0.012,
"grad_norm": 242.0,
"grad_norm_var": 33342.59583333333,
"learning_rate": 0.0001,
"loss": 38.8513,
"loss/crossentropy": 4.116593188047409,
"loss/hidden": 12.21875,
"loss/jsd": 0.0,
"loss/logits": 2.207608225941658,
"step": 120
},
{
"epoch": 0.013,
"grad_norm": 189.0,
"grad_norm_var": 2268.9625,
"learning_rate": 0.0001,
"loss": 30.2934,
"loss/crossentropy": 3.6065172433853148,
"loss/hidden": 10.4703125,
"loss/jsd": 0.0,
"loss/logits": 1.553831559419632,
"step": 130
},
{
"epoch": 0.014,
"grad_norm": 129.0,
"grad_norm_var": 428.78333333333336,
"learning_rate": 0.0001,
"loss": 25.4075,
"loss/crossentropy": 3.238997083902359,
"loss/hidden": 9.36875,
"loss/jsd": 0.0,
"loss/logits": 1.2455815717577934,
"step": 140
},
{
"epoch": 0.015,
"grad_norm": 147.0,
"grad_norm_var": 884.8666666666667,
"learning_rate": 0.0001,
"loss": 21.889,
"loss/crossentropy": 3.104075390100479,
"loss/hidden": 8.19296875,
"loss/jsd": 0.0,
"loss/logits": 0.981781056523323,
"step": 150
},
{
"epoch": 0.016,
"grad_norm": 242.0,
"grad_norm_var": 1127.890625,
"learning_rate": 0.0001,
"loss": 19.3636,
"loss/crossentropy": 2.6487351998686792,
"loss/hidden": 7.96328125,
"loss/jsd": 0.0,
"loss/logits": 0.862408060580492,
"step": 160
},
{
"epoch": 0.017,
"grad_norm": 139.0,
"grad_norm_var": 1720.690625,
"learning_rate": 0.0001,
"loss": 17.9103,
"loss/crossentropy": 2.944036450982094,
"loss/hidden": 7.28671875,
"loss/jsd": 0.0,
"loss/logits": 0.7954695858061314,
"step": 170
},
{
"epoch": 0.018,
"grad_norm": 127.0,
"grad_norm_var": 1522.8958333333333,
"learning_rate": 0.0001,
"loss": 17.1787,
"loss/crossentropy": 2.7259451180696486,
"loss/hidden": 7.03046875,
"loss/jsd": 0.0,
"loss/logits": 0.7603268466889859,
"step": 180
},
{
"epoch": 0.019,
"grad_norm": 155.0,
"grad_norm_var": 1390.2666666666667,
"learning_rate": 0.0001,
"loss": 16.3546,
"loss/crossentropy": 2.745239295065403,
"loss/hidden": 6.74765625,
"loss/jsd": 0.0,
"loss/logits": 0.6926519803702831,
"step": 190
},
{
"epoch": 0.02,
"grad_norm": 164.0,
"grad_norm_var": 902.2666666666667,
"learning_rate": 0.0001,
"loss": 15.7972,
"loss/crossentropy": 2.6587735950946807,
"loss/hidden": 6.6234375,
"loss/jsd": 0.0,
"loss/logits": 0.6642795346677304,
"step": 200
},
{
"epoch": 0.021,
"grad_norm": 173.0,
"grad_norm_var": 1056.5166666666667,
"learning_rate": 0.0001,
"loss": 15.4154,
"loss/crossentropy": 2.67086471170187,
"loss/hidden": 6.30390625,
"loss/jsd": 0.0,
"loss/logits": 0.6120679222047329,
"step": 210
},
{
"epoch": 0.022,
"grad_norm": 168.0,
"grad_norm_var": 446.2291666666667,
"learning_rate": 0.0001,
"loss": 14.9164,
"loss/crossentropy": 2.8284773945808412,
"loss/hidden": 6.15546875,
"loss/jsd": 0.0,
"loss/logits": 0.6234366297721863,
"step": 220
},
{
"epoch": 0.023,
"grad_norm": 187.0,
"grad_norm_var": 7334.5625,
"learning_rate": 0.0001,
"loss": 14.9531,
"loss/crossentropy": 2.716707041859627,
"loss/hidden": 6.196875,
"loss/jsd": 0.0,
"loss/logits": 0.6206937313079834,
"step": 230
},
{
"epoch": 0.024,
"grad_norm": 172.0,
"grad_norm_var": 6329.6625,
"learning_rate": 0.0001,
"loss": 14.4769,
"loss/crossentropy": 2.4854482382535936,
"loss/hidden": 6.05859375,
"loss/jsd": 0.0,
"loss/logits": 0.5394440380856395,
"step": 240
},
{
"epoch": 0.025,
"grad_norm": 1149239296.0,
"grad_norm_var": 8.254690576187568e+16,
"learning_rate": 0.0001,
"loss": 14.4045,
"loss/crossentropy": 2.717127138376236,
"loss/hidden": 5.9890625,
"loss/jsd": 0.0,
"loss/logits": 0.5883205510675907,
"step": 250
},
{
"epoch": 0.026,
"grad_norm": 149.0,
"grad_norm_var": 8.254691009067347e+16,
"learning_rate": 0.0001,
"loss": 13.9101,
"loss/crossentropy": 2.478851719200611,
"loss/hidden": 5.8765625,
"loss/jsd": 0.0,
"loss/logits": 0.5184394292533397,
"step": 260
},
{
"epoch": 0.027,
"grad_norm": 186.0,
"grad_norm_var": 930.0625,
"learning_rate": 0.0001,
"loss": 13.6027,
"loss/crossentropy": 2.553143638372421,
"loss/hidden": 5.75234375,
"loss/jsd": 0.0,
"loss/logits": 0.5368255846202373,
"step": 270
},
{
"epoch": 0.028,
"grad_norm": 172.0,
"grad_norm_var": 3805.0666666666666,
"learning_rate": 0.0001,
"loss": 13.9542,
"loss/crossentropy": 2.7657821238040925,
"loss/hidden": 5.8421875,
"loss/jsd": 0.0,
"loss/logits": 0.5587639883160591,
"step": 280
},
{
"epoch": 0.029,
"grad_norm": 135.0,
"grad_norm_var": 3837.616666666667,
"learning_rate": 0.0001,
"loss": 13.3979,
"loss/crossentropy": 2.4579825714230537,
"loss/hidden": 5.62421875,
"loss/jsd": 0.0,
"loss/logits": 0.5003356814384461,
"step": 290
},
{
"epoch": 0.03,
"grad_norm": 119.5,
"grad_norm_var": 4336.095833333334,
"learning_rate": 0.0001,
"loss": 13.372,
"loss/crossentropy": 2.4825384080410005,
"loss/hidden": 5.77734375,
"loss/jsd": 0.0,
"loss/logits": 0.5297574065625668,
"step": 300
},
{
"epoch": 0.031,
"grad_norm": 144.0,
"grad_norm_var": 2114.4333333333334,
"learning_rate": 0.0001,
"loss": 13.2199,
"loss/crossentropy": 2.6365180641412733,
"loss/hidden": 5.5484375,
"loss/jsd": 0.0,
"loss/logits": 0.5377178646624088,
"step": 310
},
{
"epoch": 0.032,
"grad_norm": 126.5,
"grad_norm_var": 885.75,
"learning_rate": 0.0001,
"loss": 13.022,
"loss/crossentropy": 2.41667592599988,
"loss/hidden": 5.5296875,
"loss/jsd": 0.0,
"loss/logits": 0.4934091318398714,
"step": 320
},
{
"epoch": 0.033,
"grad_norm": 113.0,
"grad_norm_var": 4216.623958333334,
"learning_rate": 0.0001,
"loss": 12.6825,
"loss/crossentropy": 2.6186458706855773,
"loss/hidden": 5.3796875,
"loss/jsd": 0.0,
"loss/logits": 0.48778619766235354,
"step": 330
},
{
"epoch": 0.034,
"grad_norm": 154.0,
"grad_norm_var": 637.090625,
"learning_rate": 0.0001,
"loss": 12.6415,
"loss/crossentropy": 2.6686057686805724,
"loss/hidden": 5.384375,
"loss/jsd": 0.0,
"loss/logits": 0.4940062865614891,
"step": 340
},
{
"epoch": 0.035,
"grad_norm": 144.0,
"grad_norm_var": 2633.765625,
"learning_rate": 0.0001,
"loss": 12.6064,
"loss/crossentropy": 2.52793410718441,
"loss/hidden": 5.21171875,
"loss/jsd": 0.0,
"loss/logits": 0.45680325478315353,
"step": 350
},
{
"epoch": 0.036,
"grad_norm": 141.0,
"grad_norm_var": 2513.148958333333,
"learning_rate": 0.0001,
"loss": 12.508,
"loss/crossentropy": 2.445630243420601,
"loss/hidden": 5.31171875,
"loss/jsd": 0.0,
"loss/logits": 0.4673466898500919,
"step": 360
},
{
"epoch": 0.037,
"grad_norm": 146.0,
"grad_norm_var": 161.95729166666666,
"learning_rate": 0.0001,
"loss": 12.3383,
"loss/crossentropy": 2.432392257452011,
"loss/hidden": 5.2109375,
"loss/jsd": 0.0,
"loss/logits": 0.4600852273404598,
"step": 370
},
{
"epoch": 0.038,
"grad_norm": 122.5,
"grad_norm_var": 1555.340625,
"learning_rate": 0.0001,
"loss": 12.2486,
"loss/crossentropy": 2.448658475279808,
"loss/hidden": 5.29765625,
"loss/jsd": 0.0,
"loss/logits": 0.47797103337943553,
"step": 380
},
{
"epoch": 0.039,
"grad_norm": 110.5,
"grad_norm_var": 159.92916666666667,
"learning_rate": 0.0001,
"loss": 11.9006,
"loss/crossentropy": 2.4291503965854644,
"loss/hidden": 5.01328125,
"loss/jsd": 0.0,
"loss/logits": 0.43006020598113537,
"step": 390
},
{
"epoch": 0.04,
"grad_norm": 136.0,
"grad_norm_var": 175.37395833333332,
"learning_rate": 0.0001,
"loss": 11.9938,
"loss/crossentropy": 2.604290932416916,
"loss/hidden": 4.9828125,
"loss/jsd": 0.0,
"loss/logits": 0.4612982179969549,
"step": 400
},
{
"epoch": 0.041,
"grad_norm": 109.0,
"grad_norm_var": 170.09583333333333,
"learning_rate": 0.0001,
"loss": 11.8251,
"loss/crossentropy": 2.3994911506772043,
"loss/hidden": 5.03984375,
"loss/jsd": 0.0,
"loss/logits": 0.4143600896000862,
"step": 410
},
{
"epoch": 0.042,
"grad_norm": 122.0,
"grad_norm_var": 150.45729166666666,
"learning_rate": 0.0001,
"loss": 11.6797,
"loss/crossentropy": 2.428033410012722,
"loss/hidden": 4.96171875,
"loss/jsd": 0.0,
"loss/logits": 0.41778192222118377,
"step": 420
},
{
"epoch": 0.043,
"grad_norm": 119.0,
"grad_norm_var": 125.55729166666667,
"learning_rate": 0.0001,
"loss": 11.7055,
"loss/crossentropy": 2.569334480166435,
"loss/hidden": 4.9921875,
"loss/jsd": 0.0,
"loss/logits": 0.4176106728613377,
"step": 430
},
{
"epoch": 0.044,
"grad_norm": 120.0,
"grad_norm_var": 186.67395833333333,
"learning_rate": 0.0001,
"loss": 11.5608,
"loss/crossentropy": 2.5353519901633264,
"loss/hidden": 4.82578125,
"loss/jsd": 0.0,
"loss/logits": 0.4004150029271841,
"step": 440
},
{
"epoch": 0.045,
"grad_norm": 111.5,
"grad_norm_var": 157.52395833333333,
"learning_rate": 0.0001,
"loss": 11.6926,
"loss/crossentropy": 2.539342051744461,
"loss/hidden": 4.9390625,
"loss/jsd": 0.0,
"loss/logits": 0.4505396105349064,
"step": 450
},
{
"epoch": 0.046,
"grad_norm": 126.0,
"grad_norm_var": 329.32916666666665,
"learning_rate": 0.0001,
"loss": 11.3179,
"loss/crossentropy": 2.4947912380099297,
"loss/hidden": 4.70703125,
"loss/jsd": 0.0,
"loss/logits": 0.39311613626778125,
"step": 460
},
{
"epoch": 0.047,
"grad_norm": 130.0,
"grad_norm_var": 482.1958333333333,
"learning_rate": 0.0001,
"loss": 11.2995,
"loss/crossentropy": 2.522867926955223,
"loss/hidden": 4.778125,
"loss/jsd": 0.0,
"loss/logits": 0.39878650680184363,
"step": 470
},
{
"epoch": 0.048,
"grad_norm": 114.5,
"grad_norm_var": 159.2,
"learning_rate": 0.0001,
"loss": 11.1298,
"loss/crossentropy": 2.503119890391827,
"loss/hidden": 4.6859375,
"loss/jsd": 0.0,
"loss/logits": 0.4145892545580864,
"step": 480
},
{
"epoch": 0.049,
"grad_norm": 123.5,
"grad_norm_var": 2113.4625,
"learning_rate": 0.0001,
"loss": 11.0383,
"loss/crossentropy": 2.4039885073900225,
"loss/hidden": 4.658203125,
"loss/jsd": 0.0,
"loss/logits": 0.37959295585751535,
"step": 490
},
{
"epoch": 0.05,
"grad_norm": 110.0,
"grad_norm_var": 1545.5291666666667,
"learning_rate": 0.0001,
"loss": 10.9564,
"loss/crossentropy": 2.3160028889775277,
"loss/hidden": 4.78359375,
"loss/jsd": 0.0,
"loss/logits": 0.4041217315942049,
"step": 500
},
{
"epoch": 0.051,
"grad_norm": 118.5,
"grad_norm_var": 1485.1572916666667,
"learning_rate": 0.0001,
"loss": 11.0273,
"loss/crossentropy": 2.3481629095971583,
"loss/hidden": 4.76796875,
"loss/jsd": 0.0,
"loss/logits": 0.388704277202487,
"step": 510
},
{
"epoch": 0.052,
"grad_norm": 302.0,
"grad_norm_var": 4060.695833333333,
"learning_rate": 0.0001,
"loss": 10.8826,
"loss/crossentropy": 2.432570169866085,
"loss/hidden": 4.647265625,
"loss/jsd": 0.0,
"loss/logits": 0.4005543690174818,
"step": 520
},
{
"epoch": 0.053,
"grad_norm": 262.0,
"grad_norm_var": 5144.929166666667,
"learning_rate": 0.0001,
"loss": 10.9255,
"loss/crossentropy": 2.4078257739543916,
"loss/hidden": 4.51953125,
"loss/jsd": 0.0,
"loss/logits": 0.3619723778218031,
"step": 530
},
{
"epoch": 0.054,
"grad_norm": 111.5,
"grad_norm_var": 3058.195833333333,
"learning_rate": 0.0001,
"loss": 10.8513,
"loss/crossentropy": 2.1905623614788055,
"loss/hidden": 4.54921875,
"loss/jsd": 0.0,
"loss/logits": 0.3489991918206215,
"step": 540
},
{
"epoch": 0.055,
"grad_norm": 98.0,
"grad_norm_var": 2313.990625,
"learning_rate": 0.0001,
"loss": 10.8386,
"loss/crossentropy": 2.4719990983605387,
"loss/hidden": 4.63984375,
"loss/jsd": 0.0,
"loss/logits": 0.4116944268345833,
"step": 550
},
{
"epoch": 0.056,
"grad_norm": 105.0,
"grad_norm_var": 1808.315625,
"learning_rate": 0.0001,
"loss": 10.7797,
"loss/crossentropy": 2.381363682448864,
"loss/hidden": 4.58828125,
"loss/jsd": 0.0,
"loss/logits": 0.38398357704281805,
"step": 560
},
{
"epoch": 0.057,
"grad_norm": 206.0,
"grad_norm_var": 1395.3,
"learning_rate": 0.0001,
"loss": 10.6643,
"loss/crossentropy": 2.531977267563343,
"loss/hidden": 4.6171875,
"loss/jsd": 0.0,
"loss/logits": 0.37524734511971475,
"step": 570
},
{
"epoch": 0.058,
"grad_norm": 150.0,
"grad_norm_var": 1246.6333333333334,
"learning_rate": 0.0001,
"loss": 10.5081,
"loss/crossentropy": 2.391422814875841,
"loss/hidden": 4.46484375,
"loss/jsd": 0.0,
"loss/logits": 0.3587542846798897,
"step": 580
},
{
"epoch": 0.059,
"grad_norm": 110.5,
"grad_norm_var": 678.5333333333333,
"learning_rate": 0.0001,
"loss": 10.4338,
"loss/crossentropy": 2.267008524388075,
"loss/hidden": 4.356640625,
"loss/jsd": 0.0,
"loss/logits": 0.3174692545086145,
"step": 590
},
{
"epoch": 0.06,
"grad_norm": 135.0,
"grad_norm_var": 914.0489583333333,
"learning_rate": 0.0001,
"loss": 10.5236,
"loss/crossentropy": 2.3517861902713775,
"loss/hidden": 4.42265625,
"loss/jsd": 0.0,
"loss/logits": 0.3542962525039911,
"step": 600
},
{
"epoch": 0.061,
"grad_norm": 103.0,
"grad_norm_var": 904.1989583333333,
"learning_rate": 0.0001,
"loss": 10.345,
"loss/crossentropy": 2.3741147622466086,
"loss/hidden": 4.4171875,
"loss/jsd": 0.0,
"loss/logits": 0.3606201378628612,
"step": 610
},
{
"epoch": 0.062,
"grad_norm": 86.0,
"grad_norm_var": 624.25,
"learning_rate": 0.0001,
"loss": 10.4494,
"loss/crossentropy": 2.3786921083927153,
"loss/hidden": 4.291796875,
"loss/jsd": 0.0,
"loss/logits": 0.33345147483050824,
"step": 620
},
{
"epoch": 0.063,
"grad_norm": 109.0,
"grad_norm_var": 580.0333333333333,
"learning_rate": 0.0001,
"loss": 10.1494,
"loss/crossentropy": 2.3835427895188332,
"loss/hidden": 4.328515625,
"loss/jsd": 0.0,
"loss/logits": 0.33732542097568513,
"step": 630
},
{
"epoch": 0.064,
"grad_norm": 106.5,
"grad_norm_var": 407.5625,
"learning_rate": 0.0001,
"loss": 10.334,
"loss/crossentropy": 2.3970961540937425,
"loss/hidden": 4.486328125,
"loss/jsd": 0.0,
"loss/logits": 0.3739761531352997,
"step": 640
},
{
"epoch": 0.065,
"grad_norm": 127.0,
"grad_norm_var": 8.827054751968406e+17,
"learning_rate": 0.0001,
"loss": 10.3909,
"loss/crossentropy": 2.603018820285797,
"loss/hidden": 4.336328125,
"loss/jsd": 0.0,
"loss/logits": 0.35302893407642844,
"step": 650
},
{
"epoch": 0.066,
"grad_norm": 103.0,
"grad_norm_var": 8.827054748993245e+17,
"learning_rate": 0.0001,
"loss": 10.3448,
"loss/crossentropy": 2.209125077724457,
"loss/hidden": 4.298046875,
"loss/jsd": 0.0,
"loss/logits": 0.3420632269233465,
"step": 660
},
{
"epoch": 0.067,
"grad_norm": 111.5,
"grad_norm_var": 190.75,
"learning_rate": 0.0001,
"loss": 10.1599,
"loss/crossentropy": 2.1904555816203355,
"loss/hidden": 4.413671875,
"loss/jsd": 0.0,
"loss/logits": 0.3357353564351797,
"step": 670
},
{
"epoch": 0.068,
"grad_norm": 88.5,
"grad_norm_var": 215.29895833333333,
"learning_rate": 0.0001,
"loss": 9.9371,
"loss/crossentropy": 2.3618984460830688,
"loss/hidden": 4.186328125,
"loss/jsd": 0.0,
"loss/logits": 0.33678749240934847,
"step": 680
},
{
"epoch": 0.069,
"grad_norm": 95.0,
"grad_norm_var": 228.140625,
"learning_rate": 0.0001,
"loss": 10.0861,
"loss/crossentropy": 2.372377243638039,
"loss/hidden": 4.2109375,
"loss/jsd": 0.0,
"loss/logits": 0.3243491280823946,
"step": 690
},
{
"epoch": 0.07,
"grad_norm": 84.0,
"grad_norm_var": 520.1666666666666,
"learning_rate": 0.0001,
"loss": 10.2116,
"loss/crossentropy": 2.235209721326828,
"loss/hidden": 4.303515625,
"loss/jsd": 0.0,
"loss/logits": 0.34188132397830484,
"step": 700
},
{
"epoch": 0.071,
"grad_norm": 103.0,
"grad_norm_var": 553.9291666666667,
"learning_rate": 0.0001,
"loss": 9.9575,
"loss/crossentropy": 2.3372152552008627,
"loss/hidden": 4.1390625,
"loss/jsd": 0.0,
"loss/logits": 0.3105729196220636,
"step": 710
},
{
"epoch": 0.072,
"grad_norm": 107.0,
"grad_norm_var": 538.540625,
"learning_rate": 0.0001,
"loss": 9.978,
"loss/crossentropy": 2.510573136806488,
"loss/hidden": 4.14453125,
"loss/jsd": 0.0,
"loss/logits": 0.3471809647977352,
"step": 720
},
{
"epoch": 0.073,
"grad_norm": 139.0,
"grad_norm_var": 493.49583333333334,
"learning_rate": 0.0001,
"loss": 9.9677,
"loss/crossentropy": 2.3755437433719635,
"loss/hidden": 4.123828125,
"loss/jsd": 0.0,
"loss/logits": 0.3338810380548239,
"step": 730
},
{
"epoch": 0.074,
"grad_norm": 99.0,
"grad_norm_var": 286.8625,
"learning_rate": 0.0001,
"loss": 9.8714,
"loss/crossentropy": 2.3226330026984217,
"loss/hidden": 4.11015625,
"loss/jsd": 0.0,
"loss/logits": 0.32580162063241,
"step": 740
},
{
"epoch": 0.075,
"grad_norm": 85.5,
"grad_norm_var": 425.8625,
"learning_rate": 0.0001,
"loss": 9.7891,
"loss/crossentropy": 2.3768628584221005,
"loss/hidden": 4.124609375,
"loss/jsd": 0.0,
"loss/logits": 0.31062583327293397,
"step": 750
},
{
"epoch": 0.076,
"grad_norm": 120.0,
"grad_norm_var": 373.765625,
"learning_rate": 0.0001,
"loss": 9.8455,
"loss/crossentropy": 2.4248126417398455,
"loss/hidden": 4.2359375,
"loss/jsd": 0.0,
"loss/logits": 0.3379279874265194,
"step": 760
},
{
"epoch": 0.077,
"grad_norm": 115.5,
"grad_norm_var": 366.765625,
"learning_rate": 0.0001,
"loss": 9.7894,
"loss/crossentropy": 2.2128719061613085,
"loss/hidden": 4.18515625,
"loss/jsd": 0.0,
"loss/logits": 0.3335044614970684,
"step": 770
},
{
"epoch": 0.078,
"grad_norm": 82.0,
"grad_norm_var": 207.05,
"learning_rate": 0.0001,
"loss": 9.7323,
"loss/crossentropy": 2.321111184358597,
"loss/hidden": 4.112109375,
"loss/jsd": 0.0,
"loss/logits": 0.30921670254319905,
"step": 780
},
{
"epoch": 0.079,
"grad_norm": 90.0,
"grad_norm_var": 321.65729166666665,
"learning_rate": 0.0001,
"loss": 9.7419,
"loss/crossentropy": 2.3887290723621843,
"loss/hidden": 4.17421875,
"loss/jsd": 0.0,
"loss/logits": 0.34963752441108226,
"step": 790
},
{
"epoch": 0.08,
"grad_norm": 90.0,
"grad_norm_var": 1653.9958333333334,
"learning_rate": 0.0001,
"loss": 9.6443,
"loss/crossentropy": 2.34355805516243,
"loss/hidden": 4.119140625,
"loss/jsd": 0.0,
"loss/logits": 0.3216205321252346,
"step": 800
},
{
"epoch": 0.081,
"grad_norm": 111.0,
"grad_norm_var": 1760.865625,
"learning_rate": 0.0001,
"loss": 9.7151,
"loss/crossentropy": 2.26568204164505,
"loss/hidden": 4.0734375,
"loss/jsd": 0.0,
"loss/logits": 0.3119744971394539,
"step": 810
},
{
"epoch": 0.082,
"grad_norm": 100.0,
"grad_norm_var": 365.0,
"learning_rate": 0.0001,
"loss": 9.6335,
"loss/crossentropy": 2.363439542800188,
"loss/hidden": 4.0421875,
"loss/jsd": 0.0,
"loss/logits": 0.3196489207446575,
"step": 820
},
{
"epoch": 0.083,
"grad_norm": 105.0,
"grad_norm_var": 725.840625,
"learning_rate": 0.0001,
"loss": 9.5683,
"loss/crossentropy": 2.25376470759511,
"loss/hidden": 4.040625,
"loss/jsd": 0.0,
"loss/logits": 0.3137321826070547,
"step": 830
},
{
"epoch": 0.084,
"grad_norm": 91.0,
"grad_norm_var": 243.115625,
"learning_rate": 0.0001,
"loss": 9.6059,
"loss/crossentropy": 2.402809253334999,
"loss/hidden": 4.08359375,
"loss/jsd": 0.0,
"loss/logits": 0.3079391553997993,
"step": 840
},
{
"epoch": 0.085,
"grad_norm": 115.5,
"grad_norm_var": 52.3625,
"learning_rate": 0.0001,
"loss": 9.4809,
"loss/crossentropy": 2.3521162420511246,
"loss/hidden": 3.929296875,
"loss/jsd": 0.0,
"loss/logits": 0.3063440557569265,
"step": 850
},
{
"epoch": 0.086,
"grad_norm": 91.0,
"grad_norm_var": 109.71666666666667,
"learning_rate": 0.0001,
"loss": 9.6562,
"loss/crossentropy": 2.443948082625866,
"loss/hidden": 4.025390625,
"loss/jsd": 0.0,
"loss/logits": 0.33005591817200186,
"step": 860
},
{
"epoch": 0.087,
"grad_norm": 99.0,
"grad_norm_var": 8.906043697083199e+17,
"learning_rate": 0.0001,
"loss": 9.6756,
"loss/crossentropy": 2.2569786101579665,
"loss/hidden": 4.169140625,
"loss/jsd": 0.0,
"loss/logits": 0.32912670746445655,
"step": 870
},
{
"epoch": 0.088,
"grad_norm": 87.5,
"grad_norm_var": 8.90604369488119e+17,
"learning_rate": 0.0001,
"loss": 9.6822,
"loss/crossentropy": 2.542811484634876,
"loss/hidden": 3.961328125,
"loss/jsd": 0.0,
"loss/logits": 0.3259673956781626,
"step": 880
},
{
"epoch": 0.089,
"grad_norm": 117.0,
"grad_norm_var": 227.42395833333333,
"learning_rate": 0.0001,
"loss": 9.44,
"loss/crossentropy": 2.3939336955547335,
"loss/hidden": 3.878515625,
"loss/jsd": 0.0,
"loss/logits": 0.29817260801792145,
"step": 890
},
{
"epoch": 0.09,
"grad_norm": 79.0,
"grad_norm_var": 200.97395833333334,
"learning_rate": 0.0001,
"loss": 9.3573,
"loss/crossentropy": 2.496935114264488,
"loss/hidden": 4.0015625,
"loss/jsd": 0.0,
"loss/logits": 0.3248747974634171,
"step": 900
},
{
"epoch": 0.091,
"grad_norm": 97.5,
"grad_norm_var": 517.7,
"learning_rate": 0.0001,
"loss": 9.4559,
"loss/crossentropy": 2.245865948498249,
"loss/hidden": 3.951953125,
"loss/jsd": 0.0,
"loss/logits": 0.30880712568759916,
"step": 910
},
{
"epoch": 0.092,
"grad_norm": 93.0,
"grad_norm_var": 475.07395833333334,
"learning_rate": 0.0001,
"loss": 9.3572,
"loss/crossentropy": 2.3004986569285393,
"loss/hidden": 3.912890625,
"loss/jsd": 0.0,
"loss/logits": 0.2959143763408065,
"step": 920
},
{
"epoch": 0.093,
"grad_norm": 94.5,
"grad_norm_var": 139.9,
"learning_rate": 0.0001,
"loss": 9.461,
"loss/crossentropy": 2.360969065129757,
"loss/hidden": 3.9828125,
"loss/jsd": 0.0,
"loss/logits": 0.3106645856052637,
"step": 930
},
{
"epoch": 0.094,
"grad_norm": 102.5,
"grad_norm_var": 82.290625,
"learning_rate": 0.0001,
"loss": 9.3725,
"loss/crossentropy": 2.442077124118805,
"loss/hidden": 3.887109375,
"loss/jsd": 0.0,
"loss/logits": 0.30320504680275917,
"step": 940
},
{
"epoch": 0.095,
"grad_norm": 81.0,
"grad_norm_var": 283.8989583333333,
"learning_rate": 0.0001,
"loss": 9.215,
"loss/crossentropy": 2.2990706115961075,
"loss/hidden": 3.908203125,
"loss/jsd": 0.0,
"loss/logits": 0.2945917289704084,
"step": 950
},
{
"epoch": 0.096,
"grad_norm": 85.5,
"grad_norm_var": 935.5291666666667,
"learning_rate": 0.0001,
"loss": 9.3148,
"loss/crossentropy": 2.405318558216095,
"loss/hidden": 3.8734375,
"loss/jsd": 0.0,
"loss/logits": 0.29377752766013143,
"step": 960
},
{
"epoch": 0.097,
"grad_norm": 90.5,
"grad_norm_var": 745.3291666666667,
"learning_rate": 0.0001,
"loss": 9.2675,
"loss/crossentropy": 2.313190388679504,
"loss/hidden": 3.908203125,
"loss/jsd": 0.0,
"loss/logits": 0.3074024930596352,
"step": 970
},
{
"epoch": 0.098,
"grad_norm": 91.5,
"grad_norm_var": 74.38333333333334,
"learning_rate": 0.0001,
"loss": 9.3473,
"loss/crossentropy": 2.4643412232398987,
"loss/hidden": 3.9109375,
"loss/jsd": 0.0,
"loss/logits": 0.31328765451908114,
"step": 980
},
{
"epoch": 0.099,
"grad_norm": 83.0,
"grad_norm_var": 77.24895833333333,
"learning_rate": 0.0001,
"loss": 9.1591,
"loss/crossentropy": 2.3321994699537756,
"loss/hidden": 3.794140625,
"loss/jsd": 0.0,
"loss/logits": 0.2842238027602434,
"step": 990
},
{
"epoch": 0.1,
"grad_norm": 2919235584.0,
"grad_norm_var": 5.3262099304352365e+17,
"learning_rate": 0.0001,
"loss": 9.2499,
"loss/crossentropy": 2.24974425137043,
"loss/hidden": 3.69921875,
"loss/jsd": 0.0,
"loss/logits": 0.2656703107059002,
"step": 1000
},
{
"epoch": 0.101,
"grad_norm": 83.0,
"grad_norm_var": 5.3262099137712704e+17,
"learning_rate": 0.0001,
"loss": 9.1036,
"loss/crossentropy": 2.248470115661621,
"loss/hidden": 3.834375,
"loss/jsd": 0.0,
"loss/logits": 0.28389163631945846,
"step": 1010
},
{
"epoch": 0.102,
"grad_norm": 99.5,
"grad_norm_var": 260.3958333333333,
"learning_rate": 0.0001,
"loss": 9.1529,
"loss/crossentropy": 2.177551028132439,
"loss/hidden": 3.85625,
"loss/jsd": 0.0,
"loss/logits": 0.2901096811518073,
"step": 1020
},
{
"epoch": 0.103,
"grad_norm": 107.0,
"grad_norm_var": 126.18333333333334,
"learning_rate": 0.0001,
"loss": 9.2276,
"loss/crossentropy": 2.4588360369205473,
"loss/hidden": 3.81953125,
"loss/jsd": 0.0,
"loss/logits": 0.3054195210337639,
"step": 1030
},
{
"epoch": 0.104,
"grad_norm": 92.5,
"grad_norm_var": 773.6822916666666,
"learning_rate": 0.0001,
"loss": 9.2522,
"loss/crossentropy": 2.36704108864069,
"loss/hidden": 3.98984375,
"loss/jsd": 0.0,
"loss/logits": 0.32540309652686117,
"step": 1040
},
{
"epoch": 0.105,
"grad_norm": 94.0,
"grad_norm_var": 747.9958333333333,
"learning_rate": 0.0001,
"loss": 9.1546,
"loss/crossentropy": 2.2803470581769942,
"loss/hidden": 3.805078125,
"loss/jsd": 0.0,
"loss/logits": 0.3206649195402861,
"step": 1050
},
{
"epoch": 0.106,
"grad_norm": 74.0,
"grad_norm_var": 118.565625,
"learning_rate": 0.0001,
"loss": 9.1738,
"loss/crossentropy": 2.468463772535324,
"loss/hidden": 3.775390625,
"loss/jsd": 0.0,
"loss/logits": 0.3029760651290417,
"step": 1060
},
{
"epoch": 0.107,
"grad_norm": 76.0,
"grad_norm_var": 112.42395833333333,
"learning_rate": 0.0001,
"loss": 9.0442,
"loss/crossentropy": 2.3093275628983974,
"loss/hidden": 3.8203125,
"loss/jsd": 0.0,
"loss/logits": 0.30387087166309357,
"step": 1070
},
{
"epoch": 0.108,
"grad_norm": 92.5,
"grad_norm_var": 47.71666666666667,
"learning_rate": 0.0001,
"loss": 9.0691,
"loss/crossentropy": 2.3587117075920103,
"loss/hidden": 3.7921875,
"loss/jsd": 0.0,
"loss/logits": 0.2959397092461586,
"step": 1080
},
{
"epoch": 0.109,
"grad_norm": 82.0,
"grad_norm_var": 82.57395833333334,
"learning_rate": 0.0001,
"loss": 9.0681,
"loss/crossentropy": 2.3668819189071657,
"loss/hidden": 3.873828125,
"loss/jsd": 0.0,
"loss/logits": 0.30734706819057467,
"step": 1090
},
{
"epoch": 0.11,
"grad_norm": 98.0,
"grad_norm_var": 130.02916666666667,
"learning_rate": 0.0001,
"loss": 9.1895,
"loss/crossentropy": 2.4498503282666206,
"loss/hidden": 3.838671875,
"loss/jsd": 0.0,
"loss/logits": 0.30784521605819465,
"step": 1100
},
{
"epoch": 0.111,
"grad_norm": 88.5,
"grad_norm_var": 92.05729166666667,
"learning_rate": 0.0001,
"loss": 9.2165,
"loss/crossentropy": 2.37082399725914,
"loss/hidden": 3.859375,
"loss/jsd": 0.0,
"loss/logits": 0.29330057725310327,
"step": 1110
},
{
"epoch": 0.112,
"grad_norm": 89.5,
"grad_norm_var": 160.39895833333333,
"learning_rate": 0.0001,
"loss": 9.0963,
"loss/crossentropy": 2.245619586110115,
"loss/hidden": 3.839453125,
"loss/jsd": 0.0,
"loss/logits": 0.3090781785547733,
"step": 1120
},
{
"epoch": 0.113,
"grad_norm": 89.0,
"grad_norm_var": 153.57395833333334,
"learning_rate": 0.0001,
"loss": 9.1747,
"loss/crossentropy": 2.254079730808735,
"loss/hidden": 3.863671875,
"loss/jsd": 0.0,
"loss/logits": 0.29664100557565687,
"step": 1130
},
{
"epoch": 0.114,
"grad_norm": 85.5,
"grad_norm_var": 177.3625,
"learning_rate": 0.0001,
"loss": 8.9365,
"loss/crossentropy": 2.3813750982284545,
"loss/hidden": 3.841796875,
"loss/jsd": 0.0,
"loss/logits": 0.2955601759254932,
"step": 1140
},
{
"epoch": 0.115,
"grad_norm": 97.0,
"grad_norm_var": 177.75,
"learning_rate": 0.0001,
"loss": 9.0288,
"loss/crossentropy": 2.317107746005058,
"loss/hidden": 3.7171875,
"loss/jsd": 0.0,
"loss/logits": 0.2740287099033594,
"step": 1150
},
{
"epoch": 0.116,
"grad_norm": 84.0,
"grad_norm_var": 192.15,
"learning_rate": 0.0001,
"loss": 8.9149,
"loss/crossentropy": 2.2348272860050202,
"loss/hidden": 3.748828125,
"loss/jsd": 0.0,
"loss/logits": 0.26385229676961897,
"step": 1160
},
{
"epoch": 0.117,
"grad_norm": 78.0,
"grad_norm_var": 139.8625,
"learning_rate": 0.0001,
"loss": 8.9416,
"loss/crossentropy": 2.186076807975769,
"loss/hidden": 3.68046875,
"loss/jsd": 0.0,
"loss/logits": 0.2610600605607033,
"step": 1170
},
{
"epoch": 0.118,
"grad_norm": 80.5,
"grad_norm_var": 175.85,
"learning_rate": 0.0001,
"loss": 8.9542,
"loss/crossentropy": 2.258153685927391,
"loss/hidden": 3.740234375,
"loss/jsd": 0.0,
"loss/logits": 0.27120565343648195,
"step": 1180
},
{
"epoch": 0.119,
"grad_norm": 79.0,
"grad_norm_var": 164.89583333333334,
"learning_rate": 0.0001,
"loss": 8.8167,
"loss/crossentropy": 2.4536369144916534,
"loss/hidden": 3.7125,
"loss/jsd": 0.0,
"loss/logits": 0.28769057895988226,
"step": 1190
},
{
"epoch": 0.12,
"grad_norm": 63.0,
"grad_norm_var": 103.565625,
"learning_rate": 0.0001,
"loss": 8.7058,
"loss/crossentropy": 2.2031524434685705,
"loss/hidden": 3.709375,
"loss/jsd": 0.0,
"loss/logits": 0.2841499318368733,
"step": 1200
},
{
"epoch": 0.121,
"grad_norm": 74.0,
"grad_norm_var": 117.23229166666667,
"learning_rate": 0.0001,
"loss": 8.8823,
"loss/crossentropy": 2.2541019685566424,
"loss/hidden": 3.725,
"loss/jsd": 0.0,
"loss/logits": 0.2822803447023034,
"step": 1210
},
{
"epoch": 0.122,
"grad_norm": 75.5,
"grad_norm_var": 163.3625,
"learning_rate": 0.0001,
"loss": 8.7654,
"loss/crossentropy": 2.4589641630649566,
"loss/hidden": 3.77265625,
"loss/jsd": 0.0,
"loss/logits": 0.28896796628832816,
"step": 1220
},
{
"epoch": 0.123,
"grad_norm": 83.0,
"grad_norm_var": 68.25,
"learning_rate": 0.0001,
"loss": 8.9438,
"loss/crossentropy": 2.2707848742604257,
"loss/hidden": 3.685546875,
"loss/jsd": 0.0,
"loss/logits": 0.2688711144030094,
"step": 1230
},
{
"epoch": 0.124,
"grad_norm": 97.5,
"grad_norm_var": 75.89895833333334,
"learning_rate": 0.0001,
"loss": 8.8432,
"loss/crossentropy": 2.5097223311662673,
"loss/hidden": 3.656640625,
"loss/jsd": 0.0,
"loss/logits": 0.29047914147377013,
"step": 1240
},
{
"epoch": 0.125,
"grad_norm": 89.0,
"grad_norm_var": 1450.2989583333333,
"learning_rate": 0.0001,
"loss": 8.8377,
"loss/crossentropy": 2.3170286387205126,
"loss/hidden": 3.7,
"loss/jsd": 0.0,
"loss/logits": 0.2755675740540028,
"step": 1250
},
{
"epoch": 0.126,
"grad_norm": 65.0,
"grad_norm_var": 1693.0291666666667,
"learning_rate": 0.0001,
"loss": 8.6604,
"loss/crossentropy": 2.1438958957791328,
"loss/hidden": 3.639453125,
"loss/jsd": 0.0,
"loss/logits": 0.2576067052781582,
"step": 1260
},
{
"epoch": 0.127,
"grad_norm": 74.0,
"grad_norm_var": 126.190625,
"learning_rate": 0.0001,
"loss": 8.8333,
"loss/crossentropy": 2.3025652706623077,
"loss/hidden": 3.691015625,
"loss/jsd": 0.0,
"loss/logits": 0.2775576956570148,
"step": 1270
},
{
"epoch": 0.128,
"grad_norm": 71.5,
"grad_norm_var": 87.23229166666667,
"learning_rate": 0.0001,
"loss": 8.7094,
"loss/crossentropy": 2.13181097432971,
"loss/hidden": 3.702734375,
"loss/jsd": 0.0,
"loss/logits": 0.27296230792999265,
"step": 1280
},
{
"epoch": 0.129,
"grad_norm": 113.0,
"grad_norm_var": 149.48229166666667,
"learning_rate": 0.0001,
"loss": 8.6782,
"loss/crossentropy": 2.1315632432699205,
"loss/hidden": 3.625390625,
"loss/jsd": 0.0,
"loss/logits": 0.2651492517441511,
"step": 1290
},
{
"epoch": 0.13,
"grad_norm": 85.0,
"grad_norm_var": 111.440625,
"learning_rate": 0.0001,
"loss": 8.742,
"loss/crossentropy": 2.339846658706665,
"loss/hidden": 3.623828125,
"loss/jsd": 0.0,
"loss/logits": 0.2743611980229616,
"step": 1300
},
{
"epoch": 0.131,
"grad_norm": 86.0,
"grad_norm_var": 122.965625,
"learning_rate": 0.0001,
"loss": 8.6397,
"loss/crossentropy": 2.2031438082456587,
"loss/hidden": 3.5578125,
"loss/jsd": 0.0,
"loss/logits": 0.2621523380279541,
"step": 1310
},
{
"epoch": 0.132,
"grad_norm": 71.5,
"grad_norm_var": 132.10729166666667,
"learning_rate": 0.0001,
"loss": 8.7931,
"loss/crossentropy": 2.465841978788376,
"loss/hidden": 3.657421875,
"loss/jsd": 0.0,
"loss/logits": 0.29582356065511706,
"step": 1320
},
{
"epoch": 0.133,
"grad_norm": 98.5,
"grad_norm_var": 136.34973958333333,
"learning_rate": 0.0001,
"loss": 8.7755,
"loss/crossentropy": 2.3093322798609734,
"loss/hidden": 3.675390625,
"loss/jsd": 0.0,
"loss/logits": 0.28201375566422937,
"step": 1330
},
{
"epoch": 0.134,
"grad_norm": 87.0,
"grad_norm_var": 45.1625,
"learning_rate": 0.0001,
"loss": 8.8767,
"loss/crossentropy": 2.3267322540283204,
"loss/hidden": 3.687109375,
"loss/jsd": 0.0,
"loss/logits": 0.27597835548222066,
"step": 1340
},
{
"epoch": 0.135,
"grad_norm": 78.5,
"grad_norm_var": 52.19583333333333,
"learning_rate": 0.0001,
"loss": 8.7636,
"loss/crossentropy": 2.250748935341835,
"loss/hidden": 3.722265625,
"loss/jsd": 0.0,
"loss/logits": 0.275000686571002,
"step": 1350
},
{
"epoch": 0.136,
"grad_norm": 74.0,
"grad_norm_var": 77.68229166666667,
"learning_rate": 0.0001,
"loss": 8.8309,
"loss/crossentropy": 2.294243222475052,
"loss/hidden": 3.781640625,
"loss/jsd": 0.0,
"loss/logits": 0.29517283104360104,
"step": 1360
},
{
"epoch": 0.137,
"grad_norm": 73.0,
"grad_norm_var": 70.565625,
"learning_rate": 0.0001,
"loss": 8.6486,
"loss/crossentropy": 2.4063815265893935,
"loss/hidden": 3.561328125,
"loss/jsd": 0.0,
"loss/logits": 0.27084620147943494,
"step": 1370
},
{
"epoch": 0.138,
"grad_norm": 72.0,
"grad_norm_var": 160.365625,
"learning_rate": 0.0001,
"loss": 8.6319,
"loss/crossentropy": 2.0357704624533652,
"loss/hidden": 3.55390625,
"loss/jsd": 0.0,
"loss/logits": 0.24529488924890758,
"step": 1380
},
{
"epoch": 0.139,
"grad_norm": 92.0,
"grad_norm_var": 159.2625,
"learning_rate": 0.0001,
"loss": 8.6773,
"loss/crossentropy": 2.207934172451496,
"loss/hidden": 3.626953125,
"loss/jsd": 0.0,
"loss/logits": 0.26206000819802283,
"step": 1390
},
{
"epoch": 0.14,
"grad_norm": 92.0,
"grad_norm_var": 75.85,
"learning_rate": 0.0001,
"loss": 8.6142,
"loss/crossentropy": 2.2258728444576263,
"loss/hidden": 3.694140625,
"loss/jsd": 0.0,
"loss/logits": 0.2842423222959042,
"step": 1400
},
{
"epoch": 0.141,
"grad_norm": 75.5,
"grad_norm_var": 69.590625,
"learning_rate": 0.0001,
"loss": 8.7049,
"loss/crossentropy": 2.405027574300766,
"loss/hidden": 3.594140625,
"loss/jsd": 0.0,
"loss/logits": 0.2595718756318092,
"step": 1410
},
{
"epoch": 0.142,
"grad_norm": 175.0,
"grad_norm_var": 622.85,
"learning_rate": 0.0001,
"loss": 8.5144,
"loss/crossentropy": 2.3508727669715883,
"loss/hidden": 3.6578125,
"loss/jsd": 0.0,
"loss/logits": 0.2513396417722106,
"step": 1420
},
{
"epoch": 0.143,
"grad_norm": 144.0,
"grad_norm_var": 827.8739583333333,
"learning_rate": 0.0001,
"loss": 8.64,
"loss/crossentropy": 2.158524568378925,
"loss/hidden": 3.666015625,
"loss/jsd": 0.0,
"loss/logits": 0.25669998563826085,
"step": 1430
},
{
"epoch": 0.144,
"grad_norm": 90.5,
"grad_norm_var": 339.35729166666664,
"learning_rate": 0.0001,
"loss": 8.5076,
"loss/crossentropy": 2.1952589228749275,
"loss/hidden": 3.490625,
"loss/jsd": 0.0,
"loss/logits": 0.246895507350564,
"step": 1440
},
{
"epoch": 0.145,
"grad_norm": 65.5,
"grad_norm_var": 314.6333333333333,
"learning_rate": 0.0001,
"loss": 8.6159,
"loss/crossentropy": 2.3050056755542756,
"loss/hidden": 3.553515625,
"loss/jsd": 0.0,
"loss/logits": 0.2641986530274153,
"step": 1450
},
{
"epoch": 0.146,
"grad_norm": 76.5,
"grad_norm_var": 426.1166666666667,
"learning_rate": 0.0001,
"loss": 8.527,
"loss/crossentropy": 2.281977441906929,
"loss/hidden": 3.49296875,
"loss/jsd": 0.0,
"loss/logits": 0.2622336186468601,
"step": 1460
},
{
"epoch": 0.147,
"grad_norm": 74.0,
"grad_norm_var": 278.69348958333336,
"learning_rate": 0.0001,
"loss": 8.6149,
"loss/crossentropy": 2.303273032605648,
"loss/hidden": 3.5671875,
"loss/jsd": 0.0,
"loss/logits": 0.2778003554791212,
"step": 1470
},
{
"epoch": 0.148,
"grad_norm": 102.0,
"grad_norm_var": 134.70729166666666,
"learning_rate": 0.0001,
"loss": 8.4927,
"loss/crossentropy": 2.40097414329648,
"loss/hidden": 3.536328125,
"loss/jsd": 0.0,
"loss/logits": 0.27044865442439914,
"step": 1480
},
{
"epoch": 0.149,
"grad_norm": 72.0,
"grad_norm_var": 87.8,
"learning_rate": 0.0001,
"loss": 8.4056,
"loss/crossentropy": 2.186897784471512,
"loss/hidden": 3.532421875,
"loss/jsd": 0.0,
"loss/logits": 0.24866797383874656,
"step": 1490
},
{
"epoch": 0.15,
"grad_norm": 75.5,
"grad_norm_var": 133.09583333333333,
"learning_rate": 0.0001,
"loss": 8.5426,
"loss/crossentropy": 2.311472164094448,
"loss/hidden": 3.53359375,
"loss/jsd": 0.0,
"loss/logits": 0.25585599690675737,
"step": 1500
},
{
"epoch": 0.151,
"grad_norm": 136.0,
"grad_norm_var": 258.7625,
"learning_rate": 0.0001,
"loss": 8.3875,
"loss/crossentropy": 2.2983651250600814,
"loss/hidden": 3.562890625,
"loss/jsd": 0.0,
"loss/logits": 0.2763795707374811,
"step": 1510
},
{
"epoch": 0.152,
"grad_norm": 94.5,
"grad_norm_var": 292.75598958333336,
"learning_rate": 0.0001,
"loss": 8.5971,
"loss/crossentropy": 2.3549255669116973,
"loss/hidden": 3.55703125,
"loss/jsd": 0.0,
"loss/logits": 0.268990096822381,
"step": 1520
},
{
"epoch": 0.153,
"grad_norm": 83.0,
"grad_norm_var": 1.4189153071319926e+18,
"learning_rate": 0.0001,
"loss": 8.7383,
"loss/crossentropy": 2.267159214615822,
"loss/hidden": 3.5671875,
"loss/jsd": 0.0,
"loss/logits": 0.27373309470713136,
"step": 1530
},
{
"epoch": 0.154,
"grad_norm": 77.5,
"grad_norm_var": 63.916666666666664,
"learning_rate": 0.0001,
"loss": 8.5718,
"loss/crossentropy": 2.259125065803528,
"loss/hidden": 3.67265625,
"loss/jsd": 0.0,
"loss/logits": 0.28385352455079554,
"step": 1540
},
{
"epoch": 0.155,
"grad_norm": 73.5,
"grad_norm_var": 39.78723958333333,
"learning_rate": 0.0001,
"loss": 8.4993,
"loss/crossentropy": 2.3606351226568223,
"loss/hidden": 3.558984375,
"loss/jsd": 0.0,
"loss/logits": 0.272869897633791,
"step": 1550
},
{
"epoch": 0.156,
"grad_norm": 71.5,
"grad_norm_var": 247.2625,
"learning_rate": 0.0001,
"loss": 8.6188,
"loss/crossentropy": 2.394289918243885,
"loss/hidden": 3.519921875,
"loss/jsd": 0.0,
"loss/logits": 0.269069866463542,
"step": 1560
},
{
"epoch": 0.157,
"grad_norm": 71.0,
"grad_norm_var": 265.7,
"learning_rate": 0.0001,
"loss": 8.4936,
"loss/crossentropy": 2.2599784307181836,
"loss/hidden": 3.533203125,
"loss/jsd": 0.0,
"loss/logits": 0.26600994151085616,
"step": 1570
},
{
"epoch": 0.158,
"grad_norm": 83.0,
"grad_norm_var": 42.88333333333333,
"learning_rate": 0.0001,
"loss": 8.5015,
"loss/crossentropy": 2.3098704159259795,
"loss/hidden": 3.628515625,
"loss/jsd": 0.0,
"loss/logits": 0.285567194968462,
"step": 1580
},
{
"epoch": 0.159,
"grad_norm": 67.0,
"grad_norm_var": 111.665625,
"learning_rate": 0.0001,
"loss": 8.4128,
"loss/crossentropy": 2.1794722147285936,
"loss/hidden": 3.526953125,
"loss/jsd": 0.0,
"loss/logits": 0.2647275095805526,
"step": 1590
},
{
"epoch": 0.16,
"grad_norm": 91.0,
"grad_norm_var": 149.97890625,
"learning_rate": 0.0001,
"loss": 8.4745,
"loss/crossentropy": 2.2243838563561438,
"loss/hidden": 3.550390625,
"loss/jsd": 0.0,
"loss/logits": 0.2563688028603792,
"step": 1600
},
{
"epoch": 0.161,
"grad_norm": 90.0,
"grad_norm_var": 157.87395833333332,
"learning_rate": 0.0001,
"loss": 8.4168,
"loss/crossentropy": 2.3965038657188416,
"loss/hidden": 3.52265625,
"loss/jsd": 0.0,
"loss/logits": 0.27364722844213246,
"step": 1610
},
{
"epoch": 0.162,
"grad_norm": 96.0,
"grad_norm_var": 380.89348958333335,
"learning_rate": 0.0001,
"loss": 8.6256,
"loss/crossentropy": 2.519009140133858,
"loss/hidden": 3.536328125,
"loss/jsd": 0.0,
"loss/logits": 0.29145103991031646,
"step": 1620
},
{
"epoch": 0.163,
"grad_norm": 80.0,
"grad_norm_var": 331.05,
"learning_rate": 0.0001,
"loss": 8.2011,
"loss/crossentropy": 2.1994084089994432,
"loss/hidden": 3.530078125,
"loss/jsd": 0.0,
"loss/logits": 0.2542119387537241,
"step": 1630
},
{
"epoch": 0.164,
"grad_norm": 72.0,
"grad_norm_var": 41.19583333333333,
"learning_rate": 0.0001,
"loss": 8.3636,
"loss/crossentropy": 2.4333469703793527,
"loss/hidden": 3.4828125,
"loss/jsd": 0.0,
"loss/logits": 0.25861090533435344,
"step": 1640
},
{
"epoch": 0.165,
"grad_norm": 79.0,
"grad_norm_var": 226.29583333333332,
"learning_rate": 0.0001,
"loss": 8.5285,
"loss/crossentropy": 2.468096488714218,
"loss/hidden": 3.478515625,
"loss/jsd": 0.0,
"loss/logits": 0.26285996809601786,
"step": 1650
},
{
"epoch": 0.166,
"grad_norm": 84.5,
"grad_norm_var": 218.12916666666666,
"learning_rate": 0.0001,
"loss": 8.4346,
"loss/crossentropy": 2.2107077345252035,
"loss/hidden": 3.591015625,
"loss/jsd": 0.0,
"loss/logits": 0.2654247496277094,
"step": 1660
},
{
"epoch": 0.167,
"grad_norm": 68.0,
"grad_norm_var": 47.329166666666666,
"learning_rate": 0.0001,
"loss": 8.4021,
"loss/crossentropy": 2.188153588026762,
"loss/hidden": 3.481640625,
"loss/jsd": 0.0,
"loss/logits": 0.24756914153695106,
"step": 1670
},
{
"epoch": 0.168,
"grad_norm": 68.0,
"grad_norm_var": 232.240625,
"learning_rate": 0.0001,
"loss": 8.4491,
"loss/crossentropy": 2.3357387453317644,
"loss/hidden": 3.552734375,
"loss/jsd": 0.0,
"loss/logits": 0.28453084602952006,
"step": 1680
},
{
"epoch": 0.169,
"grad_norm": 63.5,
"grad_norm_var": 179.80729166666666,
"learning_rate": 0.0001,
"loss": 8.4439,
"loss/crossentropy": 2.3677712947130205,
"loss/hidden": 3.6484375,
"loss/jsd": 0.0,
"loss/logits": 0.2788976304233074,
"step": 1690
},
{
"epoch": 0.17,
"grad_norm": 119.5,
"grad_norm_var": 398.665625,
"learning_rate": 0.0001,
"loss": 8.3827,
"loss/crossentropy": 2.4275426417589188,
"loss/hidden": 3.425390625,
"loss/jsd": 0.0,
"loss/logits": 0.26100732628256085,
"step": 1700
},
{
"epoch": 0.171,
"grad_norm": 66.5,
"grad_norm_var": 209.37395833333332,
"learning_rate": 0.0001,
"loss": 8.3197,
"loss/crossentropy": 2.237619758397341,
"loss/hidden": 3.508203125,
"loss/jsd": 0.0,
"loss/logits": 0.2523366323672235,
"step": 1710
},
{
"epoch": 0.172,
"grad_norm": 171.0,
"grad_norm_var": 636.4833333333333,
"learning_rate": 0.0001,
"loss": 8.2648,
"loss/crossentropy": 2.169030448794365,
"loss/hidden": 3.505078125,
"loss/jsd": 0.0,
"loss/logits": 0.24771953662857413,
"step": 1720
},
{
"epoch": 0.173,
"grad_norm": 68.0,
"grad_norm_var": 861.1247395833333,
"learning_rate": 0.0001,
"loss": 8.2948,
"loss/crossentropy": 2.197067990899086,
"loss/hidden": 3.424609375,
"loss/jsd": 0.0,
"loss/logits": 0.2451560577377677,
"step": 1730
},
{
"epoch": 0.174,
"grad_norm": 63.5,
"grad_norm_var": 524.7833333333333,
"learning_rate": 0.0001,
"loss": 8.2316,
"loss/crossentropy": 2.2412655726075172,
"loss/hidden": 3.498046875,
"loss/jsd": 0.0,
"loss/logits": 0.26945888753980396,
"step": 1740
},
{
"epoch": 0.175,
"grad_norm": 90.5,
"grad_norm_var": 496.12395833333335,
"learning_rate": 0.0001,
"loss": 8.3094,
"loss/crossentropy": 2.314925655722618,
"loss/hidden": 3.59375,
"loss/jsd": 0.0,
"loss/logits": 0.27251414209604263,
"step": 1750
},
{
"epoch": 0.176,
"grad_norm": 70.0,
"grad_norm_var": 484.890625,
"learning_rate": 0.0001,
"loss": 8.3807,
"loss/crossentropy": 2.3074424833059313,
"loss/hidden": 3.4640625,
"loss/jsd": 0.0,
"loss/logits": 0.2574224047362804,
"step": 1760
},
{
"epoch": 0.177,
"grad_norm": 69.0,
"grad_norm_var": 88.83932291666666,
"learning_rate": 0.0001,
"loss": 8.3403,
"loss/crossentropy": 2.2954701989889146,
"loss/hidden": 3.46484375,
"loss/jsd": 0.0,
"loss/logits": 0.25794004313647745,
"step": 1770
},
{
"epoch": 0.178,
"grad_norm": 71.5,
"grad_norm_var": 86.65598958333334,
"learning_rate": 0.0001,
"loss": 8.1745,
"loss/crossentropy": 2.2755073979496956,
"loss/hidden": 3.521875,
"loss/jsd": 0.0,
"loss/logits": 0.26081139910966156,
"step": 1780
},
{
"epoch": 0.179,
"grad_norm": 73.5,
"grad_norm_var": 46.40390625,
"learning_rate": 0.0001,
"loss": 8.2619,
"loss/crossentropy": 2.2126931130886076,
"loss/hidden": 3.4921875,
"loss/jsd": 0.0,
"loss/logits": 0.27995246797800066,
"step": 1790
},
{
"epoch": 0.18,
"grad_norm": 70.0,
"grad_norm_var": 44.215625,
"learning_rate": 0.0001,
"loss": 8.3462,
"loss/crossentropy": 2.3120188415050507,
"loss/hidden": 3.482421875,
"loss/jsd": 0.0,
"loss/logits": 0.25568581037223337,
"step": 1800
},
{
"epoch": 0.181,
"grad_norm": 79.0,
"grad_norm_var": 250.965625,
"learning_rate": 0.0001,
"loss": 8.3991,
"loss/crossentropy": 2.2807445406913756,
"loss/hidden": 3.4328125,
"loss/jsd": 0.0,
"loss/logits": 0.2566069485619664,
"step": 1810
},
{
"epoch": 0.182,
"grad_norm": 72.5,
"grad_norm_var": 287.98333333333335,
"learning_rate": 0.0001,
"loss": 8.2019,
"loss/crossentropy": 2.3523808985948564,
"loss/hidden": 3.369921875,
"loss/jsd": 0.0,
"loss/logits": 0.2528150577098131,
"step": 1820
},
{
"epoch": 0.183,
"grad_norm": 85.0,
"grad_norm_var": 37.733072916666664,
"learning_rate": 0.0001,
"loss": 8.1958,
"loss/crossentropy": 2.0805646784603598,
"loss/hidden": 3.38359375,
"loss/jsd": 0.0,
"loss/logits": 0.22354185171425342,
"step": 1830
},
{
"epoch": 0.184,
"grad_norm": 67.5,
"grad_norm_var": 72.85729166666667,
"learning_rate": 0.0001,
"loss": 8.0768,
"loss/crossentropy": 2.3133904695510865,
"loss/hidden": 3.401171875,
"loss/jsd": 0.0,
"loss/logits": 0.24347416013479234,
"step": 1840
},
{
"epoch": 0.185,
"grad_norm": 79.0,
"grad_norm_var": 169.65833333333333,
"learning_rate": 0.0001,
"loss": 8.2994,
"loss/crossentropy": 2.3512276649475097,
"loss/hidden": 3.444921875,
"loss/jsd": 0.0,
"loss/logits": 0.26196608748286965,
"step": 1850
},
{
"epoch": 0.186,
"grad_norm": 69.5,
"grad_norm_var": 2388.08515625,
"learning_rate": 0.0001,
"loss": 8.3424,
"loss/crossentropy": 2.3174356922507284,
"loss/hidden": 3.454296875,
"loss/jsd": 0.0,
"loss/logits": 0.2508500372990966,
"step": 1860
},
{
"epoch": 0.187,
"grad_norm": 60.0,
"grad_norm_var": 196.15833333333333,
"learning_rate": 0.0001,
"loss": 8.256,
"loss/crossentropy": 2.280574831366539,
"loss/hidden": 3.421875,
"loss/jsd": 0.0,
"loss/logits": 0.2615037776529789,
"step": 1870
},
{
"epoch": 0.188,
"grad_norm": 73.0,
"grad_norm_var": 116.42890625,
"learning_rate": 0.0001,
"loss": 8.2108,
"loss/crossentropy": 2.275608576834202,
"loss/hidden": 3.449609375,
"loss/jsd": 0.0,
"loss/logits": 0.2556317184120417,
"step": 1880
},
{
"epoch": 0.189,
"grad_norm": 66.5,
"grad_norm_var": 38.723958333333336,
"learning_rate": 0.0001,
"loss": 8.3584,
"loss/crossentropy": 2.356363560259342,
"loss/hidden": 3.490625,
"loss/jsd": 0.0,
"loss/logits": 0.26850553378462794,
"step": 1890
},
{
"epoch": 0.19,
"grad_norm": 70.0,
"grad_norm_var": 90.62916666666666,
"learning_rate": 0.0001,
"loss": 8.1875,
"loss/crossentropy": 2.282008448243141,
"loss/hidden": 3.4234375,
"loss/jsd": 0.0,
"loss/logits": 0.25158569142222403,
"step": 1900
},
{
"epoch": 0.191,
"grad_norm": 69.0,
"grad_norm_var": 26.895833333333332,
"learning_rate": 0.0001,
"loss": 8.1676,
"loss/crossentropy": 2.3583726406097414,
"loss/hidden": 3.475,
"loss/jsd": 0.0,
"loss/logits": 0.2574294516816735,
"step": 1910
},
{
"epoch": 0.192,
"grad_norm": 62.25,
"grad_norm_var": 34.430989583333336,
"learning_rate": 0.0001,
"loss": 8.2457,
"loss/crossentropy": 2.310526317358017,
"loss/hidden": 3.407421875,
"loss/jsd": 0.0,
"loss/logits": 0.25894895792007444,
"step": 1920
},
{
"epoch": 0.193,
"grad_norm": 84.0,
"grad_norm_var": 65.58307291666667,
"learning_rate": 0.0001,
"loss": 8.2176,
"loss/crossentropy": 2.0871855318546295,
"loss/hidden": 3.3796875,
"loss/jsd": 0.0,
"loss/logits": 0.24589193761348724,
"step": 1930
},
{
"epoch": 0.194,
"grad_norm": 65.5,
"grad_norm_var": 39.07473958333333,
"learning_rate": 0.0001,
"loss": 8.1842,
"loss/crossentropy": 2.261622406542301,
"loss/hidden": 3.3609375,
"loss/jsd": 0.0,
"loss/logits": 0.24441927969455718,
"step": 1940
},
{
"epoch": 0.195,
"grad_norm": 64.0,
"grad_norm_var": 57.848958333333336,
"learning_rate": 0.0001,
"loss": 8.1485,
"loss/crossentropy": 2.386093820631504,
"loss/hidden": 3.3359375,
"loss/jsd": 0.0,
"loss/logits": 0.2539959207177162,
"step": 1950
},
{
"epoch": 0.196,
"grad_norm": 72.5,
"grad_norm_var": 37.01223958333333,
"learning_rate": 0.0001,
"loss": 8.3277,
"loss/crossentropy": 2.2825982570648193,
"loss/hidden": 3.433203125,
"loss/jsd": 0.0,
"loss/logits": 0.2809562737122178,
"step": 1960
},
{
"epoch": 0.197,
"grad_norm": 69.0,
"grad_norm_var": 15.633333333333333,
"learning_rate": 0.0001,
"loss": 8.1367,
"loss/crossentropy": 2.181477516889572,
"loss/hidden": 3.494140625,
"loss/jsd": 0.0,
"loss/logits": 0.26897694952785967,
"step": 1970
},
{
"epoch": 0.198,
"grad_norm": 68.5,
"grad_norm_var": 13.966666666666667,
"learning_rate": 0.0001,
"loss": 8.1232,
"loss/crossentropy": 2.292652648687363,
"loss/hidden": 3.409765625,
"loss/jsd": 0.0,
"loss/logits": 0.24958589412271975,
"step": 1980
},
{
"epoch": 0.199,
"grad_norm": 68.0,
"grad_norm_var": 88.34765625,
"learning_rate": 0.0001,
"loss": 8.09,
"loss/crossentropy": 2.367698776721954,
"loss/hidden": 3.387109375,
"loss/jsd": 0.0,
"loss/logits": 0.2641737159341574,
"step": 1990
},
{
"epoch": 0.2,
"grad_norm": 91.5,
"grad_norm_var": 120.825,
"learning_rate": 0.0001,
"loss": 8.1587,
"loss/crossentropy": 2.3354921892285345,
"loss/hidden": 3.42109375,
"loss/jsd": 0.0,
"loss/logits": 0.24634175039827824,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.715020064017613e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}