THIRAWAT-SapBERT / trainer_state.json
na399's picture
checkpoint-9356
a6f3e8e verified
{
"best_global_step": 9356,
"best_metric": 0.9798825256975033,
"best_model_checkpoint": "runs/de_sapbert/checkpoint-9356",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 9356,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010688328345446773,
"grad_norm": 683.587890625,
"learning_rate": 8.547008547008549e-08,
"loss": 50.3236,
"step": 5
},
{
"epoch": 0.0021376656690893546,
"grad_norm": 546.2178955078125,
"learning_rate": 1.9230769230769234e-07,
"loss": 59.3515,
"step": 10
},
{
"epoch": 0.0032064985036340315,
"grad_norm": 602.040771484375,
"learning_rate": 2.991452991452992e-07,
"loss": 62.6096,
"step": 15
},
{
"epoch": 0.004275331338178709,
"grad_norm": 697.5074462890625,
"learning_rate": 4.05982905982906e-07,
"loss": 71.224,
"step": 20
},
{
"epoch": 0.005344164172723386,
"grad_norm": 629.8714599609375,
"learning_rate": 5.128205128205128e-07,
"loss": 58.249,
"step": 25
},
{
"epoch": 0.006412997007268063,
"grad_norm": 850.3056640625,
"learning_rate": 6.196581196581197e-07,
"loss": 53.1647,
"step": 30
},
{
"epoch": 0.007481829841812741,
"grad_norm": 535.2101440429688,
"learning_rate": 7.264957264957266e-07,
"loss": 74.0864,
"step": 35
},
{
"epoch": 0.008550662676357419,
"grad_norm": 877.2177734375,
"learning_rate": 8.333333333333333e-07,
"loss": 68.1396,
"step": 40
},
{
"epoch": 0.009619495510902095,
"grad_norm": 827.8037109375,
"learning_rate": 9.401709401709402e-07,
"loss": 67.5764,
"step": 45
},
{
"epoch": 0.010688328345446772,
"grad_norm": 738.37158203125,
"learning_rate": 1.047008547008547e-06,
"loss": 59.9784,
"step": 50
},
{
"epoch": 0.01175716117999145,
"grad_norm": 698.6246337890625,
"learning_rate": 1.153846153846154e-06,
"loss": 51.3131,
"step": 55
},
{
"epoch": 0.012825994014536126,
"grad_norm": 683.5339965820312,
"learning_rate": 1.2606837606837608e-06,
"loss": 68.0457,
"step": 60
},
{
"epoch": 0.013894826849080803,
"grad_norm": 766.8721923828125,
"learning_rate": 1.3675213675213678e-06,
"loss": 58.9018,
"step": 65
},
{
"epoch": 0.014963659683625482,
"grad_norm": 694.8308715820312,
"learning_rate": 1.4743589743589745e-06,
"loss": 59.4911,
"step": 70
},
{
"epoch": 0.01603249251817016,
"grad_norm": 585.236572265625,
"learning_rate": 1.5811965811965813e-06,
"loss": 58.0199,
"step": 75
},
{
"epoch": 0.017101325352714837,
"grad_norm": 778.44287109375,
"learning_rate": 1.6880341880341883e-06,
"loss": 63.7679,
"step": 80
},
{
"epoch": 0.018170158187259512,
"grad_norm": 646.8430786132812,
"learning_rate": 1.794871794871795e-06,
"loss": 60.7314,
"step": 85
},
{
"epoch": 0.01923899102180419,
"grad_norm": 989.9755249023438,
"learning_rate": 1.9017094017094018e-06,
"loss": 77.8096,
"step": 90
},
{
"epoch": 0.020307823856348866,
"grad_norm": 834.5411987304688,
"learning_rate": 2.008547008547009e-06,
"loss": 66.2088,
"step": 95
},
{
"epoch": 0.021376656690893545,
"grad_norm": 643.298095703125,
"learning_rate": 2.1153846153846155e-06,
"loss": 65.6054,
"step": 100
},
{
"epoch": 0.02244548952543822,
"grad_norm": 1178.525146484375,
"learning_rate": 2.222222222222222e-06,
"loss": 80.6449,
"step": 105
},
{
"epoch": 0.0235143223599829,
"grad_norm": 697.0106201171875,
"learning_rate": 2.3290598290598295e-06,
"loss": 69.9855,
"step": 110
},
{
"epoch": 0.024583155194527577,
"grad_norm": 783.412841796875,
"learning_rate": 2.435897435897436e-06,
"loss": 56.6307,
"step": 115
},
{
"epoch": 0.025651988029072252,
"grad_norm": 692.8482666015625,
"learning_rate": 2.542735042735043e-06,
"loss": 46.5356,
"step": 120
},
{
"epoch": 0.02672082086361693,
"grad_norm": 731.8605346679688,
"learning_rate": 2.64957264957265e-06,
"loss": 57.7131,
"step": 125
},
{
"epoch": 0.027789653698161606,
"grad_norm": 710.8729858398438,
"learning_rate": 2.756410256410257e-06,
"loss": 49.0737,
"step": 130
},
{
"epoch": 0.028858486532706284,
"grad_norm": 608.5044555664062,
"learning_rate": 2.8632478632478635e-06,
"loss": 50.7495,
"step": 135
},
{
"epoch": 0.029927319367250963,
"grad_norm": 723.5994873046875,
"learning_rate": 2.9700854700854705e-06,
"loss": 57.1029,
"step": 140
},
{
"epoch": 0.030996152201795638,
"grad_norm": 697.4583740234375,
"learning_rate": 3.0769230769230774e-06,
"loss": 45.4298,
"step": 145
},
{
"epoch": 0.03206498503634032,
"grad_norm": 567.723388671875,
"learning_rate": 3.183760683760684e-06,
"loss": 57.8409,
"step": 150
},
{
"epoch": 0.03313381787088499,
"grad_norm": 688.6309814453125,
"learning_rate": 3.290598290598291e-06,
"loss": 68.8388,
"step": 155
},
{
"epoch": 0.034202650705429674,
"grad_norm": 628.3114624023438,
"learning_rate": 3.397435897435898e-06,
"loss": 64.5809,
"step": 160
},
{
"epoch": 0.03527148353997435,
"grad_norm": 556.7693481445312,
"learning_rate": 3.5042735042735045e-06,
"loss": 54.5407,
"step": 165
},
{
"epoch": 0.036340316374519024,
"grad_norm": 494.6075744628906,
"learning_rate": 3.6111111111111115e-06,
"loss": 48.341,
"step": 170
},
{
"epoch": 0.0374091492090637,
"grad_norm": 554.0121459960938,
"learning_rate": 3.7179487179487184e-06,
"loss": 44.3806,
"step": 175
},
{
"epoch": 0.03847798204360838,
"grad_norm": 758.5612182617188,
"learning_rate": 3.8247863247863246e-06,
"loss": 59.123,
"step": 180
},
{
"epoch": 0.03954681487815306,
"grad_norm": 716.5413208007812,
"learning_rate": 3.9316239316239315e-06,
"loss": 59.4863,
"step": 185
},
{
"epoch": 0.04061564771269773,
"grad_norm": 503.7677307128906,
"learning_rate": 4.0384615384615385e-06,
"loss": 65.4498,
"step": 190
},
{
"epoch": 0.041684480547242414,
"grad_norm": 613.170654296875,
"learning_rate": 4.145299145299146e-06,
"loss": 52.9526,
"step": 195
},
{
"epoch": 0.04275331338178709,
"grad_norm": 725.29833984375,
"learning_rate": 4.2521367521367524e-06,
"loss": 46.6744,
"step": 200
},
{
"epoch": 0.043822146216331764,
"grad_norm": 525.0426635742188,
"learning_rate": 4.358974358974359e-06,
"loss": 37.1728,
"step": 205
},
{
"epoch": 0.04489097905087644,
"grad_norm": 627.0368041992188,
"learning_rate": 4.465811965811966e-06,
"loss": 63.3973,
"step": 210
},
{
"epoch": 0.04595981188542112,
"grad_norm": 501.9342956542969,
"learning_rate": 4.5726495726495725e-06,
"loss": 51.1136,
"step": 215
},
{
"epoch": 0.0470286447199658,
"grad_norm": 535.6387329101562,
"learning_rate": 4.6794871794871795e-06,
"loss": 40.9712,
"step": 220
},
{
"epoch": 0.04809747755451047,
"grad_norm": 492.3857421875,
"learning_rate": 4.786324786324787e-06,
"loss": 46.4765,
"step": 225
},
{
"epoch": 0.049166310389055154,
"grad_norm": 579.775390625,
"learning_rate": 4.8931623931623934e-06,
"loss": 47.3894,
"step": 230
},
{
"epoch": 0.05023514322359983,
"grad_norm": 530.4070434570312,
"learning_rate": 5e-06,
"loss": 36.866,
"step": 235
},
{
"epoch": 0.051303976058144504,
"grad_norm": 476.2954406738281,
"learning_rate": 5.1068376068376065e-06,
"loss": 32.3704,
"step": 240
},
{
"epoch": 0.052372808892689186,
"grad_norm": 412.430419921875,
"learning_rate": 5.213675213675214e-06,
"loss": 36.0298,
"step": 245
},
{
"epoch": 0.05344164172723386,
"grad_norm": 467.79412841796875,
"learning_rate": 5.320512820512821e-06,
"loss": 43.7503,
"step": 250
},
{
"epoch": 0.05451047456177854,
"grad_norm": 395.0292663574219,
"learning_rate": 5.4273504273504275e-06,
"loss": 33.9929,
"step": 255
},
{
"epoch": 0.05557930739632321,
"grad_norm": 543.4691162109375,
"learning_rate": 5.534188034188035e-06,
"loss": 38.4924,
"step": 260
},
{
"epoch": 0.056648140230867894,
"grad_norm": 446.6466369628906,
"learning_rate": 5.641025641025641e-06,
"loss": 30.8329,
"step": 265
},
{
"epoch": 0.05771697306541257,
"grad_norm": 409.0653381347656,
"learning_rate": 5.7478632478632475e-06,
"loss": 29.0382,
"step": 270
},
{
"epoch": 0.058785805899957244,
"grad_norm": 397.3152770996094,
"learning_rate": 5.854700854700855e-06,
"loss": 25.2408,
"step": 275
},
{
"epoch": 0.059854638734501926,
"grad_norm": 324.9696350097656,
"learning_rate": 5.961538461538462e-06,
"loss": 30.9856,
"step": 280
},
{
"epoch": 0.0609234715690466,
"grad_norm": 439.66241455078125,
"learning_rate": 6.0683760683760684e-06,
"loss": 25.987,
"step": 285
},
{
"epoch": 0.061992304403591277,
"grad_norm": 299.0304260253906,
"learning_rate": 6.175213675213676e-06,
"loss": 24.4493,
"step": 290
},
{
"epoch": 0.06306113723813596,
"grad_norm": 324.8219909667969,
"learning_rate": 6.282051282051282e-06,
"loss": 25.6893,
"step": 295
},
{
"epoch": 0.06412997007268063,
"grad_norm": 419.6073303222656,
"learning_rate": 6.3888888888888885e-06,
"loss": 29.4356,
"step": 300
},
{
"epoch": 0.06519880290722531,
"grad_norm": 299.56048583984375,
"learning_rate": 6.495726495726496e-06,
"loss": 22.2,
"step": 305
},
{
"epoch": 0.06626763574176998,
"grad_norm": 303.1939697265625,
"learning_rate": 6.602564102564103e-06,
"loss": 24.3048,
"step": 310
},
{
"epoch": 0.06733646857631466,
"grad_norm": 203.7785186767578,
"learning_rate": 6.7094017094017094e-06,
"loss": 18.5714,
"step": 315
},
{
"epoch": 0.06840530141085935,
"grad_norm": 292.95050048828125,
"learning_rate": 6.816239316239317e-06,
"loss": 17.4822,
"step": 320
},
{
"epoch": 0.06947413424540402,
"grad_norm": 145.5703125,
"learning_rate": 6.923076923076923e-06,
"loss": 15.8966,
"step": 325
},
{
"epoch": 0.0705429670799487,
"grad_norm": 261.8589782714844,
"learning_rate": 7.02991452991453e-06,
"loss": 14.8771,
"step": 330
},
{
"epoch": 0.07161179991449337,
"grad_norm": 215.4441375732422,
"learning_rate": 7.136752136752137e-06,
"loss": 16.0905,
"step": 335
},
{
"epoch": 0.07268063274903805,
"grad_norm": 206.23388671875,
"learning_rate": 7.243589743589744e-06,
"loss": 11.9737,
"step": 340
},
{
"epoch": 0.07374946558358272,
"grad_norm": 200.727294921875,
"learning_rate": 7.350427350427351e-06,
"loss": 12.1407,
"step": 345
},
{
"epoch": 0.0748182984181274,
"grad_norm": 183.34083557128906,
"learning_rate": 7.457264957264958e-06,
"loss": 11.5492,
"step": 350
},
{
"epoch": 0.07588713125267209,
"grad_norm": 155.31253051757812,
"learning_rate": 7.564102564102564e-06,
"loss": 13.0664,
"step": 355
},
{
"epoch": 0.07695596408721676,
"grad_norm": 141.0535125732422,
"learning_rate": 7.670940170940172e-06,
"loss": 10.0428,
"step": 360
},
{
"epoch": 0.07802479692176144,
"grad_norm": 164.8147430419922,
"learning_rate": 7.77777777777778e-06,
"loss": 9.2962,
"step": 365
},
{
"epoch": 0.07909362975630611,
"grad_norm": 113.84266662597656,
"learning_rate": 7.884615384615384e-06,
"loss": 8.6304,
"step": 370
},
{
"epoch": 0.08016246259085079,
"grad_norm": 90.67302703857422,
"learning_rate": 7.991452991452993e-06,
"loss": 5.7954,
"step": 375
},
{
"epoch": 0.08123129542539546,
"grad_norm": 77.02782440185547,
"learning_rate": 8.098290598290598e-06,
"loss": 6.0213,
"step": 380
},
{
"epoch": 0.08230012825994014,
"grad_norm": 77.24604797363281,
"learning_rate": 8.205128205128205e-06,
"loss": 6.8873,
"step": 385
},
{
"epoch": 0.08336896109448483,
"grad_norm": 78.4577407836914,
"learning_rate": 8.311965811965812e-06,
"loss": 5.6347,
"step": 390
},
{
"epoch": 0.0844377939290295,
"grad_norm": 70.10798645019531,
"learning_rate": 8.41880341880342e-06,
"loss": 5.7346,
"step": 395
},
{
"epoch": 0.08550662676357418,
"grad_norm": 53.02711486816406,
"learning_rate": 8.525641025641026e-06,
"loss": 4.2817,
"step": 400
},
{
"epoch": 0.08657545959811885,
"grad_norm": 58.17922592163086,
"learning_rate": 8.632478632478633e-06,
"loss": 3.9817,
"step": 405
},
{
"epoch": 0.08764429243266353,
"grad_norm": 47.072662353515625,
"learning_rate": 8.73931623931624e-06,
"loss": 3.1871,
"step": 410
},
{
"epoch": 0.0887131252672082,
"grad_norm": 35.73280715942383,
"learning_rate": 8.846153846153847e-06,
"loss": 3.2088,
"step": 415
},
{
"epoch": 0.08978195810175288,
"grad_norm": 40.767581939697266,
"learning_rate": 8.952991452991454e-06,
"loss": 3.5216,
"step": 420
},
{
"epoch": 0.09085079093629757,
"grad_norm": 32.53750991821289,
"learning_rate": 9.059829059829061e-06,
"loss": 2.3657,
"step": 425
},
{
"epoch": 0.09191962377084224,
"grad_norm": 31.6849422454834,
"learning_rate": 9.166666666666666e-06,
"loss": 2.3054,
"step": 430
},
{
"epoch": 0.09298845660538692,
"grad_norm": 29.081796646118164,
"learning_rate": 9.273504273504275e-06,
"loss": 2.174,
"step": 435
},
{
"epoch": 0.0940572894399316,
"grad_norm": 34.65196990966797,
"learning_rate": 9.38034188034188e-06,
"loss": 2.4017,
"step": 440
},
{
"epoch": 0.09512612227447627,
"grad_norm": 19.63212776184082,
"learning_rate": 9.487179487179487e-06,
"loss": 2.1189,
"step": 445
},
{
"epoch": 0.09619495510902094,
"grad_norm": 24.055822372436523,
"learning_rate": 9.594017094017094e-06,
"loss": 2.3965,
"step": 450
},
{
"epoch": 0.09726378794356563,
"grad_norm": 18.823020935058594,
"learning_rate": 9.700854700854701e-06,
"loss": 1.7638,
"step": 455
},
{
"epoch": 0.09833262077811031,
"grad_norm": 16.97188949584961,
"learning_rate": 9.807692307692308e-06,
"loss": 1.4081,
"step": 460
},
{
"epoch": 0.09940145361265498,
"grad_norm": 18.629823684692383,
"learning_rate": 9.914529914529915e-06,
"loss": 1.5501,
"step": 465
},
{
"epoch": 0.10047028644719966,
"grad_norm": 16.413358688354492,
"learning_rate": 1.0021367521367522e-05,
"loss": 1.4015,
"step": 470
},
{
"epoch": 0.10153911928174433,
"grad_norm": 14.555413246154785,
"learning_rate": 1.012820512820513e-05,
"loss": 1.3726,
"step": 475
},
{
"epoch": 0.10260795211628901,
"grad_norm": 17.65382194519043,
"learning_rate": 1.0235042735042734e-05,
"loss": 1.1044,
"step": 480
},
{
"epoch": 0.10367678495083368,
"grad_norm": 13.994580268859863,
"learning_rate": 1.0341880341880343e-05,
"loss": 1.1651,
"step": 485
},
{
"epoch": 0.10474561778537837,
"grad_norm": 13.052831649780273,
"learning_rate": 1.044871794871795e-05,
"loss": 1.1674,
"step": 490
},
{
"epoch": 0.10581445061992305,
"grad_norm": 16.172752380371094,
"learning_rate": 1.0555555555555557e-05,
"loss": 1.2274,
"step": 495
},
{
"epoch": 0.10688328345446772,
"grad_norm": 12.005922317504883,
"learning_rate": 1.0662393162393162e-05,
"loss": 1.0606,
"step": 500
},
{
"epoch": 0.1079521162890124,
"grad_norm": 13.400876998901367,
"learning_rate": 1.076923076923077e-05,
"loss": 1.2207,
"step": 505
},
{
"epoch": 0.10902094912355707,
"grad_norm": 14.658180236816406,
"learning_rate": 1.0876068376068376e-05,
"loss": 1.12,
"step": 510
},
{
"epoch": 0.11008978195810175,
"grad_norm": 11.943291664123535,
"learning_rate": 1.0982905982905985e-05,
"loss": 0.9925,
"step": 515
},
{
"epoch": 0.11115861479264642,
"grad_norm": 10.507229804992676,
"learning_rate": 1.1089743589743592e-05,
"loss": 0.9542,
"step": 520
},
{
"epoch": 0.11222744762719111,
"grad_norm": 12.802043914794922,
"learning_rate": 1.1196581196581197e-05,
"loss": 1.1911,
"step": 525
},
{
"epoch": 0.11329628046173579,
"grad_norm": 10.767659187316895,
"learning_rate": 1.1303418803418804e-05,
"loss": 1.2418,
"step": 530
},
{
"epoch": 0.11436511329628046,
"grad_norm": 12.087440490722656,
"learning_rate": 1.1410256410256411e-05,
"loss": 1.0926,
"step": 535
},
{
"epoch": 0.11543394613082514,
"grad_norm": 9.390274047851562,
"learning_rate": 1.1517094017094016e-05,
"loss": 0.9052,
"step": 540
},
{
"epoch": 0.11650277896536981,
"grad_norm": 10.022379875183105,
"learning_rate": 1.1623931623931625e-05,
"loss": 0.9725,
"step": 545
},
{
"epoch": 0.11757161179991449,
"grad_norm": 12.384991645812988,
"learning_rate": 1.1730769230769232e-05,
"loss": 1.0631,
"step": 550
},
{
"epoch": 0.11864044463445918,
"grad_norm": 10.217049598693848,
"learning_rate": 1.1837606837606839e-05,
"loss": 1.0069,
"step": 555
},
{
"epoch": 0.11970927746900385,
"grad_norm": 9.567984580993652,
"learning_rate": 1.1944444444444444e-05,
"loss": 0.9301,
"step": 560
},
{
"epoch": 0.12077811030354853,
"grad_norm": 7.623697280883789,
"learning_rate": 1.2051282051282051e-05,
"loss": 0.7413,
"step": 565
},
{
"epoch": 0.1218469431380932,
"grad_norm": 12.299339294433594,
"learning_rate": 1.2158119658119658e-05,
"loss": 1.0359,
"step": 570
},
{
"epoch": 0.12291577597263788,
"grad_norm": 10.420650482177734,
"learning_rate": 1.2264957264957267e-05,
"loss": 0.9793,
"step": 575
},
{
"epoch": 0.12398460880718255,
"grad_norm": 9.585742950439453,
"learning_rate": 1.2371794871794874e-05,
"loss": 0.7139,
"step": 580
},
{
"epoch": 0.12505344164172724,
"grad_norm": 8.004850387573242,
"learning_rate": 1.247863247863248e-05,
"loss": 1.0595,
"step": 585
},
{
"epoch": 0.12612227447627192,
"grad_norm": 9.993664741516113,
"learning_rate": 1.2585470085470086e-05,
"loss": 0.8869,
"step": 590
},
{
"epoch": 0.1271911073108166,
"grad_norm": 9.177787780761719,
"learning_rate": 1.2692307692307693e-05,
"loss": 0.8318,
"step": 595
},
{
"epoch": 0.12825994014536127,
"grad_norm": 10.756118774414062,
"learning_rate": 1.2799145299145298e-05,
"loss": 0.7328,
"step": 600
},
{
"epoch": 0.12932877297990594,
"grad_norm": 9.876399040222168,
"learning_rate": 1.2905982905982907e-05,
"loss": 0.7785,
"step": 605
},
{
"epoch": 0.13039760581445062,
"grad_norm": 9.022135734558105,
"learning_rate": 1.3012820512820514e-05,
"loss": 0.8507,
"step": 610
},
{
"epoch": 0.1314664386489953,
"grad_norm": 9.838420867919922,
"learning_rate": 1.3119658119658121e-05,
"loss": 1.0039,
"step": 615
},
{
"epoch": 0.13253527148353997,
"grad_norm": 11.163064002990723,
"learning_rate": 1.3226495726495728e-05,
"loss": 0.7514,
"step": 620
},
{
"epoch": 0.13360410431808464,
"grad_norm": 7.968421459197998,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.6666,
"step": 625
},
{
"epoch": 0.13467293715262932,
"grad_norm": 9.873268127441406,
"learning_rate": 1.3440170940170942e-05,
"loss": 0.6639,
"step": 630
},
{
"epoch": 0.135741769987174,
"grad_norm": 7.193119049072266,
"learning_rate": 1.3547008547008549e-05,
"loss": 0.77,
"step": 635
},
{
"epoch": 0.1368106028217187,
"grad_norm": 9.957530975341797,
"learning_rate": 1.3653846153846156e-05,
"loss": 0.8079,
"step": 640
},
{
"epoch": 0.13787943565626337,
"grad_norm": 9.682191848754883,
"learning_rate": 1.3760683760683761e-05,
"loss": 0.8061,
"step": 645
},
{
"epoch": 0.13894826849080805,
"grad_norm": 7.611622333526611,
"learning_rate": 1.3867521367521368e-05,
"loss": 0.7666,
"step": 650
},
{
"epoch": 0.14001710132535272,
"grad_norm": 9.452914237976074,
"learning_rate": 1.3974358974358975e-05,
"loss": 0.7835,
"step": 655
},
{
"epoch": 0.1410859341598974,
"grad_norm": 7.584820747375488,
"learning_rate": 1.4081196581196584e-05,
"loss": 0.7319,
"step": 660
},
{
"epoch": 0.14215476699444207,
"grad_norm": 7.296449661254883,
"learning_rate": 1.4188034188034189e-05,
"loss": 0.5945,
"step": 665
},
{
"epoch": 0.14322359982898675,
"grad_norm": 8.2069730758667,
"learning_rate": 1.4294871794871796e-05,
"loss": 0.7008,
"step": 670
},
{
"epoch": 0.14429243266353142,
"grad_norm": 8.537630081176758,
"learning_rate": 1.4401709401709403e-05,
"loss": 0.9728,
"step": 675
},
{
"epoch": 0.1453612654980761,
"grad_norm": 8.83273696899414,
"learning_rate": 1.450854700854701e-05,
"loss": 0.7346,
"step": 680
},
{
"epoch": 0.14643009833262077,
"grad_norm": 9.676769256591797,
"learning_rate": 1.4615384615384615e-05,
"loss": 0.8463,
"step": 685
},
{
"epoch": 0.14749893116716545,
"grad_norm": 8.45507526397705,
"learning_rate": 1.4722222222222224e-05,
"loss": 0.7251,
"step": 690
},
{
"epoch": 0.14856776400171012,
"grad_norm": 9.440534591674805,
"learning_rate": 1.4829059829059831e-05,
"loss": 0.7444,
"step": 695
},
{
"epoch": 0.1496365968362548,
"grad_norm": 7.491715431213379,
"learning_rate": 1.4935897435897438e-05,
"loss": 0.6496,
"step": 700
},
{
"epoch": 0.15070542967079947,
"grad_norm": 6.313747406005859,
"learning_rate": 1.5042735042735043e-05,
"loss": 0.7623,
"step": 705
},
{
"epoch": 0.15177426250534418,
"grad_norm": 7.4156060218811035,
"learning_rate": 1.514957264957265e-05,
"loss": 0.8285,
"step": 710
},
{
"epoch": 0.15284309533988885,
"grad_norm": 8.984630584716797,
"learning_rate": 1.5256410256410257e-05,
"loss": 0.886,
"step": 715
},
{
"epoch": 0.15391192817443353,
"grad_norm": 7.339222431182861,
"learning_rate": 1.5363247863247866e-05,
"loss": 0.7274,
"step": 720
},
{
"epoch": 0.1549807610089782,
"grad_norm": 7.579738140106201,
"learning_rate": 1.5470085470085473e-05,
"loss": 0.8669,
"step": 725
},
{
"epoch": 0.15604959384352288,
"grad_norm": 8.190738677978516,
"learning_rate": 1.557692307692308e-05,
"loss": 0.6741,
"step": 730
},
{
"epoch": 0.15711842667806755,
"grad_norm": 7.186857223510742,
"learning_rate": 1.5683760683760683e-05,
"loss": 0.7396,
"step": 735
},
{
"epoch": 0.15818725951261223,
"grad_norm": 6.874704360961914,
"learning_rate": 1.579059829059829e-05,
"loss": 0.7197,
"step": 740
},
{
"epoch": 0.1592560923471569,
"grad_norm": 9.56429386138916,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.9099,
"step": 745
},
{
"epoch": 0.16032492518170158,
"grad_norm": 7.520778179168701,
"learning_rate": 1.6004273504273508e-05,
"loss": 0.7344,
"step": 750
},
{
"epoch": 0.16139375801624625,
"grad_norm": 5.668506622314453,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.6107,
"step": 755
},
{
"epoch": 0.16246259085079093,
"grad_norm": 9.869526863098145,
"learning_rate": 1.6217948717948718e-05,
"loss": 0.6929,
"step": 760
},
{
"epoch": 0.1635314236853356,
"grad_norm": 8.029936790466309,
"learning_rate": 1.6324786324786325e-05,
"loss": 0.7019,
"step": 765
},
{
"epoch": 0.16460025651988028,
"grad_norm": 8.148101806640625,
"learning_rate": 1.6431623931623932e-05,
"loss": 0.5759,
"step": 770
},
{
"epoch": 0.16566908935442498,
"grad_norm": 7.96873664855957,
"learning_rate": 1.653846153846154e-05,
"loss": 0.5736,
"step": 775
},
{
"epoch": 0.16673792218896966,
"grad_norm": 9.016830444335938,
"learning_rate": 1.6645299145299146e-05,
"loss": 0.5878,
"step": 780
},
{
"epoch": 0.16780675502351433,
"grad_norm": 7.439241886138916,
"learning_rate": 1.6752136752136753e-05,
"loss": 0.7349,
"step": 785
},
{
"epoch": 0.168875587858059,
"grad_norm": 9.404788970947266,
"learning_rate": 1.685897435897436e-05,
"loss": 0.7691,
"step": 790
},
{
"epoch": 0.16994442069260368,
"grad_norm": 6.773132801055908,
"learning_rate": 1.6965811965811967e-05,
"loss": 0.6228,
"step": 795
},
{
"epoch": 0.17101325352714836,
"grad_norm": 6.86265754699707,
"learning_rate": 1.7072649572649574e-05,
"loss": 0.6394,
"step": 800
},
{
"epoch": 0.17208208636169303,
"grad_norm": 6.647765159606934,
"learning_rate": 1.717948717948718e-05,
"loss": 0.624,
"step": 805
},
{
"epoch": 0.1731509191962377,
"grad_norm": 6.882334232330322,
"learning_rate": 1.7286324786324788e-05,
"loss": 0.6487,
"step": 810
},
{
"epoch": 0.17421975203078238,
"grad_norm": 8.620728492736816,
"learning_rate": 1.7393162393162395e-05,
"loss": 0.6001,
"step": 815
},
{
"epoch": 0.17528858486532706,
"grad_norm": 7.544363021850586,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.6685,
"step": 820
},
{
"epoch": 0.17635741769987173,
"grad_norm": 8.941640853881836,
"learning_rate": 1.760683760683761e-05,
"loss": 0.8176,
"step": 825
},
{
"epoch": 0.1774262505344164,
"grad_norm": 6.829235553741455,
"learning_rate": 1.7713675213675216e-05,
"loss": 0.6417,
"step": 830
},
{
"epoch": 0.17849508336896108,
"grad_norm": 7.0878705978393555,
"learning_rate": 1.7820512820512823e-05,
"loss": 0.6353,
"step": 835
},
{
"epoch": 0.17956391620350576,
"grad_norm": 9.062278747558594,
"learning_rate": 1.792735042735043e-05,
"loss": 0.5946,
"step": 840
},
{
"epoch": 0.18063274903805046,
"grad_norm": 7.967255115509033,
"learning_rate": 1.8034188034188037e-05,
"loss": 0.6122,
"step": 845
},
{
"epoch": 0.18170158187259514,
"grad_norm": 7.076515197753906,
"learning_rate": 1.8141025641025644e-05,
"loss": 0.6499,
"step": 850
},
{
"epoch": 0.1827704147071398,
"grad_norm": 7.658061981201172,
"learning_rate": 1.8247863247863247e-05,
"loss": 0.601,
"step": 855
},
{
"epoch": 0.1838392475416845,
"grad_norm": 7.605343341827393,
"learning_rate": 1.8354700854700854e-05,
"loss": 0.5404,
"step": 860
},
{
"epoch": 0.18490808037622916,
"grad_norm": 6.7278900146484375,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.6574,
"step": 865
},
{
"epoch": 0.18597691321077384,
"grad_norm": 6.190138339996338,
"learning_rate": 1.856837606837607e-05,
"loss": 0.5377,
"step": 870
},
{
"epoch": 0.1870457460453185,
"grad_norm": 5.700743198394775,
"learning_rate": 1.867521367521368e-05,
"loss": 0.5537,
"step": 875
},
{
"epoch": 0.1881145788798632,
"grad_norm": 9.452567100524902,
"learning_rate": 1.8782051282051282e-05,
"loss": 0.7377,
"step": 880
},
{
"epoch": 0.18918341171440786,
"grad_norm": 8.572616577148438,
"learning_rate": 1.888888888888889e-05,
"loss": 0.8466,
"step": 885
},
{
"epoch": 0.19025224454895254,
"grad_norm": 6.903915882110596,
"learning_rate": 1.8995726495726496e-05,
"loss": 0.4808,
"step": 890
},
{
"epoch": 0.1913210773834972,
"grad_norm": 6.828094959259033,
"learning_rate": 1.9102564102564106e-05,
"loss": 0.5482,
"step": 895
},
{
"epoch": 0.1923899102180419,
"grad_norm": 9.153865814208984,
"learning_rate": 1.920940170940171e-05,
"loss": 0.6588,
"step": 900
},
{
"epoch": 0.19345874305258656,
"grad_norm": 8.290953636169434,
"learning_rate": 1.9316239316239317e-05,
"loss": 0.6953,
"step": 905
},
{
"epoch": 0.19452757588713127,
"grad_norm": 6.111261367797852,
"learning_rate": 1.9423076923076924e-05,
"loss": 0.4803,
"step": 910
},
{
"epoch": 0.19559640872167594,
"grad_norm": 8.402656555175781,
"learning_rate": 1.952991452991453e-05,
"loss": 0.6869,
"step": 915
},
{
"epoch": 0.19666524155622062,
"grad_norm": 5.929531574249268,
"learning_rate": 1.9636752136752138e-05,
"loss": 0.5393,
"step": 920
},
{
"epoch": 0.1977340743907653,
"grad_norm": 7.195873260498047,
"learning_rate": 1.9743589743589745e-05,
"loss": 0.4829,
"step": 925
},
{
"epoch": 0.19880290722530997,
"grad_norm": 8.35781192779541,
"learning_rate": 1.9850427350427352e-05,
"loss": 0.5071,
"step": 930
},
{
"epoch": 0.19987174005985464,
"grad_norm": 7.967381954193115,
"learning_rate": 1.995726495726496e-05,
"loss": 0.6828,
"step": 935
},
{
"epoch": 0.20094057289439932,
"grad_norm": 7.339260101318359,
"learning_rate": 1.999287410926366e-05,
"loss": 0.6468,
"step": 940
},
{
"epoch": 0.202009405728944,
"grad_norm": 6.559432029724121,
"learning_rate": 1.9980997624703088e-05,
"loss": 0.6169,
"step": 945
},
{
"epoch": 0.20307823856348867,
"grad_norm": 6.627939701080322,
"learning_rate": 1.996912114014252e-05,
"loss": 0.5389,
"step": 950
},
{
"epoch": 0.20414707139803334,
"grad_norm": 6.5172119140625,
"learning_rate": 1.995724465558195e-05,
"loss": 0.4527,
"step": 955
},
{
"epoch": 0.20521590423257802,
"grad_norm": 6.31046199798584,
"learning_rate": 1.994536817102138e-05,
"loss": 0.5405,
"step": 960
},
{
"epoch": 0.2062847370671227,
"grad_norm": 6.0619425773620605,
"learning_rate": 1.993349168646081e-05,
"loss": 0.6583,
"step": 965
},
{
"epoch": 0.20735356990166737,
"grad_norm": 7.9872941970825195,
"learning_rate": 1.992161520190024e-05,
"loss": 0.7037,
"step": 970
},
{
"epoch": 0.20842240273621207,
"grad_norm": 6.310743808746338,
"learning_rate": 1.9909738717339668e-05,
"loss": 0.564,
"step": 975
},
{
"epoch": 0.20949123557075675,
"grad_norm": 6.636473655700684,
"learning_rate": 1.9897862232779098e-05,
"loss": 0.4531,
"step": 980
},
{
"epoch": 0.21056006840530142,
"grad_norm": 6.087254047393799,
"learning_rate": 1.988598574821853e-05,
"loss": 0.5454,
"step": 985
},
{
"epoch": 0.2116289012398461,
"grad_norm": 6.705723762512207,
"learning_rate": 1.987410926365796e-05,
"loss": 0.4836,
"step": 990
},
{
"epoch": 0.21269773407439077,
"grad_norm": 6.238287448883057,
"learning_rate": 1.9862232779097387e-05,
"loss": 0.5458,
"step": 995
},
{
"epoch": 0.21376656690893545,
"grad_norm": 5.439404010772705,
"learning_rate": 1.985035629453682e-05,
"loss": 0.5202,
"step": 1000
},
{
"epoch": 0.21483539974348012,
"grad_norm": 5.1525654792785645,
"learning_rate": 1.9838479809976248e-05,
"loss": 0.5532,
"step": 1005
},
{
"epoch": 0.2159042325780248,
"grad_norm": 6.952949047088623,
"learning_rate": 1.9826603325415678e-05,
"loss": 0.63,
"step": 1010
},
{
"epoch": 0.21697306541256947,
"grad_norm": 5.5152788162231445,
"learning_rate": 1.981472684085511e-05,
"loss": 0.6299,
"step": 1015
},
{
"epoch": 0.21804189824711415,
"grad_norm": 7.090893745422363,
"learning_rate": 1.980285035629454e-05,
"loss": 0.5405,
"step": 1020
},
{
"epoch": 0.21911073108165882,
"grad_norm": 5.758279323577881,
"learning_rate": 1.979097387173397e-05,
"loss": 0.4379,
"step": 1025
},
{
"epoch": 0.2201795639162035,
"grad_norm": 10.006664276123047,
"learning_rate": 1.9779097387173397e-05,
"loss": 0.7446,
"step": 1030
},
{
"epoch": 0.22124839675074817,
"grad_norm": 6.134273529052734,
"learning_rate": 1.9767220902612828e-05,
"loss": 0.6652,
"step": 1035
},
{
"epoch": 0.22231722958529285,
"grad_norm": 7.395203113555908,
"learning_rate": 1.9755344418052258e-05,
"loss": 0.6873,
"step": 1040
},
{
"epoch": 0.22338606241983755,
"grad_norm": 4.713867664337158,
"learning_rate": 1.974346793349169e-05,
"loss": 0.4904,
"step": 1045
},
{
"epoch": 0.22445489525438223,
"grad_norm": 6.1399359703063965,
"learning_rate": 1.973159144893112e-05,
"loss": 0.5763,
"step": 1050
},
{
"epoch": 0.2255237280889269,
"grad_norm": 7.5219879150390625,
"learning_rate": 1.971971496437055e-05,
"loss": 0.5801,
"step": 1055
},
{
"epoch": 0.22659256092347158,
"grad_norm": 5.3690619468688965,
"learning_rate": 1.9707838479809977e-05,
"loss": 0.518,
"step": 1060
},
{
"epoch": 0.22766139375801625,
"grad_norm": 7.701142311096191,
"learning_rate": 1.9695961995249407e-05,
"loss": 0.5928,
"step": 1065
},
{
"epoch": 0.22873022659256093,
"grad_norm": 5.431284427642822,
"learning_rate": 1.9684085510688838e-05,
"loss": 0.5575,
"step": 1070
},
{
"epoch": 0.2297990594271056,
"grad_norm": 5.841889381408691,
"learning_rate": 1.967220902612827e-05,
"loss": 0.497,
"step": 1075
},
{
"epoch": 0.23086789226165028,
"grad_norm": 6.84688663482666,
"learning_rate": 1.9660332541567696e-05,
"loss": 0.4607,
"step": 1080
},
{
"epoch": 0.23193672509619495,
"grad_norm": 7.2094645500183105,
"learning_rate": 1.964845605700713e-05,
"loss": 0.5531,
"step": 1085
},
{
"epoch": 0.23300555793073963,
"grad_norm": 8.189807891845703,
"learning_rate": 1.9636579572446557e-05,
"loss": 0.5372,
"step": 1090
},
{
"epoch": 0.2340743907652843,
"grad_norm": 6.64928674697876,
"learning_rate": 1.9624703087885987e-05,
"loss": 0.4641,
"step": 1095
},
{
"epoch": 0.23514322359982898,
"grad_norm": 5.907129764556885,
"learning_rate": 1.9612826603325418e-05,
"loss": 0.4724,
"step": 1100
},
{
"epoch": 0.23621205643437365,
"grad_norm": 5.550957202911377,
"learning_rate": 1.960095011876485e-05,
"loss": 0.4422,
"step": 1105
},
{
"epoch": 0.23728088926891835,
"grad_norm": 5.1877899169921875,
"learning_rate": 1.9589073634204276e-05,
"loss": 0.4571,
"step": 1110
},
{
"epoch": 0.23834972210346303,
"grad_norm": 6.098719120025635,
"learning_rate": 1.9577197149643706e-05,
"loss": 0.5104,
"step": 1115
},
{
"epoch": 0.2394185549380077,
"grad_norm": 5.2909770011901855,
"learning_rate": 1.9565320665083137e-05,
"loss": 0.5217,
"step": 1120
},
{
"epoch": 0.24048738777255238,
"grad_norm": 7.134459018707275,
"learning_rate": 1.9553444180522567e-05,
"loss": 0.4481,
"step": 1125
},
{
"epoch": 0.24155622060709706,
"grad_norm": 7.295234203338623,
"learning_rate": 1.9541567695961994e-05,
"loss": 0.5244,
"step": 1130
},
{
"epoch": 0.24262505344164173,
"grad_norm": 6.2853193283081055,
"learning_rate": 1.952969121140143e-05,
"loss": 0.4593,
"step": 1135
},
{
"epoch": 0.2436938862761864,
"grad_norm": 6.563023567199707,
"learning_rate": 1.9517814726840856e-05,
"loss": 0.5566,
"step": 1140
},
{
"epoch": 0.24476271911073108,
"grad_norm": 5.089166164398193,
"learning_rate": 1.9505938242280286e-05,
"loss": 0.5473,
"step": 1145
},
{
"epoch": 0.24583155194527576,
"grad_norm": 4.706131458282471,
"learning_rate": 1.9494061757719717e-05,
"loss": 0.3649,
"step": 1150
},
{
"epoch": 0.24690038477982043,
"grad_norm": 7.967005729675293,
"learning_rate": 1.9482185273159147e-05,
"loss": 0.5466,
"step": 1155
},
{
"epoch": 0.2479692176143651,
"grad_norm": 6.625776767730713,
"learning_rate": 1.9470308788598574e-05,
"loss": 0.4687,
"step": 1160
},
{
"epoch": 0.24903805044890978,
"grad_norm": 6.631053447723389,
"learning_rate": 1.9458432304038005e-05,
"loss": 0.521,
"step": 1165
},
{
"epoch": 0.2501068832834545,
"grad_norm": 5.4099555015563965,
"learning_rate": 1.944655581947744e-05,
"loss": 0.4539,
"step": 1170
},
{
"epoch": 0.25117571611799916,
"grad_norm": 6.302776336669922,
"learning_rate": 1.9434679334916866e-05,
"loss": 0.5975,
"step": 1175
},
{
"epoch": 0.25224454895254383,
"grad_norm": 6.055430889129639,
"learning_rate": 1.9422802850356297e-05,
"loss": 0.4108,
"step": 1180
},
{
"epoch": 0.2533133817870885,
"grad_norm": 6.507791519165039,
"learning_rate": 1.9410926365795727e-05,
"loss": 0.4052,
"step": 1185
},
{
"epoch": 0.2543822146216332,
"grad_norm": 4.61367130279541,
"learning_rate": 1.9399049881235158e-05,
"loss": 0.3805,
"step": 1190
},
{
"epoch": 0.25545104745617786,
"grad_norm": 7.931022644042969,
"learning_rate": 1.9387173396674585e-05,
"loss": 0.508,
"step": 1195
},
{
"epoch": 0.25651988029072254,
"grad_norm": 7.164324760437012,
"learning_rate": 1.9375296912114015e-05,
"loss": 0.4517,
"step": 1200
},
{
"epoch": 0.2575887131252672,
"grad_norm": 5.5631890296936035,
"learning_rate": 1.9363420427553446e-05,
"loss": 0.5546,
"step": 1205
},
{
"epoch": 0.2586575459598119,
"grad_norm": 6.603108882904053,
"learning_rate": 1.9351543942992876e-05,
"loss": 0.5427,
"step": 1210
},
{
"epoch": 0.25972637879435656,
"grad_norm": 6.033792018890381,
"learning_rate": 1.9339667458432304e-05,
"loss": 0.4869,
"step": 1215
},
{
"epoch": 0.26079521162890124,
"grad_norm": 6.105428218841553,
"learning_rate": 1.9327790973871738e-05,
"loss": 0.5849,
"step": 1220
},
{
"epoch": 0.2618640444634459,
"grad_norm": 5.2770161628723145,
"learning_rate": 1.9315914489311165e-05,
"loss": 0.4608,
"step": 1225
},
{
"epoch": 0.2629328772979906,
"grad_norm": 8.29669189453125,
"learning_rate": 1.9304038004750595e-05,
"loss": 0.6211,
"step": 1230
},
{
"epoch": 0.26400171013253526,
"grad_norm": 5.556075572967529,
"learning_rate": 1.9292161520190026e-05,
"loss": 0.5136,
"step": 1235
},
{
"epoch": 0.26507054296707994,
"grad_norm": 6.262655258178711,
"learning_rate": 1.9280285035629456e-05,
"loss": 0.4788,
"step": 1240
},
{
"epoch": 0.2661393758016246,
"grad_norm": 7.117279052734375,
"learning_rate": 1.9268408551068884e-05,
"loss": 0.4763,
"step": 1245
},
{
"epoch": 0.2672082086361693,
"grad_norm": 7.450889587402344,
"learning_rate": 1.9256532066508314e-05,
"loss": 0.5398,
"step": 1250
},
{
"epoch": 0.26827704147071396,
"grad_norm": 6.718365669250488,
"learning_rate": 1.9244655581947745e-05,
"loss": 0.5782,
"step": 1255
},
{
"epoch": 0.26934587430525864,
"grad_norm": 5.374184608459473,
"learning_rate": 1.9232779097387175e-05,
"loss": 0.4568,
"step": 1260
},
{
"epoch": 0.2704147071398033,
"grad_norm": 4.485583305358887,
"learning_rate": 1.9220902612826606e-05,
"loss": 0.3874,
"step": 1265
},
{
"epoch": 0.271483539974348,
"grad_norm": 5.478529930114746,
"learning_rate": 1.9209026128266036e-05,
"loss": 0.4604,
"step": 1270
},
{
"epoch": 0.27255237280889266,
"grad_norm": 5.911350727081299,
"learning_rate": 1.9197149643705463e-05,
"loss": 0.4202,
"step": 1275
},
{
"epoch": 0.2736212056434374,
"grad_norm": 7.953678131103516,
"learning_rate": 1.9185273159144894e-05,
"loss": 0.4467,
"step": 1280
},
{
"epoch": 0.27469003847798207,
"grad_norm": 6.318038463592529,
"learning_rate": 1.9173396674584325e-05,
"loss": 0.3631,
"step": 1285
},
{
"epoch": 0.27575887131252674,
"grad_norm": 7.290485382080078,
"learning_rate": 1.9161520190023755e-05,
"loss": 0.5917,
"step": 1290
},
{
"epoch": 0.2768277041470714,
"grad_norm": 6.057776927947998,
"learning_rate": 1.9149643705463182e-05,
"loss": 0.4315,
"step": 1295
},
{
"epoch": 0.2778965369816161,
"grad_norm": 5.482032775878906,
"learning_rate": 1.9137767220902613e-05,
"loss": 0.3898,
"step": 1300
},
{
"epoch": 0.27896536981616077,
"grad_norm": 7.219336032867432,
"learning_rate": 1.9125890736342047e-05,
"loss": 0.5546,
"step": 1305
},
{
"epoch": 0.28003420265070544,
"grad_norm": 6.556499481201172,
"learning_rate": 1.9114014251781474e-05,
"loss": 0.4766,
"step": 1310
},
{
"epoch": 0.2811030354852501,
"grad_norm": 8.849128723144531,
"learning_rate": 1.9102137767220904e-05,
"loss": 0.5883,
"step": 1315
},
{
"epoch": 0.2821718683197948,
"grad_norm": 6.604886054992676,
"learning_rate": 1.9090261282660335e-05,
"loss": 0.4837,
"step": 1320
},
{
"epoch": 0.28324070115433947,
"grad_norm": 6.3507304191589355,
"learning_rate": 1.9078384798099766e-05,
"loss": 0.4576,
"step": 1325
},
{
"epoch": 0.28430953398888414,
"grad_norm": 7.592872619628906,
"learning_rate": 1.9066508313539193e-05,
"loss": 0.3894,
"step": 1330
},
{
"epoch": 0.2853783668234288,
"grad_norm": 7.806624889373779,
"learning_rate": 1.9054631828978623e-05,
"loss": 0.5239,
"step": 1335
},
{
"epoch": 0.2864471996579735,
"grad_norm": 5.70356559753418,
"learning_rate": 1.9042755344418054e-05,
"loss": 0.4255,
"step": 1340
},
{
"epoch": 0.28751603249251817,
"grad_norm": 6.017592906951904,
"learning_rate": 1.9030878859857484e-05,
"loss": 0.3944,
"step": 1345
},
{
"epoch": 0.28858486532706284,
"grad_norm": 8.678641319274902,
"learning_rate": 1.9019002375296915e-05,
"loss": 0.6219,
"step": 1350
},
{
"epoch": 0.2896536981616075,
"grad_norm": 5.638593673706055,
"learning_rate": 1.9007125890736345e-05,
"loss": 0.4791,
"step": 1355
},
{
"epoch": 0.2907225309961522,
"grad_norm": 6.662184238433838,
"learning_rate": 1.8995249406175773e-05,
"loss": 0.4101,
"step": 1360
},
{
"epoch": 0.29179136383069687,
"grad_norm": 5.850408554077148,
"learning_rate": 1.8983372921615203e-05,
"loss": 0.4422,
"step": 1365
},
{
"epoch": 0.29286019666524155,
"grad_norm": 6.298422813415527,
"learning_rate": 1.8971496437054634e-05,
"loss": 0.4426,
"step": 1370
},
{
"epoch": 0.2939290294997862,
"grad_norm": 6.113378524780273,
"learning_rate": 1.8959619952494064e-05,
"loss": 0.4108,
"step": 1375
},
{
"epoch": 0.2949978623343309,
"grad_norm": 5.136318206787109,
"learning_rate": 1.894774346793349e-05,
"loss": 0.4734,
"step": 1380
},
{
"epoch": 0.29606669516887557,
"grad_norm": 7.1877760887146,
"learning_rate": 1.8935866983372922e-05,
"loss": 0.4678,
"step": 1385
},
{
"epoch": 0.29713552800342025,
"grad_norm": 7.322067737579346,
"learning_rate": 1.8923990498812352e-05,
"loss": 0.4334,
"step": 1390
},
{
"epoch": 0.2982043608379649,
"grad_norm": 4.906497001647949,
"learning_rate": 1.8912114014251783e-05,
"loss": 0.4211,
"step": 1395
},
{
"epoch": 0.2992731936725096,
"grad_norm": 4.929844379425049,
"learning_rate": 1.8900237529691214e-05,
"loss": 0.5807,
"step": 1400
},
{
"epoch": 0.30034202650705427,
"grad_norm": 6.196166515350342,
"learning_rate": 1.8888361045130644e-05,
"loss": 0.5956,
"step": 1405
},
{
"epoch": 0.30141085934159895,
"grad_norm": 5.226170539855957,
"learning_rate": 1.887648456057007e-05,
"loss": 0.4175,
"step": 1410
},
{
"epoch": 0.3024796921761437,
"grad_norm": 4.843142509460449,
"learning_rate": 1.8864608076009502e-05,
"loss": 0.4233,
"step": 1415
},
{
"epoch": 0.30354852501068835,
"grad_norm": 5.112825393676758,
"learning_rate": 1.8852731591448932e-05,
"loss": 0.4118,
"step": 1420
},
{
"epoch": 0.304617357845233,
"grad_norm": 6.756041526794434,
"learning_rate": 1.8840855106888363e-05,
"loss": 0.3919,
"step": 1425
},
{
"epoch": 0.3056861906797777,
"grad_norm": 5.811524868011475,
"learning_rate": 1.882897862232779e-05,
"loss": 0.5152,
"step": 1430
},
{
"epoch": 0.3067550235143224,
"grad_norm": 5.891305446624756,
"learning_rate": 1.8817102137767224e-05,
"loss": 0.393,
"step": 1435
},
{
"epoch": 0.30782385634886705,
"grad_norm": 6.220530986785889,
"learning_rate": 1.880522565320665e-05,
"loss": 0.3765,
"step": 1440
},
{
"epoch": 0.30889268918341173,
"grad_norm": 6.09738826751709,
"learning_rate": 1.8793349168646082e-05,
"loss": 0.4476,
"step": 1445
},
{
"epoch": 0.3099615220179564,
"grad_norm": 4.718704700469971,
"learning_rate": 1.8781472684085512e-05,
"loss": 0.5094,
"step": 1450
},
{
"epoch": 0.3110303548525011,
"grad_norm": 5.264518737792969,
"learning_rate": 1.8769596199524943e-05,
"loss": 0.4496,
"step": 1455
},
{
"epoch": 0.31209918768704575,
"grad_norm": 5.551924705505371,
"learning_rate": 1.8757719714964373e-05,
"loss": 0.4305,
"step": 1460
},
{
"epoch": 0.31316802052159043,
"grad_norm": 4.252546787261963,
"learning_rate": 1.87458432304038e-05,
"loss": 0.495,
"step": 1465
},
{
"epoch": 0.3142368533561351,
"grad_norm": 4.372467517852783,
"learning_rate": 1.8733966745843235e-05,
"loss": 0.5561,
"step": 1470
},
{
"epoch": 0.3153056861906798,
"grad_norm": 6.216442108154297,
"learning_rate": 1.872209026128266e-05,
"loss": 0.4432,
"step": 1475
},
{
"epoch": 0.31637451902522445,
"grad_norm": 3.8125741481781006,
"learning_rate": 1.8710213776722092e-05,
"loss": 0.3382,
"step": 1480
},
{
"epoch": 0.31744335185976913,
"grad_norm": 5.539150714874268,
"learning_rate": 1.8698337292161523e-05,
"loss": 0.4209,
"step": 1485
},
{
"epoch": 0.3185121846943138,
"grad_norm": 6.593637466430664,
"learning_rate": 1.8686460807600953e-05,
"loss": 0.3776,
"step": 1490
},
{
"epoch": 0.3195810175288585,
"grad_norm": 5.109198570251465,
"learning_rate": 1.867458432304038e-05,
"loss": 0.4271,
"step": 1495
},
{
"epoch": 0.32064985036340315,
"grad_norm": 7.083045959472656,
"learning_rate": 1.866270783847981e-05,
"loss": 0.5628,
"step": 1500
},
{
"epoch": 0.32171868319794783,
"grad_norm": 7.068709850311279,
"learning_rate": 1.865083135391924e-05,
"loss": 0.4438,
"step": 1505
},
{
"epoch": 0.3227875160324925,
"grad_norm": 4.62941312789917,
"learning_rate": 1.8638954869358672e-05,
"loss": 0.3863,
"step": 1510
},
{
"epoch": 0.3238563488670372,
"grad_norm": 4.4039788246154785,
"learning_rate": 1.86270783847981e-05,
"loss": 0.5629,
"step": 1515
},
{
"epoch": 0.32492518170158186,
"grad_norm": 6.153443813323975,
"learning_rate": 1.8615201900237533e-05,
"loss": 0.4851,
"step": 1520
},
{
"epoch": 0.32599401453612653,
"grad_norm": 4.207914352416992,
"learning_rate": 1.860332541567696e-05,
"loss": 0.3386,
"step": 1525
},
{
"epoch": 0.3270628473706712,
"grad_norm": 5.669225692749023,
"learning_rate": 1.859144893111639e-05,
"loss": 0.4546,
"step": 1530
},
{
"epoch": 0.3281316802052159,
"grad_norm": 6.5213775634765625,
"learning_rate": 1.857957244655582e-05,
"loss": 0.4147,
"step": 1535
},
{
"epoch": 0.32920051303976056,
"grad_norm": 5.153679370880127,
"learning_rate": 1.8567695961995252e-05,
"loss": 0.403,
"step": 1540
},
{
"epoch": 0.33026934587430523,
"grad_norm": 5.655941009521484,
"learning_rate": 1.855581947743468e-05,
"loss": 0.4155,
"step": 1545
},
{
"epoch": 0.33133817870884996,
"grad_norm": 6.6513895988464355,
"learning_rate": 1.854394299287411e-05,
"loss": 0.4831,
"step": 1550
},
{
"epoch": 0.33240701154339464,
"grad_norm": 5.994706153869629,
"learning_rate": 1.853206650831354e-05,
"loss": 0.388,
"step": 1555
},
{
"epoch": 0.3334758443779393,
"grad_norm": 4.132383346557617,
"learning_rate": 1.852019002375297e-05,
"loss": 0.4184,
"step": 1560
},
{
"epoch": 0.334544677212484,
"grad_norm": 4.975070953369141,
"learning_rate": 1.8508313539192398e-05,
"loss": 0.3645,
"step": 1565
},
{
"epoch": 0.33561351004702866,
"grad_norm": 6.475266933441162,
"learning_rate": 1.8496437054631832e-05,
"loss": 0.4653,
"step": 1570
},
{
"epoch": 0.33668234288157334,
"grad_norm": 5.302603244781494,
"learning_rate": 1.848456057007126e-05,
"loss": 0.5196,
"step": 1575
},
{
"epoch": 0.337751175716118,
"grad_norm": 6.404365539550781,
"learning_rate": 1.847268408551069e-05,
"loss": 0.5252,
"step": 1580
},
{
"epoch": 0.3388200085506627,
"grad_norm": 5.3015923500061035,
"learning_rate": 1.846080760095012e-05,
"loss": 0.6971,
"step": 1585
},
{
"epoch": 0.33988884138520736,
"grad_norm": 6.321039199829102,
"learning_rate": 1.844893111638955e-05,
"loss": 0.3881,
"step": 1590
},
{
"epoch": 0.34095767421975204,
"grad_norm": 4.614476680755615,
"learning_rate": 1.843705463182898e-05,
"loss": 0.4302,
"step": 1595
},
{
"epoch": 0.3420265070542967,
"grad_norm": 5.174408912658691,
"learning_rate": 1.842517814726841e-05,
"loss": 0.3701,
"step": 1600
},
{
"epoch": 0.3430953398888414,
"grad_norm": 4.7469706535339355,
"learning_rate": 1.8413301662707842e-05,
"loss": 0.3893,
"step": 1605
},
{
"epoch": 0.34416417272338606,
"grad_norm": 5.967380046844482,
"learning_rate": 1.840142517814727e-05,
"loss": 0.4246,
"step": 1610
},
{
"epoch": 0.34523300555793074,
"grad_norm": 4.841580867767334,
"learning_rate": 1.83895486935867e-05,
"loss": 0.3006,
"step": 1615
},
{
"epoch": 0.3463018383924754,
"grad_norm": 5.739339351654053,
"learning_rate": 1.837767220902613e-05,
"loss": 0.7078,
"step": 1620
},
{
"epoch": 0.3473706712270201,
"grad_norm": 5.888680458068848,
"learning_rate": 1.836579572446556e-05,
"loss": 0.375,
"step": 1625
},
{
"epoch": 0.34843950406156476,
"grad_norm": 6.077122211456299,
"learning_rate": 1.835391923990499e-05,
"loss": 0.4425,
"step": 1630
},
{
"epoch": 0.34950833689610944,
"grad_norm": 6.087640762329102,
"learning_rate": 1.834204275534442e-05,
"loss": 0.3841,
"step": 1635
},
{
"epoch": 0.3505771697306541,
"grad_norm": 7.3536529541015625,
"learning_rate": 1.833016627078385e-05,
"loss": 0.4075,
"step": 1640
},
{
"epoch": 0.3516460025651988,
"grad_norm": 6.833067893981934,
"learning_rate": 1.831828978622328e-05,
"loss": 0.5383,
"step": 1645
},
{
"epoch": 0.35271483539974346,
"grad_norm": 5.849217414855957,
"learning_rate": 1.8306413301662707e-05,
"loss": 0.4623,
"step": 1650
},
{
"epoch": 0.35378366823428814,
"grad_norm": 5.285182952880859,
"learning_rate": 1.829453681710214e-05,
"loss": 0.3986,
"step": 1655
},
{
"epoch": 0.3548525010688328,
"grad_norm": 5.706902980804443,
"learning_rate": 1.8282660332541568e-05,
"loss": 0.4038,
"step": 1660
},
{
"epoch": 0.3559213339033775,
"grad_norm": 4.221705436706543,
"learning_rate": 1.8270783847981e-05,
"loss": 0.5615,
"step": 1665
},
{
"epoch": 0.35699016673792217,
"grad_norm": 6.5307745933532715,
"learning_rate": 1.825890736342043e-05,
"loss": 0.5535,
"step": 1670
},
{
"epoch": 0.35805899957246684,
"grad_norm": 5.936892509460449,
"learning_rate": 1.824703087885986e-05,
"loss": 0.3316,
"step": 1675
},
{
"epoch": 0.3591278324070115,
"grad_norm": 4.413790702819824,
"learning_rate": 1.8235154394299287e-05,
"loss": 0.3916,
"step": 1680
},
{
"epoch": 0.36019666524155625,
"grad_norm": 5.399665355682373,
"learning_rate": 1.8223277909738718e-05,
"loss": 0.3723,
"step": 1685
},
{
"epoch": 0.3612654980761009,
"grad_norm": 8.413554191589355,
"learning_rate": 1.8211401425178148e-05,
"loss": 0.5188,
"step": 1690
},
{
"epoch": 0.3623343309106456,
"grad_norm": 3.7601664066314697,
"learning_rate": 1.819952494061758e-05,
"loss": 0.3817,
"step": 1695
},
{
"epoch": 0.36340316374519027,
"grad_norm": 5.661569595336914,
"learning_rate": 1.818764845605701e-05,
"loss": 0.4036,
"step": 1700
},
{
"epoch": 0.36447199657973495,
"grad_norm": 6.07588005065918,
"learning_rate": 1.817577197149644e-05,
"loss": 0.4224,
"step": 1705
},
{
"epoch": 0.3655408294142796,
"grad_norm": 5.329127311706543,
"learning_rate": 1.8163895486935867e-05,
"loss": 0.4171,
"step": 1710
},
{
"epoch": 0.3666096622488243,
"grad_norm": 7.156865119934082,
"learning_rate": 1.8152019002375298e-05,
"loss": 0.4122,
"step": 1715
},
{
"epoch": 0.367678495083369,
"grad_norm": 5.72195291519165,
"learning_rate": 1.8140142517814728e-05,
"loss": 0.4024,
"step": 1720
},
{
"epoch": 0.36874732791791365,
"grad_norm": 4.991401672363281,
"learning_rate": 1.812826603325416e-05,
"loss": 0.3882,
"step": 1725
},
{
"epoch": 0.3698161607524583,
"grad_norm": 4.662073612213135,
"learning_rate": 1.811638954869359e-05,
"loss": 0.3282,
"step": 1730
},
{
"epoch": 0.370884993587003,
"grad_norm": 5.966677188873291,
"learning_rate": 1.8104513064133016e-05,
"loss": 0.2961,
"step": 1735
},
{
"epoch": 0.3719538264215477,
"grad_norm": 5.708690166473389,
"learning_rate": 1.809263657957245e-05,
"loss": 0.4568,
"step": 1740
},
{
"epoch": 0.37302265925609235,
"grad_norm": 5.69785213470459,
"learning_rate": 1.8080760095011877e-05,
"loss": 0.3544,
"step": 1745
},
{
"epoch": 0.374091492090637,
"grad_norm": 6.101360321044922,
"learning_rate": 1.8068883610451308e-05,
"loss": 0.4282,
"step": 1750
},
{
"epoch": 0.3751603249251817,
"grad_norm": 6.585791110992432,
"learning_rate": 1.805700712589074e-05,
"loss": 0.3798,
"step": 1755
},
{
"epoch": 0.3762291577597264,
"grad_norm": 5.618402481079102,
"learning_rate": 1.804513064133017e-05,
"loss": 0.4618,
"step": 1760
},
{
"epoch": 0.37729799059427105,
"grad_norm": 6.637610912322998,
"learning_rate": 1.8033254156769596e-05,
"loss": 0.3876,
"step": 1765
},
{
"epoch": 0.3783668234288157,
"grad_norm": 4.9898295402526855,
"learning_rate": 1.8021377672209027e-05,
"loss": 0.3393,
"step": 1770
},
{
"epoch": 0.3794356562633604,
"grad_norm": 6.182595252990723,
"learning_rate": 1.8009501187648457e-05,
"loss": 0.4276,
"step": 1775
},
{
"epoch": 0.3805044890979051,
"grad_norm": 5.460147380828857,
"learning_rate": 1.7997624703087888e-05,
"loss": 0.4641,
"step": 1780
},
{
"epoch": 0.38157332193244975,
"grad_norm": 4.0731940269470215,
"learning_rate": 1.798574821852732e-05,
"loss": 0.4248,
"step": 1785
},
{
"epoch": 0.3826421547669944,
"grad_norm": 3.6468496322631836,
"learning_rate": 1.797387173396675e-05,
"loss": 0.3601,
"step": 1790
},
{
"epoch": 0.3837109876015391,
"grad_norm": 3.701404094696045,
"learning_rate": 1.7961995249406176e-05,
"loss": 0.3384,
"step": 1795
},
{
"epoch": 0.3847798204360838,
"grad_norm": 6.082109451293945,
"learning_rate": 1.7950118764845607e-05,
"loss": 0.3598,
"step": 1800
},
{
"epoch": 0.38584865327062845,
"grad_norm": 4.901666164398193,
"learning_rate": 1.7938242280285037e-05,
"loss": 0.4363,
"step": 1805
},
{
"epoch": 0.3869174861051731,
"grad_norm": 3.848799467086792,
"learning_rate": 1.7926365795724468e-05,
"loss": 0.335,
"step": 1810
},
{
"epoch": 0.38798631893971786,
"grad_norm": 4.457520484924316,
"learning_rate": 1.7914489311163895e-05,
"loss": 0.3892,
"step": 1815
},
{
"epoch": 0.38905515177426253,
"grad_norm": 6.423126697540283,
"learning_rate": 1.7902612826603326e-05,
"loss": 0.4103,
"step": 1820
},
{
"epoch": 0.3901239846088072,
"grad_norm": 5.50001335144043,
"learning_rate": 1.7890736342042756e-05,
"loss": 0.3959,
"step": 1825
},
{
"epoch": 0.3911928174433519,
"grad_norm": 3.85994553565979,
"learning_rate": 1.7878859857482187e-05,
"loss": 0.3593,
"step": 1830
},
{
"epoch": 0.39226165027789656,
"grad_norm": 6.009896278381348,
"learning_rate": 1.7866983372921617e-05,
"loss": 0.4366,
"step": 1835
},
{
"epoch": 0.39333048311244123,
"grad_norm": 4.844223499298096,
"learning_rate": 1.7855106888361048e-05,
"loss": 0.3633,
"step": 1840
},
{
"epoch": 0.3943993159469859,
"grad_norm": 5.032964706420898,
"learning_rate": 1.7843230403800475e-05,
"loss": 0.4333,
"step": 1845
},
{
"epoch": 0.3954681487815306,
"grad_norm": 5.1685872077941895,
"learning_rate": 1.7831353919239905e-05,
"loss": 0.4825,
"step": 1850
},
{
"epoch": 0.39653698161607526,
"grad_norm": 5.741828918457031,
"learning_rate": 1.7819477434679336e-05,
"loss": 0.3041,
"step": 1855
},
{
"epoch": 0.39760581445061993,
"grad_norm": 5.440220832824707,
"learning_rate": 1.7807600950118767e-05,
"loss": 0.3416,
"step": 1860
},
{
"epoch": 0.3986746472851646,
"grad_norm": 4.476759433746338,
"learning_rate": 1.7795724465558197e-05,
"loss": 0.4578,
"step": 1865
},
{
"epoch": 0.3997434801197093,
"grad_norm": 6.7310991287231445,
"learning_rate": 1.7783847980997628e-05,
"loss": 0.5441,
"step": 1870
},
{
"epoch": 0.40081231295425396,
"grad_norm": 5.929594993591309,
"learning_rate": 1.7771971496437058e-05,
"loss": 0.5124,
"step": 1875
},
{
"epoch": 0.40188114578879863,
"grad_norm": 4.516419410705566,
"learning_rate": 1.7760095011876485e-05,
"loss": 0.4026,
"step": 1880
},
{
"epoch": 0.4029499786233433,
"grad_norm": 5.7698798179626465,
"learning_rate": 1.7748218527315916e-05,
"loss": 0.4537,
"step": 1885
},
{
"epoch": 0.404018811457888,
"grad_norm": 4.604269027709961,
"learning_rate": 1.7736342042755346e-05,
"loss": 0.3983,
"step": 1890
},
{
"epoch": 0.40508764429243266,
"grad_norm": 6.4217610359191895,
"learning_rate": 1.7724465558194777e-05,
"loss": 0.3894,
"step": 1895
},
{
"epoch": 0.40615647712697733,
"grad_norm": 5.296751022338867,
"learning_rate": 1.7712589073634204e-05,
"loss": 0.4747,
"step": 1900
},
{
"epoch": 0.407225309961522,
"grad_norm": 4.870068550109863,
"learning_rate": 1.7700712589073638e-05,
"loss": 0.4042,
"step": 1905
},
{
"epoch": 0.4082941427960667,
"grad_norm": 4.312191486358643,
"learning_rate": 1.7688836104513065e-05,
"loss": 0.3477,
"step": 1910
},
{
"epoch": 0.40936297563061136,
"grad_norm": 5.281498432159424,
"learning_rate": 1.7676959619952496e-05,
"loss": 0.4512,
"step": 1915
},
{
"epoch": 0.41043180846515603,
"grad_norm": 4.401067733764648,
"learning_rate": 1.7665083135391926e-05,
"loss": 0.3451,
"step": 1920
},
{
"epoch": 0.4115006412997007,
"grad_norm": 5.28626012802124,
"learning_rate": 1.7653206650831357e-05,
"loss": 0.3983,
"step": 1925
},
{
"epoch": 0.4125694741342454,
"grad_norm": 5.951436519622803,
"learning_rate": 1.7641330166270784e-05,
"loss": 0.62,
"step": 1930
},
{
"epoch": 0.41363830696879006,
"grad_norm": 3.4126088619232178,
"learning_rate": 1.7629453681710215e-05,
"loss": 0.4632,
"step": 1935
},
{
"epoch": 0.41470713980333473,
"grad_norm": 4.540611267089844,
"learning_rate": 1.7617577197149645e-05,
"loss": 0.3757,
"step": 1940
},
{
"epoch": 0.4157759726378794,
"grad_norm": 5.913720607757568,
"learning_rate": 1.7605700712589076e-05,
"loss": 0.4752,
"step": 1945
},
{
"epoch": 0.41684480547242414,
"grad_norm": 4.386907577514648,
"learning_rate": 1.7593824228028503e-05,
"loss": 0.453,
"step": 1950
},
{
"epoch": 0.4179136383069688,
"grad_norm": 4.836590766906738,
"learning_rate": 1.7581947743467937e-05,
"loss": 0.4348,
"step": 1955
},
{
"epoch": 0.4189824711415135,
"grad_norm": 4.215417861938477,
"learning_rate": 1.7570071258907364e-05,
"loss": 0.3944,
"step": 1960
},
{
"epoch": 0.42005130397605817,
"grad_norm": 5.9303789138793945,
"learning_rate": 1.7558194774346795e-05,
"loss": 0.3702,
"step": 1965
},
{
"epoch": 0.42112013681060284,
"grad_norm": 5.648311138153076,
"learning_rate": 1.7546318289786225e-05,
"loss": 0.3888,
"step": 1970
},
{
"epoch": 0.4221889696451475,
"grad_norm": 5.413701057434082,
"learning_rate": 1.7534441805225656e-05,
"loss": 0.3968,
"step": 1975
},
{
"epoch": 0.4232578024796922,
"grad_norm": 4.331090450286865,
"learning_rate": 1.7522565320665083e-05,
"loss": 0.3609,
"step": 1980
},
{
"epoch": 0.42432663531423687,
"grad_norm": 4.991115093231201,
"learning_rate": 1.7510688836104513e-05,
"loss": 0.4328,
"step": 1985
},
{
"epoch": 0.42539546814878154,
"grad_norm": 5.451033115386963,
"learning_rate": 1.7498812351543944e-05,
"loss": 0.516,
"step": 1990
},
{
"epoch": 0.4264643009833262,
"grad_norm": 5.011542320251465,
"learning_rate": 1.7486935866983374e-05,
"loss": 0.3605,
"step": 1995
},
{
"epoch": 0.4275331338178709,
"grad_norm": 5.4983086585998535,
"learning_rate": 1.74750593824228e-05,
"loss": 0.3094,
"step": 2000
},
{
"epoch": 0.42860196665241557,
"grad_norm": 5.928680896759033,
"learning_rate": 1.7463182897862236e-05,
"loss": 0.3866,
"step": 2005
},
{
"epoch": 0.42967079948696024,
"grad_norm": 4.630986213684082,
"learning_rate": 1.7451306413301666e-05,
"loss": 0.3943,
"step": 2010
},
{
"epoch": 0.4307396323215049,
"grad_norm": 4.091104030609131,
"learning_rate": 1.7439429928741093e-05,
"loss": 0.3931,
"step": 2015
},
{
"epoch": 0.4318084651560496,
"grad_norm": 6.031238555908203,
"learning_rate": 1.7427553444180524e-05,
"loss": 0.3728,
"step": 2020
},
{
"epoch": 0.43287729799059427,
"grad_norm": 4.81741189956665,
"learning_rate": 1.7415676959619954e-05,
"loss": 0.2953,
"step": 2025
},
{
"epoch": 0.43394613082513894,
"grad_norm": 5.144311904907227,
"learning_rate": 1.7403800475059385e-05,
"loss": 0.3547,
"step": 2030
},
{
"epoch": 0.4350149636596836,
"grad_norm": 4.806643009185791,
"learning_rate": 1.7391923990498812e-05,
"loss": 0.4275,
"step": 2035
},
{
"epoch": 0.4360837964942283,
"grad_norm": 4.138782501220703,
"learning_rate": 1.7380047505938246e-05,
"loss": 0.2959,
"step": 2040
},
{
"epoch": 0.43715262932877297,
"grad_norm": 5.7593255043029785,
"learning_rate": 1.7368171021377673e-05,
"loss": 0.3245,
"step": 2045
},
{
"epoch": 0.43822146216331764,
"grad_norm": 4.043095588684082,
"learning_rate": 1.7356294536817104e-05,
"loss": 0.4148,
"step": 2050
},
{
"epoch": 0.4392902949978623,
"grad_norm": 4.848685264587402,
"learning_rate": 1.7344418052256534e-05,
"loss": 0.3542,
"step": 2055
},
{
"epoch": 0.440359127832407,
"grad_norm": 5.738672256469727,
"learning_rate": 1.7332541567695965e-05,
"loss": 0.4493,
"step": 2060
},
{
"epoch": 0.44142796066695167,
"grad_norm": 4.470565319061279,
"learning_rate": 1.7320665083135392e-05,
"loss": 0.376,
"step": 2065
},
{
"epoch": 0.44249679350149634,
"grad_norm": 4.22749137878418,
"learning_rate": 1.7308788598574823e-05,
"loss": 0.322,
"step": 2070
},
{
"epoch": 0.443565626336041,
"grad_norm": 5.158305644989014,
"learning_rate": 1.7296912114014253e-05,
"loss": 0.3227,
"step": 2075
},
{
"epoch": 0.4446344591705857,
"grad_norm": 6.257720947265625,
"learning_rate": 1.7285035629453684e-05,
"loss": 0.3046,
"step": 2080
},
{
"epoch": 0.4457032920051304,
"grad_norm": 5.981179237365723,
"learning_rate": 1.727315914489311e-05,
"loss": 0.3064,
"step": 2085
},
{
"epoch": 0.4467721248396751,
"grad_norm": 5.584667682647705,
"learning_rate": 1.7261282660332545e-05,
"loss": 0.3199,
"step": 2090
},
{
"epoch": 0.4478409576742198,
"grad_norm": 5.660790920257568,
"learning_rate": 1.7249406175771972e-05,
"loss": 0.3774,
"step": 2095
},
{
"epoch": 0.44890979050876445,
"grad_norm": 4.129720687866211,
"learning_rate": 1.7237529691211402e-05,
"loss": 0.3212,
"step": 2100
},
{
"epoch": 0.4499786233433091,
"grad_norm": 3.2054107189178467,
"learning_rate": 1.7225653206650833e-05,
"loss": 0.3302,
"step": 2105
},
{
"epoch": 0.4510474561778538,
"grad_norm": 3.934522867202759,
"learning_rate": 1.7213776722090264e-05,
"loss": 0.3205,
"step": 2110
},
{
"epoch": 0.4521162890123985,
"grad_norm": 5.592263221740723,
"learning_rate": 1.720190023752969e-05,
"loss": 0.3673,
"step": 2115
},
{
"epoch": 0.45318512184694315,
"grad_norm": 5.707674026489258,
"learning_rate": 1.719002375296912e-05,
"loss": 0.3752,
"step": 2120
},
{
"epoch": 0.4542539546814878,
"grad_norm": 4.284328937530518,
"learning_rate": 1.7178147268408552e-05,
"loss": 0.3192,
"step": 2125
},
{
"epoch": 0.4553227875160325,
"grad_norm": 4.87931489944458,
"learning_rate": 1.7166270783847982e-05,
"loss": 0.3372,
"step": 2130
},
{
"epoch": 0.4563916203505772,
"grad_norm": 5.3206048011779785,
"learning_rate": 1.7154394299287413e-05,
"loss": 0.321,
"step": 2135
},
{
"epoch": 0.45746045318512185,
"grad_norm": 5.118194103240967,
"learning_rate": 1.7142517814726843e-05,
"loss": 0.4086,
"step": 2140
},
{
"epoch": 0.4585292860196665,
"grad_norm": 5.390005111694336,
"learning_rate": 1.7130641330166274e-05,
"loss": 0.4092,
"step": 2145
},
{
"epoch": 0.4595981188542112,
"grad_norm": 6.221261978149414,
"learning_rate": 1.71187648456057e-05,
"loss": 0.4591,
"step": 2150
},
{
"epoch": 0.4606669516887559,
"grad_norm": 4.9464497566223145,
"learning_rate": 1.7106888361045132e-05,
"loss": 0.503,
"step": 2155
},
{
"epoch": 0.46173578452330055,
"grad_norm": 6.745388984680176,
"learning_rate": 1.7095011876484562e-05,
"loss": 0.4767,
"step": 2160
},
{
"epoch": 0.4628046173578452,
"grad_norm": 5.506555080413818,
"learning_rate": 1.7083135391923993e-05,
"loss": 0.3425,
"step": 2165
},
{
"epoch": 0.4638734501923899,
"grad_norm": 5.21577787399292,
"learning_rate": 1.707125890736342e-05,
"loss": 0.372,
"step": 2170
},
{
"epoch": 0.4649422830269346,
"grad_norm": 4.69103479385376,
"learning_rate": 1.7059382422802854e-05,
"loss": 0.4671,
"step": 2175
},
{
"epoch": 0.46601111586147925,
"grad_norm": 4.060796737670898,
"learning_rate": 1.704750593824228e-05,
"loss": 0.3767,
"step": 2180
},
{
"epoch": 0.4670799486960239,
"grad_norm": 6.448695659637451,
"learning_rate": 1.703562945368171e-05,
"loss": 0.3096,
"step": 2185
},
{
"epoch": 0.4681487815305686,
"grad_norm": 4.255459308624268,
"learning_rate": 1.7023752969121142e-05,
"loss": 0.3654,
"step": 2190
},
{
"epoch": 0.4692176143651133,
"grad_norm": 5.383869647979736,
"learning_rate": 1.7011876484560573e-05,
"loss": 0.4243,
"step": 2195
},
{
"epoch": 0.47028644719965795,
"grad_norm": 4.97196102142334,
"learning_rate": 1.7e-05,
"loss": 0.4411,
"step": 2200
},
{
"epoch": 0.47135528003420263,
"grad_norm": 4.9628071784973145,
"learning_rate": 1.698812351543943e-05,
"loss": 0.5001,
"step": 2205
},
{
"epoch": 0.4724241128687473,
"grad_norm": 5.05242919921875,
"learning_rate": 1.697624703087886e-05,
"loss": 0.3485,
"step": 2210
},
{
"epoch": 0.473492945703292,
"grad_norm": 4.373459339141846,
"learning_rate": 1.696437054631829e-05,
"loss": 0.3546,
"step": 2215
},
{
"epoch": 0.4745617785378367,
"grad_norm": 5.0651397705078125,
"learning_rate": 1.6952494061757722e-05,
"loss": 0.4037,
"step": 2220
},
{
"epoch": 0.4756306113723814,
"grad_norm": 6.026737213134766,
"learning_rate": 1.6940617577197153e-05,
"loss": 0.3114,
"step": 2225
},
{
"epoch": 0.47669944420692606,
"grad_norm": 4.404332160949707,
"learning_rate": 1.692874109263658e-05,
"loss": 0.3422,
"step": 2230
},
{
"epoch": 0.47776827704147073,
"grad_norm": 5.780966281890869,
"learning_rate": 1.691686460807601e-05,
"loss": 0.4241,
"step": 2235
},
{
"epoch": 0.4788371098760154,
"grad_norm": 5.648661136627197,
"learning_rate": 1.690498812351544e-05,
"loss": 0.3912,
"step": 2240
},
{
"epoch": 0.4799059427105601,
"grad_norm": 3.616197109222412,
"learning_rate": 1.689311163895487e-05,
"loss": 0.3748,
"step": 2245
},
{
"epoch": 0.48097477554510476,
"grad_norm": 4.634115219116211,
"learning_rate": 1.68812351543943e-05,
"loss": 0.3746,
"step": 2250
},
{
"epoch": 0.48204360837964944,
"grad_norm": 4.268435478210449,
"learning_rate": 1.686935866983373e-05,
"loss": 0.3544,
"step": 2255
},
{
"epoch": 0.4831124412141941,
"grad_norm": 4.208693504333496,
"learning_rate": 1.685748218527316e-05,
"loss": 0.3246,
"step": 2260
},
{
"epoch": 0.4841812740487388,
"grad_norm": 7.521546840667725,
"learning_rate": 1.684560570071259e-05,
"loss": 0.3739,
"step": 2265
},
{
"epoch": 0.48525010688328346,
"grad_norm": 5.12343692779541,
"learning_rate": 1.683372921615202e-05,
"loss": 0.3606,
"step": 2270
},
{
"epoch": 0.48631893971782814,
"grad_norm": 6.54265022277832,
"learning_rate": 1.682185273159145e-05,
"loss": 0.3891,
"step": 2275
},
{
"epoch": 0.4873877725523728,
"grad_norm": 4.471118450164795,
"learning_rate": 1.680997624703088e-05,
"loss": 0.2855,
"step": 2280
},
{
"epoch": 0.4884566053869175,
"grad_norm": 7.488130569458008,
"learning_rate": 1.679809976247031e-05,
"loss": 0.4829,
"step": 2285
},
{
"epoch": 0.48952543822146216,
"grad_norm": 6.3466033935546875,
"learning_rate": 1.678622327790974e-05,
"loss": 0.4529,
"step": 2290
},
{
"epoch": 0.49059427105600684,
"grad_norm": 7.353418350219727,
"learning_rate": 1.677434679334917e-05,
"loss": 0.4819,
"step": 2295
},
{
"epoch": 0.4916631038905515,
"grad_norm": 4.575865745544434,
"learning_rate": 1.67624703087886e-05,
"loss": 0.3453,
"step": 2300
},
{
"epoch": 0.4927319367250962,
"grad_norm": 4.988368511199951,
"learning_rate": 1.675059382422803e-05,
"loss": 0.3425,
"step": 2305
},
{
"epoch": 0.49380076955964086,
"grad_norm": 5.05146598815918,
"learning_rate": 1.6738717339667462e-05,
"loss": 0.2884,
"step": 2310
},
{
"epoch": 0.49486960239418554,
"grad_norm": 6.10252571105957,
"learning_rate": 1.672684085510689e-05,
"loss": 0.3184,
"step": 2315
},
{
"epoch": 0.4959384352287302,
"grad_norm": 5.356700420379639,
"learning_rate": 1.671496437054632e-05,
"loss": 0.3043,
"step": 2320
},
{
"epoch": 0.4970072680632749,
"grad_norm": 4.550732135772705,
"learning_rate": 1.670308788598575e-05,
"loss": 0.3746,
"step": 2325
},
{
"epoch": 0.49807610089781956,
"grad_norm": 4.781940937042236,
"learning_rate": 1.669121140142518e-05,
"loss": 0.4023,
"step": 2330
},
{
"epoch": 0.49914493373236424,
"grad_norm": 3.1689300537109375,
"learning_rate": 1.6679334916864608e-05,
"loss": 0.2994,
"step": 2335
},
{
"epoch": 0.500213766566909,
"grad_norm": 5.919034004211426,
"learning_rate": 1.6667458432304042e-05,
"loss": 0.3858,
"step": 2340
},
{
"epoch": 0.5012825994014536,
"grad_norm": 4.044144153594971,
"learning_rate": 1.665558194774347e-05,
"loss": 0.3488,
"step": 2345
},
{
"epoch": 0.5023514322359983,
"grad_norm": 5.063786506652832,
"learning_rate": 1.66437054631829e-05,
"loss": 0.4467,
"step": 2350
},
{
"epoch": 0.5034202650705429,
"grad_norm": 4.159796237945557,
"learning_rate": 1.663182897862233e-05,
"loss": 0.3199,
"step": 2355
},
{
"epoch": 0.5044890979050877,
"grad_norm": 4.232370853424072,
"learning_rate": 1.661995249406176e-05,
"loss": 0.3124,
"step": 2360
},
{
"epoch": 0.5055579307396323,
"grad_norm": 3.8301782608032227,
"learning_rate": 1.6608076009501188e-05,
"loss": 0.3229,
"step": 2365
},
{
"epoch": 0.506626763574177,
"grad_norm": 5.729179382324219,
"learning_rate": 1.6596199524940618e-05,
"loss": 0.3416,
"step": 2370
},
{
"epoch": 0.5076955964087216,
"grad_norm": 4.137636184692383,
"learning_rate": 1.658432304038005e-05,
"loss": 0.3555,
"step": 2375
},
{
"epoch": 0.5087644292432664,
"grad_norm": 6.014377593994141,
"learning_rate": 1.657244655581948e-05,
"loss": 0.3078,
"step": 2380
},
{
"epoch": 0.509833262077811,
"grad_norm": 5.031920909881592,
"learning_rate": 1.6560570071258906e-05,
"loss": 0.3105,
"step": 2385
},
{
"epoch": 0.5109020949123557,
"grad_norm": 4.162966728210449,
"learning_rate": 1.654869358669834e-05,
"loss": 0.3565,
"step": 2390
},
{
"epoch": 0.5119709277469003,
"grad_norm": 5.57382345199585,
"learning_rate": 1.6536817102137768e-05,
"loss": 0.3113,
"step": 2395
},
{
"epoch": 0.5130397605814451,
"grad_norm": 6.443201065063477,
"learning_rate": 1.6524940617577198e-05,
"loss": 0.4326,
"step": 2400
},
{
"epoch": 0.5141085934159897,
"grad_norm": 7.050893306732178,
"learning_rate": 1.651306413301663e-05,
"loss": 0.4395,
"step": 2405
},
{
"epoch": 0.5151774262505344,
"grad_norm": 4.315305709838867,
"learning_rate": 1.650118764845606e-05,
"loss": 0.3549,
"step": 2410
},
{
"epoch": 0.516246259085079,
"grad_norm": 3.76841402053833,
"learning_rate": 1.6489311163895486e-05,
"loss": 0.4271,
"step": 2415
},
{
"epoch": 0.5173150919196238,
"grad_norm": 4.878926753997803,
"learning_rate": 1.6477434679334917e-05,
"loss": 0.3136,
"step": 2420
},
{
"epoch": 0.5183839247541685,
"grad_norm": 4.831075668334961,
"learning_rate": 1.646555819477435e-05,
"loss": 0.3235,
"step": 2425
},
{
"epoch": 0.5194527575887131,
"grad_norm": 4.886428356170654,
"learning_rate": 1.6453681710213778e-05,
"loss": 0.2909,
"step": 2430
},
{
"epoch": 0.5205215904232579,
"grad_norm": 5.281339645385742,
"learning_rate": 1.644180522565321e-05,
"loss": 0.3296,
"step": 2435
},
{
"epoch": 0.5215904232578025,
"grad_norm": 4.9752516746521,
"learning_rate": 1.642992874109264e-05,
"loss": 0.4124,
"step": 2440
},
{
"epoch": 0.5226592560923472,
"grad_norm": 5.5705952644348145,
"learning_rate": 1.641805225653207e-05,
"loss": 0.4444,
"step": 2445
},
{
"epoch": 0.5237280889268918,
"grad_norm": 4.4641499519348145,
"learning_rate": 1.6406175771971497e-05,
"loss": 0.3649,
"step": 2450
},
{
"epoch": 0.5247969217614366,
"grad_norm": 4.909672260284424,
"learning_rate": 1.6394299287410927e-05,
"loss": 0.3897,
"step": 2455
},
{
"epoch": 0.5258657545959812,
"grad_norm": 5.340948581695557,
"learning_rate": 1.6382422802850358e-05,
"loss": 0.36,
"step": 2460
},
{
"epoch": 0.5269345874305259,
"grad_norm": 5.204975128173828,
"learning_rate": 1.637054631828979e-05,
"loss": 0.3899,
"step": 2465
},
{
"epoch": 0.5280034202650705,
"grad_norm": 5.030284881591797,
"learning_rate": 1.6358669833729216e-05,
"loss": 0.3303,
"step": 2470
},
{
"epoch": 0.5290722530996153,
"grad_norm": 3.7952535152435303,
"learning_rate": 1.634679334916865e-05,
"loss": 0.3115,
"step": 2475
},
{
"epoch": 0.5301410859341599,
"grad_norm": 5.823569297790527,
"learning_rate": 1.6334916864608077e-05,
"loss": 0.4637,
"step": 2480
},
{
"epoch": 0.5312099187687046,
"grad_norm": 6.1813483238220215,
"learning_rate": 1.6323040380047507e-05,
"loss": 0.4402,
"step": 2485
},
{
"epoch": 0.5322787516032492,
"grad_norm": 3.668980360031128,
"learning_rate": 1.6311163895486938e-05,
"loss": 0.2825,
"step": 2490
},
{
"epoch": 0.533347584437794,
"grad_norm": 4.954606056213379,
"learning_rate": 1.629928741092637e-05,
"loss": 0.3389,
"step": 2495
},
{
"epoch": 0.5344164172723386,
"grad_norm": 4.136919021606445,
"learning_rate": 1.6287410926365796e-05,
"loss": 0.3513,
"step": 2500
},
{
"epoch": 0.5354852501068833,
"grad_norm": 5.383963108062744,
"learning_rate": 1.6275534441805226e-05,
"loss": 0.4301,
"step": 2505
},
{
"epoch": 0.5365540829414279,
"grad_norm": 4.818902015686035,
"learning_rate": 1.6263657957244657e-05,
"loss": 0.3733,
"step": 2510
},
{
"epoch": 0.5376229157759727,
"grad_norm": 4.797301769256592,
"learning_rate": 1.6251781472684087e-05,
"loss": 0.3241,
"step": 2515
},
{
"epoch": 0.5386917486105173,
"grad_norm": 5.040024757385254,
"learning_rate": 1.6239904988123514e-05,
"loss": 0.3457,
"step": 2520
},
{
"epoch": 0.539760581445062,
"grad_norm": 5.214640140533447,
"learning_rate": 1.622802850356295e-05,
"loss": 0.4533,
"step": 2525
},
{
"epoch": 0.5408294142796066,
"grad_norm": 3.6819052696228027,
"learning_rate": 1.6216152019002375e-05,
"loss": 0.3397,
"step": 2530
},
{
"epoch": 0.5418982471141514,
"grad_norm": 4.882740020751953,
"learning_rate": 1.6204275534441806e-05,
"loss": 0.4097,
"step": 2535
},
{
"epoch": 0.542967079948696,
"grad_norm": 4.784149646759033,
"learning_rate": 1.6192399049881237e-05,
"loss": 0.3083,
"step": 2540
},
{
"epoch": 0.5440359127832407,
"grad_norm": 5.621673107147217,
"learning_rate": 1.6180522565320667e-05,
"loss": 0.3199,
"step": 2545
},
{
"epoch": 0.5451047456177853,
"grad_norm": 5.204516887664795,
"learning_rate": 1.6168646080760094e-05,
"loss": 0.3971,
"step": 2550
},
{
"epoch": 0.54617357845233,
"grad_norm": 4.5771026611328125,
"learning_rate": 1.6156769596199525e-05,
"loss": 0.5229,
"step": 2555
},
{
"epoch": 0.5472424112868748,
"grad_norm": 5.919792652130127,
"learning_rate": 1.6144893111638955e-05,
"loss": 0.3765,
"step": 2560
},
{
"epoch": 0.5483112441214194,
"grad_norm": 4.573512554168701,
"learning_rate": 1.6133016627078386e-05,
"loss": 0.3727,
"step": 2565
},
{
"epoch": 0.5493800769559641,
"grad_norm": 3.752349615097046,
"learning_rate": 1.6121140142517816e-05,
"loss": 0.3252,
"step": 2570
},
{
"epoch": 0.5504489097905088,
"grad_norm": 3.7579493522644043,
"learning_rate": 1.6109263657957247e-05,
"loss": 0.2897,
"step": 2575
},
{
"epoch": 0.5515177426250535,
"grad_norm": 3.408615827560425,
"learning_rate": 1.6097387173396678e-05,
"loss": 0.2867,
"step": 2580
},
{
"epoch": 0.5525865754595981,
"grad_norm": 6.79346227645874,
"learning_rate": 1.6085510688836105e-05,
"loss": 0.3058,
"step": 2585
},
{
"epoch": 0.5536554082941428,
"grad_norm": 4.814434051513672,
"learning_rate": 1.6073634204275535e-05,
"loss": 0.461,
"step": 2590
},
{
"epoch": 0.5547242411286875,
"grad_norm": 4.379047393798828,
"learning_rate": 1.6061757719714966e-05,
"loss": 0.2341,
"step": 2595
},
{
"epoch": 0.5557930739632322,
"grad_norm": 7.072385787963867,
"learning_rate": 1.6049881235154396e-05,
"loss": 0.3446,
"step": 2600
},
{
"epoch": 0.5568619067977768,
"grad_norm": 6.0254411697387695,
"learning_rate": 1.6038004750593824e-05,
"loss": 0.2844,
"step": 2605
},
{
"epoch": 0.5579307396323215,
"grad_norm": 3.961240768432617,
"learning_rate": 1.6026128266033257e-05,
"loss": 0.3414,
"step": 2610
},
{
"epoch": 0.5589995724668662,
"grad_norm": 4.460314750671387,
"learning_rate": 1.6014251781472685e-05,
"loss": 0.3575,
"step": 2615
},
{
"epoch": 0.5600684053014109,
"grad_norm": 4.68889856338501,
"learning_rate": 1.6002375296912115e-05,
"loss": 0.3799,
"step": 2620
},
{
"epoch": 0.5611372381359555,
"grad_norm": 4.315304756164551,
"learning_rate": 1.5990498812351546e-05,
"loss": 0.3553,
"step": 2625
},
{
"epoch": 0.5622060709705002,
"grad_norm": 5.276904582977295,
"learning_rate": 1.5978622327790976e-05,
"loss": 0.3938,
"step": 2630
},
{
"epoch": 0.5632749038050449,
"grad_norm": 6.12239408493042,
"learning_rate": 1.5966745843230403e-05,
"loss": 0.4941,
"step": 2635
},
{
"epoch": 0.5643437366395896,
"grad_norm": 3.896017074584961,
"learning_rate": 1.5954869358669834e-05,
"loss": 0.3454,
"step": 2640
},
{
"epoch": 0.5654125694741342,
"grad_norm": 4.870078086853027,
"learning_rate": 1.5942992874109265e-05,
"loss": 0.3009,
"step": 2645
},
{
"epoch": 0.5664814023086789,
"grad_norm": 4.661655426025391,
"learning_rate": 1.5931116389548695e-05,
"loss": 0.4625,
"step": 2650
},
{
"epoch": 0.5675502351432236,
"grad_norm": 4.946725368499756,
"learning_rate": 1.5919239904988126e-05,
"loss": 0.3426,
"step": 2655
},
{
"epoch": 0.5686190679777683,
"grad_norm": 5.536448955535889,
"learning_rate": 1.5907363420427556e-05,
"loss": 0.3403,
"step": 2660
},
{
"epoch": 0.5696879008123129,
"grad_norm": 4.526655673980713,
"learning_rate": 1.5895486935866983e-05,
"loss": 0.3571,
"step": 2665
},
{
"epoch": 0.5707567336468576,
"grad_norm": 5.318846225738525,
"learning_rate": 1.5883610451306414e-05,
"loss": 0.235,
"step": 2670
},
{
"epoch": 0.5718255664814023,
"grad_norm": 4.3493571281433105,
"learning_rate": 1.5871733966745844e-05,
"loss": 0.3939,
"step": 2675
},
{
"epoch": 0.572894399315947,
"grad_norm": 4.984584331512451,
"learning_rate": 1.5859857482185275e-05,
"loss": 0.3041,
"step": 2680
},
{
"epoch": 0.5739632321504916,
"grad_norm": 5.118055820465088,
"learning_rate": 1.5847980997624702e-05,
"loss": 0.398,
"step": 2685
},
{
"epoch": 0.5750320649850363,
"grad_norm": 3.7693285942077637,
"learning_rate": 1.5836104513064136e-05,
"loss": 0.4327,
"step": 2690
},
{
"epoch": 0.5761008978195811,
"grad_norm": 4.113673210144043,
"learning_rate": 1.5824228028503563e-05,
"loss": 0.3622,
"step": 2695
},
{
"epoch": 0.5771697306541257,
"grad_norm": 4.5102386474609375,
"learning_rate": 1.5812351543942994e-05,
"loss": 0.3461,
"step": 2700
},
{
"epoch": 0.5782385634886704,
"grad_norm": 4.592400074005127,
"learning_rate": 1.5800475059382424e-05,
"loss": 0.409,
"step": 2705
},
{
"epoch": 0.579307396323215,
"grad_norm": 4.869931697845459,
"learning_rate": 1.5788598574821855e-05,
"loss": 0.3561,
"step": 2710
},
{
"epoch": 0.5803762291577598,
"grad_norm": 4.971279621124268,
"learning_rate": 1.5776722090261285e-05,
"loss": 0.3608,
"step": 2715
},
{
"epoch": 0.5814450619923044,
"grad_norm": 4.390021324157715,
"learning_rate": 1.5764845605700713e-05,
"loss": 0.3824,
"step": 2720
},
{
"epoch": 0.5825138948268491,
"grad_norm": 4.252533912658691,
"learning_rate": 1.5752969121140143e-05,
"loss": 0.3403,
"step": 2725
},
{
"epoch": 0.5835827276613937,
"grad_norm": 4.273214817047119,
"learning_rate": 1.5741092636579574e-05,
"loss": 0.4133,
"step": 2730
},
{
"epoch": 0.5846515604959385,
"grad_norm": 6.121555328369141,
"learning_rate": 1.5729216152019004e-05,
"loss": 0.4104,
"step": 2735
},
{
"epoch": 0.5857203933304831,
"grad_norm": 4.297682762145996,
"learning_rate": 1.5717339667458435e-05,
"loss": 0.2704,
"step": 2740
},
{
"epoch": 0.5867892261650278,
"grad_norm": 3.2164599895477295,
"learning_rate": 1.5705463182897865e-05,
"loss": 0.275,
"step": 2745
},
{
"epoch": 0.5878580589995724,
"grad_norm": 5.293271541595459,
"learning_rate": 1.5693586698337293e-05,
"loss": 0.3642,
"step": 2750
},
{
"epoch": 0.5889268918341172,
"grad_norm": 7.932840824127197,
"learning_rate": 1.5681710213776723e-05,
"loss": 0.3882,
"step": 2755
},
{
"epoch": 0.5899957246686618,
"grad_norm": 4.301117897033691,
"learning_rate": 1.5669833729216154e-05,
"loss": 0.445,
"step": 2760
},
{
"epoch": 0.5910645575032065,
"grad_norm": 5.11594820022583,
"learning_rate": 1.5657957244655584e-05,
"loss": 0.275,
"step": 2765
},
{
"epoch": 0.5921333903377511,
"grad_norm": 6.5174384117126465,
"learning_rate": 1.564608076009501e-05,
"loss": 0.3727,
"step": 2770
},
{
"epoch": 0.5932022231722959,
"grad_norm": 4.847846508026123,
"learning_rate": 1.5634204275534445e-05,
"loss": 0.3458,
"step": 2775
},
{
"epoch": 0.5942710560068405,
"grad_norm": 4.418210983276367,
"learning_rate": 1.5622327790973872e-05,
"loss": 0.3775,
"step": 2780
},
{
"epoch": 0.5953398888413852,
"grad_norm": 4.810405731201172,
"learning_rate": 1.5610451306413303e-05,
"loss": 0.3731,
"step": 2785
},
{
"epoch": 0.5964087216759298,
"grad_norm": 3.9812352657318115,
"learning_rate": 1.5598574821852734e-05,
"loss": 0.3073,
"step": 2790
},
{
"epoch": 0.5974775545104746,
"grad_norm": 4.542743682861328,
"learning_rate": 1.5586698337292164e-05,
"loss": 0.3113,
"step": 2795
},
{
"epoch": 0.5985463873450192,
"grad_norm": 4.736385345458984,
"learning_rate": 1.557482185273159e-05,
"loss": 0.3666,
"step": 2800
},
{
"epoch": 0.5996152201795639,
"grad_norm": 5.22001314163208,
"learning_rate": 1.5562945368171022e-05,
"loss": 0.3769,
"step": 2805
},
{
"epoch": 0.6006840530141085,
"grad_norm": 5.7952680587768555,
"learning_rate": 1.5551068883610452e-05,
"loss": 0.3267,
"step": 2810
},
{
"epoch": 0.6017528858486533,
"grad_norm": 4.174045085906982,
"learning_rate": 1.5539192399049883e-05,
"loss": 0.2513,
"step": 2815
},
{
"epoch": 0.6028217186831979,
"grad_norm": 3.869800090789795,
"learning_rate": 1.552731591448931e-05,
"loss": 0.273,
"step": 2820
},
{
"epoch": 0.6038905515177426,
"grad_norm": 4.380319118499756,
"learning_rate": 1.5515439429928744e-05,
"loss": 0.3712,
"step": 2825
},
{
"epoch": 0.6049593843522874,
"grad_norm": 3.972041368484497,
"learning_rate": 1.550356294536817e-05,
"loss": 0.4728,
"step": 2830
},
{
"epoch": 0.606028217186832,
"grad_norm": 5.212732791900635,
"learning_rate": 1.5491686460807602e-05,
"loss": 0.3608,
"step": 2835
},
{
"epoch": 0.6070970500213767,
"grad_norm": 5.559129238128662,
"learning_rate": 1.5479809976247032e-05,
"loss": 0.3765,
"step": 2840
},
{
"epoch": 0.6081658828559213,
"grad_norm": 4.520800590515137,
"learning_rate": 1.5467933491686463e-05,
"loss": 0.3285,
"step": 2845
},
{
"epoch": 0.609234715690466,
"grad_norm": 6.37885856628418,
"learning_rate": 1.5456057007125893e-05,
"loss": 0.4822,
"step": 2850
},
{
"epoch": 0.6103035485250107,
"grad_norm": 3.292531967163086,
"learning_rate": 1.544418052256532e-05,
"loss": 0.3185,
"step": 2855
},
{
"epoch": 0.6113723813595554,
"grad_norm": 4.683765411376953,
"learning_rate": 1.5432304038004754e-05,
"loss": 0.3027,
"step": 2860
},
{
"epoch": 0.6124412141941,
"grad_norm": 6.004202365875244,
"learning_rate": 1.542042755344418e-05,
"loss": 0.3819,
"step": 2865
},
{
"epoch": 0.6135100470286448,
"grad_norm": 4.668170928955078,
"learning_rate": 1.5408551068883612e-05,
"loss": 0.2819,
"step": 2870
},
{
"epoch": 0.6145788798631894,
"grad_norm": 6.2482781410217285,
"learning_rate": 1.5396674584323043e-05,
"loss": 0.3369,
"step": 2875
},
{
"epoch": 0.6156477126977341,
"grad_norm": 4.60993766784668,
"learning_rate": 1.5384798099762473e-05,
"loss": 0.331,
"step": 2880
},
{
"epoch": 0.6167165455322787,
"grad_norm": 5.188110828399658,
"learning_rate": 1.53729216152019e-05,
"loss": 0.3662,
"step": 2885
},
{
"epoch": 0.6177853783668235,
"grad_norm": 5.201088905334473,
"learning_rate": 1.536104513064133e-05,
"loss": 0.3472,
"step": 2890
},
{
"epoch": 0.6188542112013681,
"grad_norm": 5.363198280334473,
"learning_rate": 1.534916864608076e-05,
"loss": 0.387,
"step": 2895
},
{
"epoch": 0.6199230440359128,
"grad_norm": 5.238138198852539,
"learning_rate": 1.5337292161520192e-05,
"loss": 0.3017,
"step": 2900
},
{
"epoch": 0.6209918768704574,
"grad_norm": 4.188523769378662,
"learning_rate": 1.532541567695962e-05,
"loss": 0.2628,
"step": 2905
},
{
"epoch": 0.6220607097050022,
"grad_norm": 4.730754852294922,
"learning_rate": 1.5313539192399053e-05,
"loss": 0.2683,
"step": 2910
},
{
"epoch": 0.6231295425395468,
"grad_norm": 3.7036404609680176,
"learning_rate": 1.530166270783848e-05,
"loss": 0.3189,
"step": 2915
},
{
"epoch": 0.6241983753740915,
"grad_norm": 4.961543560028076,
"learning_rate": 1.528978622327791e-05,
"loss": 0.3389,
"step": 2920
},
{
"epoch": 0.6252672082086361,
"grad_norm": 4.376546859741211,
"learning_rate": 1.527790973871734e-05,
"loss": 0.3552,
"step": 2925
},
{
"epoch": 0.6263360410431809,
"grad_norm": 3.2792232036590576,
"learning_rate": 1.5266033254156772e-05,
"loss": 0.2784,
"step": 2930
},
{
"epoch": 0.6274048738777255,
"grad_norm": 4.739627838134766,
"learning_rate": 1.5254156769596201e-05,
"loss": 0.3416,
"step": 2935
},
{
"epoch": 0.6284737067122702,
"grad_norm": 4.889829635620117,
"learning_rate": 1.5242280285035631e-05,
"loss": 0.3805,
"step": 2940
},
{
"epoch": 0.6295425395468148,
"grad_norm": 5.562602519989014,
"learning_rate": 1.523040380047506e-05,
"loss": 0.4616,
"step": 2945
},
{
"epoch": 0.6306113723813596,
"grad_norm": 6.154614448547363,
"learning_rate": 1.521852731591449e-05,
"loss": 0.3584,
"step": 2950
},
{
"epoch": 0.6316802052159042,
"grad_norm": 4.117344856262207,
"learning_rate": 1.520665083135392e-05,
"loss": 0.3439,
"step": 2955
},
{
"epoch": 0.6327490380504489,
"grad_norm": 4.961648941040039,
"learning_rate": 1.519477434679335e-05,
"loss": 0.3569,
"step": 2960
},
{
"epoch": 0.6338178708849936,
"grad_norm": 4.030764579772949,
"learning_rate": 1.5182897862232779e-05,
"loss": 0.2775,
"step": 2965
},
{
"epoch": 0.6348867037195383,
"grad_norm": 5.615406036376953,
"learning_rate": 1.517102137767221e-05,
"loss": 0.3758,
"step": 2970
},
{
"epoch": 0.635955536554083,
"grad_norm": 5.250066757202148,
"learning_rate": 1.5159144893111638e-05,
"loss": 0.4178,
"step": 2975
},
{
"epoch": 0.6370243693886276,
"grad_norm": 3.862907648086548,
"learning_rate": 1.514726840855107e-05,
"loss": 0.2725,
"step": 2980
},
{
"epoch": 0.6380932022231723,
"grad_norm": 7.1906023025512695,
"learning_rate": 1.51353919239905e-05,
"loss": 0.4838,
"step": 2985
},
{
"epoch": 0.639162035057717,
"grad_norm": 4.240938663482666,
"learning_rate": 1.512351543942993e-05,
"loss": 0.3184,
"step": 2990
},
{
"epoch": 0.6402308678922617,
"grad_norm": 5.662024974822998,
"learning_rate": 1.511163895486936e-05,
"loss": 0.3265,
"step": 2995
},
{
"epoch": 0.6412997007268063,
"grad_norm": 5.721799850463867,
"learning_rate": 1.509976247030879e-05,
"loss": 0.3344,
"step": 3000
},
{
"epoch": 0.642368533561351,
"grad_norm": 4.524104118347168,
"learning_rate": 1.508788598574822e-05,
"loss": 0.284,
"step": 3005
},
{
"epoch": 0.6434373663958957,
"grad_norm": 4.907393455505371,
"learning_rate": 1.5076009501187649e-05,
"loss": 0.3337,
"step": 3010
},
{
"epoch": 0.6445061992304404,
"grad_norm": 4.567984580993652,
"learning_rate": 1.5064133016627081e-05,
"loss": 0.3085,
"step": 3015
},
{
"epoch": 0.645575032064985,
"grad_norm": 6.088601589202881,
"learning_rate": 1.505225653206651e-05,
"loss": 0.3685,
"step": 3020
},
{
"epoch": 0.6466438648995297,
"grad_norm": 5.842155456542969,
"learning_rate": 1.504038004750594e-05,
"loss": 0.4621,
"step": 3025
},
{
"epoch": 0.6477126977340744,
"grad_norm": 4.505978584289551,
"learning_rate": 1.502850356294537e-05,
"loss": 0.2958,
"step": 3030
},
{
"epoch": 0.6487815305686191,
"grad_norm": 3.832209825515747,
"learning_rate": 1.50166270783848e-05,
"loss": 0.4165,
"step": 3035
},
{
"epoch": 0.6498503634031637,
"grad_norm": 3.149580240249634,
"learning_rate": 1.5004750593824229e-05,
"loss": 0.336,
"step": 3040
},
{
"epoch": 0.6509191962377084,
"grad_norm": 4.5704121589660645,
"learning_rate": 1.499287410926366e-05,
"loss": 0.328,
"step": 3045
},
{
"epoch": 0.6519880290722531,
"grad_norm": 5.424034595489502,
"learning_rate": 1.4980997624703088e-05,
"loss": 0.3256,
"step": 3050
},
{
"epoch": 0.6530568619067978,
"grad_norm": 4.873384475708008,
"learning_rate": 1.4969121140142519e-05,
"loss": 0.2877,
"step": 3055
},
{
"epoch": 0.6541256947413424,
"grad_norm": 4.21671199798584,
"learning_rate": 1.4957244655581948e-05,
"loss": 0.3468,
"step": 3060
},
{
"epoch": 0.6551945275758871,
"grad_norm": 4.723153591156006,
"learning_rate": 1.494536817102138e-05,
"loss": 0.3836,
"step": 3065
},
{
"epoch": 0.6562633604104318,
"grad_norm": 4.3572587966918945,
"learning_rate": 1.4933491686460809e-05,
"loss": 0.3084,
"step": 3070
},
{
"epoch": 0.6573321932449765,
"grad_norm": 4.8245439529418945,
"learning_rate": 1.492161520190024e-05,
"loss": 0.2879,
"step": 3075
},
{
"epoch": 0.6584010260795211,
"grad_norm": 4.260484218597412,
"learning_rate": 1.4909738717339668e-05,
"loss": 0.2836,
"step": 3080
},
{
"epoch": 0.6594698589140658,
"grad_norm": 3.668529748916626,
"learning_rate": 1.4897862232779099e-05,
"loss": 0.3049,
"step": 3085
},
{
"epoch": 0.6605386917486105,
"grad_norm": 5.860143661499023,
"learning_rate": 1.4885985748218528e-05,
"loss": 0.3972,
"step": 3090
},
{
"epoch": 0.6616075245831552,
"grad_norm": 3.9581236839294434,
"learning_rate": 1.4874109263657958e-05,
"loss": 0.3189,
"step": 3095
},
{
"epoch": 0.6626763574176999,
"grad_norm": 2.8415067195892334,
"learning_rate": 1.4862232779097387e-05,
"loss": 0.207,
"step": 3100
},
{
"epoch": 0.6637451902522445,
"grad_norm": 5.096329689025879,
"learning_rate": 1.485035629453682e-05,
"loss": 0.2844,
"step": 3105
},
{
"epoch": 0.6648140230867893,
"grad_norm": 5.822755813598633,
"learning_rate": 1.4838479809976248e-05,
"loss": 0.3583,
"step": 3110
},
{
"epoch": 0.6658828559213339,
"grad_norm": 5.467360019683838,
"learning_rate": 1.4826603325415679e-05,
"loss": 0.2681,
"step": 3115
},
{
"epoch": 0.6669516887558786,
"grad_norm": 5.418729305267334,
"learning_rate": 1.4814726840855107e-05,
"loss": 0.3788,
"step": 3120
},
{
"epoch": 0.6680205215904232,
"grad_norm": 5.312787055969238,
"learning_rate": 1.4802850356294538e-05,
"loss": 0.3335,
"step": 3125
},
{
"epoch": 0.669089354424968,
"grad_norm": 4.632271766662598,
"learning_rate": 1.4790973871733969e-05,
"loss": 0.2958,
"step": 3130
},
{
"epoch": 0.6701581872595126,
"grad_norm": 5.137240886688232,
"learning_rate": 1.4779097387173397e-05,
"loss": 0.343,
"step": 3135
},
{
"epoch": 0.6712270200940573,
"grad_norm": 4.227065086364746,
"learning_rate": 1.4767220902612828e-05,
"loss": 0.3026,
"step": 3140
},
{
"epoch": 0.672295852928602,
"grad_norm": 4.9906110763549805,
"learning_rate": 1.4755344418052257e-05,
"loss": 0.3575,
"step": 3145
},
{
"epoch": 0.6733646857631467,
"grad_norm": 6.338077545166016,
"learning_rate": 1.4743467933491689e-05,
"loss": 0.3962,
"step": 3150
},
{
"epoch": 0.6744335185976913,
"grad_norm": 5.018848896026611,
"learning_rate": 1.4731591448931118e-05,
"loss": 0.292,
"step": 3155
},
{
"epoch": 0.675502351432236,
"grad_norm": 5.4188432693481445,
"learning_rate": 1.4719714964370548e-05,
"loss": 0.4226,
"step": 3160
},
{
"epoch": 0.6765711842667806,
"grad_norm": 5.020565032958984,
"learning_rate": 1.4707838479809977e-05,
"loss": 0.3999,
"step": 3165
},
{
"epoch": 0.6776400171013254,
"grad_norm": 5.457892894744873,
"learning_rate": 1.4695961995249408e-05,
"loss": 0.3949,
"step": 3170
},
{
"epoch": 0.67870884993587,
"grad_norm": 4.842294216156006,
"learning_rate": 1.4684085510688837e-05,
"loss": 0.2844,
"step": 3175
},
{
"epoch": 0.6797776827704147,
"grad_norm": 4.515163421630859,
"learning_rate": 1.4672209026128267e-05,
"loss": 0.3003,
"step": 3180
},
{
"epoch": 0.6808465156049593,
"grad_norm": 3.4031429290771484,
"learning_rate": 1.4660332541567696e-05,
"loss": 0.2636,
"step": 3185
},
{
"epoch": 0.6819153484395041,
"grad_norm": 4.693248748779297,
"learning_rate": 1.4648456057007128e-05,
"loss": 0.2334,
"step": 3190
},
{
"epoch": 0.6829841812740487,
"grad_norm": 4.690431118011475,
"learning_rate": 1.4636579572446557e-05,
"loss": 0.2574,
"step": 3195
},
{
"epoch": 0.6840530141085934,
"grad_norm": 3.9794492721557617,
"learning_rate": 1.4624703087885988e-05,
"loss": 0.3476,
"step": 3200
},
{
"epoch": 0.685121846943138,
"grad_norm": 4.062690258026123,
"learning_rate": 1.4612826603325417e-05,
"loss": 0.3763,
"step": 3205
},
{
"epoch": 0.6861906797776828,
"grad_norm": 2.888495683670044,
"learning_rate": 1.4600950118764847e-05,
"loss": 0.2873,
"step": 3210
},
{
"epoch": 0.6872595126122274,
"grad_norm": 4.061041355133057,
"learning_rate": 1.4589073634204276e-05,
"loss": 0.2859,
"step": 3215
},
{
"epoch": 0.6883283454467721,
"grad_norm": 5.954913139343262,
"learning_rate": 1.4577197149643707e-05,
"loss": 0.3335,
"step": 3220
},
{
"epoch": 0.6893971782813167,
"grad_norm": 4.9537434577941895,
"learning_rate": 1.4565320665083135e-05,
"loss": 0.3712,
"step": 3225
},
{
"epoch": 0.6904660111158615,
"grad_norm": 3.5754384994506836,
"learning_rate": 1.4553444180522566e-05,
"loss": 0.4072,
"step": 3230
},
{
"epoch": 0.6915348439504062,
"grad_norm": 6.583157062530518,
"learning_rate": 1.4541567695961995e-05,
"loss": 0.3442,
"step": 3235
},
{
"epoch": 0.6926036767849508,
"grad_norm": 4.144803524017334,
"learning_rate": 1.4529691211401427e-05,
"loss": 0.32,
"step": 3240
},
{
"epoch": 0.6936725096194956,
"grad_norm": 3.350670576095581,
"learning_rate": 1.4517814726840856e-05,
"loss": 0.3076,
"step": 3245
},
{
"epoch": 0.6947413424540402,
"grad_norm": 3.798152208328247,
"learning_rate": 1.4505938242280287e-05,
"loss": 0.3134,
"step": 3250
},
{
"epoch": 0.6958101752885849,
"grad_norm": 4.410452365875244,
"learning_rate": 1.4494061757719715e-05,
"loss": 0.3155,
"step": 3255
},
{
"epoch": 0.6968790081231295,
"grad_norm": 5.064853191375732,
"learning_rate": 1.4482185273159146e-05,
"loss": 0.3097,
"step": 3260
},
{
"epoch": 0.6979478409576743,
"grad_norm": 5.49769401550293,
"learning_rate": 1.4470308788598575e-05,
"loss": 0.2727,
"step": 3265
},
{
"epoch": 0.6990166737922189,
"grad_norm": 4.130645751953125,
"learning_rate": 1.4458432304038005e-05,
"loss": 0.3666,
"step": 3270
},
{
"epoch": 0.7000855066267636,
"grad_norm": 5.358222484588623,
"learning_rate": 1.4446555819477438e-05,
"loss": 0.3049,
"step": 3275
},
{
"epoch": 0.7011543394613082,
"grad_norm": 3.783137559890747,
"learning_rate": 1.4434679334916866e-05,
"loss": 0.3506,
"step": 3280
},
{
"epoch": 0.702223172295853,
"grad_norm": 4.486612319946289,
"learning_rate": 1.4422802850356297e-05,
"loss": 0.3027,
"step": 3285
},
{
"epoch": 0.7032920051303976,
"grad_norm": 5.604061126708984,
"learning_rate": 1.4410926365795726e-05,
"loss": 0.2706,
"step": 3290
},
{
"epoch": 0.7043608379649423,
"grad_norm": 5.663457870483398,
"learning_rate": 1.4399049881235156e-05,
"loss": 0.3165,
"step": 3295
},
{
"epoch": 0.7054296707994869,
"grad_norm": 4.874339580535889,
"learning_rate": 1.4387173396674585e-05,
"loss": 0.3567,
"step": 3300
},
{
"epoch": 0.7064985036340317,
"grad_norm": 5.478762626647949,
"learning_rate": 1.4375296912114016e-05,
"loss": 0.2795,
"step": 3305
},
{
"epoch": 0.7075673364685763,
"grad_norm": 4.213021278381348,
"learning_rate": 1.4363420427553445e-05,
"loss": 0.2905,
"step": 3310
},
{
"epoch": 0.708636169303121,
"grad_norm": 4.549129009246826,
"learning_rate": 1.4351543942992875e-05,
"loss": 0.2946,
"step": 3315
},
{
"epoch": 0.7097050021376656,
"grad_norm": 4.900253772735596,
"learning_rate": 1.4339667458432304e-05,
"loss": 0.298,
"step": 3320
},
{
"epoch": 0.7107738349722104,
"grad_norm": 5.591811656951904,
"learning_rate": 1.4327790973871736e-05,
"loss": 0.289,
"step": 3325
},
{
"epoch": 0.711842667806755,
"grad_norm": 3.1972029209136963,
"learning_rate": 1.4315914489311165e-05,
"loss": 0.3194,
"step": 3330
},
{
"epoch": 0.7129115006412997,
"grad_norm": 3.692401647567749,
"learning_rate": 1.4304038004750596e-05,
"loss": 0.2719,
"step": 3335
},
{
"epoch": 0.7139803334758443,
"grad_norm": 6.502699851989746,
"learning_rate": 1.4292161520190025e-05,
"loss": 0.3079,
"step": 3340
},
{
"epoch": 0.7150491663103891,
"grad_norm": 4.761363506317139,
"learning_rate": 1.4280285035629455e-05,
"loss": 0.3373,
"step": 3345
},
{
"epoch": 0.7161179991449337,
"grad_norm": 5.628553867340088,
"learning_rate": 1.4268408551068884e-05,
"loss": 0.3103,
"step": 3350
},
{
"epoch": 0.7171868319794784,
"grad_norm": 5.576054096221924,
"learning_rate": 1.4256532066508314e-05,
"loss": 0.3384,
"step": 3355
},
{
"epoch": 0.718255664814023,
"grad_norm": 4.364500999450684,
"learning_rate": 1.4244655581947743e-05,
"loss": 0.3785,
"step": 3360
},
{
"epoch": 0.7193244976485678,
"grad_norm": 2.8248353004455566,
"learning_rate": 1.4232779097387176e-05,
"loss": 0.2583,
"step": 3365
},
{
"epoch": 0.7203933304831125,
"grad_norm": 5.5604987144470215,
"learning_rate": 1.4220902612826604e-05,
"loss": 0.2992,
"step": 3370
},
{
"epoch": 0.7214621633176571,
"grad_norm": 4.8770527839660645,
"learning_rate": 1.4209026128266035e-05,
"loss": 0.2196,
"step": 3375
},
{
"epoch": 0.7225309961522018,
"grad_norm": 4.998085021972656,
"learning_rate": 1.4197149643705464e-05,
"loss": 0.3438,
"step": 3380
},
{
"epoch": 0.7235998289867465,
"grad_norm": 4.125364303588867,
"learning_rate": 1.4185273159144894e-05,
"loss": 0.333,
"step": 3385
},
{
"epoch": 0.7246686618212912,
"grad_norm": 5.174322605133057,
"learning_rate": 1.4173396674584323e-05,
"loss": 0.4422,
"step": 3390
},
{
"epoch": 0.7257374946558358,
"grad_norm": 4.850910186767578,
"learning_rate": 1.4161520190023754e-05,
"loss": 0.458,
"step": 3395
},
{
"epoch": 0.7268063274903805,
"grad_norm": 4.238053321838379,
"learning_rate": 1.4149643705463183e-05,
"loss": 0.2526,
"step": 3400
},
{
"epoch": 0.7278751603249252,
"grad_norm": 4.8868842124938965,
"learning_rate": 1.4137767220902613e-05,
"loss": 0.2443,
"step": 3405
},
{
"epoch": 0.7289439931594699,
"grad_norm": 6.352740287780762,
"learning_rate": 1.4125890736342045e-05,
"loss": 0.4024,
"step": 3410
},
{
"epoch": 0.7300128259940145,
"grad_norm": 3.7694151401519775,
"learning_rate": 1.4114014251781474e-05,
"loss": 0.3057,
"step": 3415
},
{
"epoch": 0.7310816588285592,
"grad_norm": 4.326847553253174,
"learning_rate": 1.4102137767220905e-05,
"loss": 0.3417,
"step": 3420
},
{
"epoch": 0.7321504916631039,
"grad_norm": 4.306587219238281,
"learning_rate": 1.4090261282660334e-05,
"loss": 0.3535,
"step": 3425
},
{
"epoch": 0.7332193244976486,
"grad_norm": 4.4991044998168945,
"learning_rate": 1.4078384798099764e-05,
"loss": 0.3814,
"step": 3430
},
{
"epoch": 0.7342881573321932,
"grad_norm": 4.0679779052734375,
"learning_rate": 1.4066508313539193e-05,
"loss": 0.3196,
"step": 3435
},
{
"epoch": 0.735356990166738,
"grad_norm": 4.0540666580200195,
"learning_rate": 1.4054631828978624e-05,
"loss": 0.2738,
"step": 3440
},
{
"epoch": 0.7364258230012826,
"grad_norm": 4.532857894897461,
"learning_rate": 1.4042755344418053e-05,
"loss": 0.2127,
"step": 3445
},
{
"epoch": 0.7374946558358273,
"grad_norm": 4.681793212890625,
"learning_rate": 1.4030878859857485e-05,
"loss": 0.2415,
"step": 3450
},
{
"epoch": 0.7385634886703719,
"grad_norm": 5.458173751831055,
"learning_rate": 1.4019002375296914e-05,
"loss": 0.368,
"step": 3455
},
{
"epoch": 0.7396323215049166,
"grad_norm": 4.303793430328369,
"learning_rate": 1.4007125890736344e-05,
"loss": 0.2965,
"step": 3460
},
{
"epoch": 0.7407011543394613,
"grad_norm": 5.24821138381958,
"learning_rate": 1.3995249406175773e-05,
"loss": 0.3676,
"step": 3465
},
{
"epoch": 0.741769987174006,
"grad_norm": 7.041927337646484,
"learning_rate": 1.3983372921615204e-05,
"loss": 0.4793,
"step": 3470
},
{
"epoch": 0.7428388200085506,
"grad_norm": 4.38003396987915,
"learning_rate": 1.3971496437054632e-05,
"loss": 0.2924,
"step": 3475
},
{
"epoch": 0.7439076528430953,
"grad_norm": 4.844277858734131,
"learning_rate": 1.3959619952494063e-05,
"loss": 0.3051,
"step": 3480
},
{
"epoch": 0.74497648567764,
"grad_norm": 4.943488121032715,
"learning_rate": 1.3947743467933492e-05,
"loss": 0.3206,
"step": 3485
},
{
"epoch": 0.7460453185121847,
"grad_norm": 3.5360701084136963,
"learning_rate": 1.3935866983372922e-05,
"loss": 0.3062,
"step": 3490
},
{
"epoch": 0.7471141513467293,
"grad_norm": 4.964517116546631,
"learning_rate": 1.3923990498812351e-05,
"loss": 0.3099,
"step": 3495
},
{
"epoch": 0.748182984181274,
"grad_norm": 4.1770124435424805,
"learning_rate": 1.3912114014251783e-05,
"loss": 0.3528,
"step": 3500
},
{
"epoch": 0.7492518170158188,
"grad_norm": 4.830697059631348,
"learning_rate": 1.3900237529691212e-05,
"loss": 0.3075,
"step": 3505
},
{
"epoch": 0.7503206498503634,
"grad_norm": 4.7558512687683105,
"learning_rate": 1.3888361045130643e-05,
"loss": 0.3132,
"step": 3510
},
{
"epoch": 0.7513894826849081,
"grad_norm": 5.082642555236816,
"learning_rate": 1.3876484560570072e-05,
"loss": 0.3789,
"step": 3515
},
{
"epoch": 0.7524583155194527,
"grad_norm": 5.486532211303711,
"learning_rate": 1.3864608076009502e-05,
"loss": 0.3316,
"step": 3520
},
{
"epoch": 0.7535271483539975,
"grad_norm": 4.763543605804443,
"learning_rate": 1.3852731591448931e-05,
"loss": 0.3113,
"step": 3525
},
{
"epoch": 0.7545959811885421,
"grad_norm": 4.146590709686279,
"learning_rate": 1.3840855106888362e-05,
"loss": 0.2481,
"step": 3530
},
{
"epoch": 0.7556648140230868,
"grad_norm": 4.292271614074707,
"learning_rate": 1.382897862232779e-05,
"loss": 0.3174,
"step": 3535
},
{
"epoch": 0.7567336468576314,
"grad_norm": 5.971374988555908,
"learning_rate": 1.3817102137767223e-05,
"loss": 0.3116,
"step": 3540
},
{
"epoch": 0.7578024796921762,
"grad_norm": 4.599390983581543,
"learning_rate": 1.3805225653206652e-05,
"loss": 0.29,
"step": 3545
},
{
"epoch": 0.7588713125267208,
"grad_norm": 3.7273731231689453,
"learning_rate": 1.3793349168646082e-05,
"loss": 0.33,
"step": 3550
},
{
"epoch": 0.7599401453612655,
"grad_norm": 3.681992530822754,
"learning_rate": 1.3781472684085513e-05,
"loss": 0.2002,
"step": 3555
},
{
"epoch": 0.7610089781958101,
"grad_norm": 5.324198246002197,
"learning_rate": 1.3769596199524942e-05,
"loss": 0.3566,
"step": 3560
},
{
"epoch": 0.7620778110303549,
"grad_norm": 4.434847354888916,
"learning_rate": 1.3757719714964372e-05,
"loss": 0.2618,
"step": 3565
},
{
"epoch": 0.7631466438648995,
"grad_norm": 5.279498100280762,
"learning_rate": 1.3745843230403801e-05,
"loss": 0.316,
"step": 3570
},
{
"epoch": 0.7642154766994442,
"grad_norm": 3.4741098880767822,
"learning_rate": 1.3733966745843233e-05,
"loss": 0.2997,
"step": 3575
},
{
"epoch": 0.7652843095339888,
"grad_norm": 4.7899909019470215,
"learning_rate": 1.372209026128266e-05,
"loss": 0.2809,
"step": 3580
},
{
"epoch": 0.7663531423685336,
"grad_norm": 4.318710803985596,
"learning_rate": 1.3710213776722093e-05,
"loss": 0.2023,
"step": 3585
},
{
"epoch": 0.7674219752030782,
"grad_norm": 4.148991107940674,
"learning_rate": 1.3698337292161522e-05,
"loss": 0.2726,
"step": 3590
},
{
"epoch": 0.7684908080376229,
"grad_norm": 5.0960373878479,
"learning_rate": 1.3686460807600952e-05,
"loss": 0.2878,
"step": 3595
},
{
"epoch": 0.7695596408721675,
"grad_norm": 5.928832530975342,
"learning_rate": 1.3674584323040381e-05,
"loss": 0.4026,
"step": 3600
},
{
"epoch": 0.7706284737067123,
"grad_norm": 4.24060583114624,
"learning_rate": 1.3662707838479811e-05,
"loss": 0.3205,
"step": 3605
},
{
"epoch": 0.7716973065412569,
"grad_norm": 4.517853736877441,
"learning_rate": 1.365083135391924e-05,
"loss": 0.3092,
"step": 3610
},
{
"epoch": 0.7727661393758016,
"grad_norm": 5.5383501052856445,
"learning_rate": 1.3638954869358671e-05,
"loss": 0.3249,
"step": 3615
},
{
"epoch": 0.7738349722103463,
"grad_norm": 3.5598056316375732,
"learning_rate": 1.36270783847981e-05,
"loss": 0.2898,
"step": 3620
},
{
"epoch": 0.774903805044891,
"grad_norm": 5.0517578125,
"learning_rate": 1.3615201900237532e-05,
"loss": 0.3464,
"step": 3625
},
{
"epoch": 0.7759726378794357,
"grad_norm": 4.764474868774414,
"learning_rate": 1.360332541567696e-05,
"loss": 0.3755,
"step": 3630
},
{
"epoch": 0.7770414707139803,
"grad_norm": 4.272229194641113,
"learning_rate": 1.3591448931116391e-05,
"loss": 0.3236,
"step": 3635
},
{
"epoch": 0.7781103035485251,
"grad_norm": 4.496946811676025,
"learning_rate": 1.357957244655582e-05,
"loss": 0.3298,
"step": 3640
},
{
"epoch": 0.7791791363830697,
"grad_norm": 3.3338801860809326,
"learning_rate": 1.356769596199525e-05,
"loss": 0.3301,
"step": 3645
},
{
"epoch": 0.7802479692176144,
"grad_norm": 4.775890350341797,
"learning_rate": 1.355581947743468e-05,
"loss": 0.2428,
"step": 3650
},
{
"epoch": 0.781316802052159,
"grad_norm": 3.7741811275482178,
"learning_rate": 1.354394299287411e-05,
"loss": 0.2789,
"step": 3655
},
{
"epoch": 0.7823856348867038,
"grad_norm": 5.699966907501221,
"learning_rate": 1.3532066508313539e-05,
"loss": 0.4398,
"step": 3660
},
{
"epoch": 0.7834544677212484,
"grad_norm": 5.20950174331665,
"learning_rate": 1.352019002375297e-05,
"loss": 0.3211,
"step": 3665
},
{
"epoch": 0.7845233005557931,
"grad_norm": 4.900545120239258,
"learning_rate": 1.3508313539192398e-05,
"loss": 0.3079,
"step": 3670
},
{
"epoch": 0.7855921333903377,
"grad_norm": 4.627389907836914,
"learning_rate": 1.349643705463183e-05,
"loss": 0.2765,
"step": 3675
},
{
"epoch": 0.7866609662248825,
"grad_norm": 3.996687889099121,
"learning_rate": 1.348456057007126e-05,
"loss": 0.2414,
"step": 3680
},
{
"epoch": 0.7877297990594271,
"grad_norm": 4.968347072601318,
"learning_rate": 1.347268408551069e-05,
"loss": 0.3142,
"step": 3685
},
{
"epoch": 0.7887986318939718,
"grad_norm": 5.365523815155029,
"learning_rate": 1.346080760095012e-05,
"loss": 0.4895,
"step": 3690
},
{
"epoch": 0.7898674647285164,
"grad_norm": 3.6716244220733643,
"learning_rate": 1.344893111638955e-05,
"loss": 0.3058,
"step": 3695
},
{
"epoch": 0.7909362975630612,
"grad_norm": 3.6110551357269287,
"learning_rate": 1.343705463182898e-05,
"loss": 0.2568,
"step": 3700
},
{
"epoch": 0.7920051303976058,
"grad_norm": 3.8466339111328125,
"learning_rate": 1.3425178147268409e-05,
"loss": 0.2505,
"step": 3705
},
{
"epoch": 0.7930739632321505,
"grad_norm": 6.473718643188477,
"learning_rate": 1.3413301662707841e-05,
"loss": 0.3416,
"step": 3710
},
{
"epoch": 0.7941427960666951,
"grad_norm": 4.931123733520508,
"learning_rate": 1.340142517814727e-05,
"loss": 0.2867,
"step": 3715
},
{
"epoch": 0.7952116289012399,
"grad_norm": 4.821789741516113,
"learning_rate": 1.33895486935867e-05,
"loss": 0.2696,
"step": 3720
},
{
"epoch": 0.7962804617357845,
"grad_norm": 3.5999889373779297,
"learning_rate": 1.337767220902613e-05,
"loss": 0.293,
"step": 3725
},
{
"epoch": 0.7973492945703292,
"grad_norm": 3.716235637664795,
"learning_rate": 1.336579572446556e-05,
"loss": 0.2741,
"step": 3730
},
{
"epoch": 0.7984181274048738,
"grad_norm": 3.1744401454925537,
"learning_rate": 1.3353919239904989e-05,
"loss": 0.3276,
"step": 3735
},
{
"epoch": 0.7994869602394186,
"grad_norm": 4.65699577331543,
"learning_rate": 1.334204275534442e-05,
"loss": 0.2688,
"step": 3740
},
{
"epoch": 0.8005557930739632,
"grad_norm": 3.338193416595459,
"learning_rate": 1.3330166270783848e-05,
"loss": 0.2408,
"step": 3745
},
{
"epoch": 0.8016246259085079,
"grad_norm": 4.22088098526001,
"learning_rate": 1.3318289786223279e-05,
"loss": 0.2926,
"step": 3750
},
{
"epoch": 0.8026934587430525,
"grad_norm": 5.624631881713867,
"learning_rate": 1.3306413301662708e-05,
"loss": 0.3119,
"step": 3755
},
{
"epoch": 0.8037622915775973,
"grad_norm": 3.8507394790649414,
"learning_rate": 1.329453681710214e-05,
"loss": 0.3018,
"step": 3760
},
{
"epoch": 0.804831124412142,
"grad_norm": 4.6665239334106445,
"learning_rate": 1.3282660332541569e-05,
"loss": 0.3448,
"step": 3765
},
{
"epoch": 0.8058999572466866,
"grad_norm": 4.100464344024658,
"learning_rate": 1.3270783847981e-05,
"loss": 0.3539,
"step": 3770
},
{
"epoch": 0.8069687900812313,
"grad_norm": 6.0533623695373535,
"learning_rate": 1.3258907363420428e-05,
"loss": 0.2776,
"step": 3775
},
{
"epoch": 0.808037622915776,
"grad_norm": 3.781015396118164,
"learning_rate": 1.3247030878859859e-05,
"loss": 0.2255,
"step": 3780
},
{
"epoch": 0.8091064557503207,
"grad_norm": 5.616995334625244,
"learning_rate": 1.3235154394299288e-05,
"loss": 0.2507,
"step": 3785
},
{
"epoch": 0.8101752885848653,
"grad_norm": 5.021564960479736,
"learning_rate": 1.3223277909738718e-05,
"loss": 0.3463,
"step": 3790
},
{
"epoch": 0.81124412141941,
"grad_norm": 4.946634769439697,
"learning_rate": 1.3211401425178147e-05,
"loss": 0.2849,
"step": 3795
},
{
"epoch": 0.8123129542539547,
"grad_norm": 3.1573128700256348,
"learning_rate": 1.319952494061758e-05,
"loss": 0.2678,
"step": 3800
},
{
"epoch": 0.8133817870884994,
"grad_norm": 5.302856922149658,
"learning_rate": 1.3187648456057008e-05,
"loss": 0.3446,
"step": 3805
},
{
"epoch": 0.814450619923044,
"grad_norm": 5.2195820808410645,
"learning_rate": 1.3175771971496439e-05,
"loss": 0.344,
"step": 3810
},
{
"epoch": 0.8155194527575887,
"grad_norm": 5.514340877532959,
"learning_rate": 1.3163895486935867e-05,
"loss": 0.3305,
"step": 3815
},
{
"epoch": 0.8165882855921334,
"grad_norm": 4.197089195251465,
"learning_rate": 1.3152019002375298e-05,
"loss": 0.2728,
"step": 3820
},
{
"epoch": 0.8176571184266781,
"grad_norm": 4.766973972320557,
"learning_rate": 1.3140142517814727e-05,
"loss": 0.4181,
"step": 3825
},
{
"epoch": 0.8187259512612227,
"grad_norm": 5.202324390411377,
"learning_rate": 1.3128266033254157e-05,
"loss": 0.3351,
"step": 3830
},
{
"epoch": 0.8197947840957674,
"grad_norm": 3.472627878189087,
"learning_rate": 1.311638954869359e-05,
"loss": 0.2646,
"step": 3835
},
{
"epoch": 0.8208636169303121,
"grad_norm": 4.589137554168701,
"learning_rate": 1.3104513064133017e-05,
"loss": 0.2628,
"step": 3840
},
{
"epoch": 0.8219324497648568,
"grad_norm": 3.9725475311279297,
"learning_rate": 1.3092636579572449e-05,
"loss": 0.2747,
"step": 3845
},
{
"epoch": 0.8230012825994014,
"grad_norm": 3.832432985305786,
"learning_rate": 1.3080760095011878e-05,
"loss": 0.2253,
"step": 3850
},
{
"epoch": 0.8240701154339461,
"grad_norm": 4.213531494140625,
"learning_rate": 1.3068883610451308e-05,
"loss": 0.2741,
"step": 3855
},
{
"epoch": 0.8251389482684908,
"grad_norm": 6.430481910705566,
"learning_rate": 1.3057007125890737e-05,
"loss": 0.3982,
"step": 3860
},
{
"epoch": 0.8262077811030355,
"grad_norm": 2.416151762008667,
"learning_rate": 1.3045130641330168e-05,
"loss": 0.3014,
"step": 3865
},
{
"epoch": 0.8272766139375801,
"grad_norm": 4.334439754486084,
"learning_rate": 1.3033254156769597e-05,
"loss": 0.2696,
"step": 3870
},
{
"epoch": 0.8283454467721248,
"grad_norm": 3.599234104156494,
"learning_rate": 1.3021377672209027e-05,
"loss": 0.2607,
"step": 3875
},
{
"epoch": 0.8294142796066695,
"grad_norm": 4.65981388092041,
"learning_rate": 1.3009501187648456e-05,
"loss": 0.3154,
"step": 3880
},
{
"epoch": 0.8304831124412142,
"grad_norm": 5.147418975830078,
"learning_rate": 1.2997624703087888e-05,
"loss": 0.3275,
"step": 3885
},
{
"epoch": 0.8315519452757588,
"grad_norm": 4.910894870758057,
"learning_rate": 1.2985748218527317e-05,
"loss": 0.274,
"step": 3890
},
{
"epoch": 0.8326207781103035,
"grad_norm": 3.3270483016967773,
"learning_rate": 1.2973871733966748e-05,
"loss": 0.3042,
"step": 3895
},
{
"epoch": 0.8336896109448483,
"grad_norm": 5.005611419677734,
"learning_rate": 1.2961995249406177e-05,
"loss": 0.2692,
"step": 3900
},
{
"epoch": 0.8347584437793929,
"grad_norm": 3.320770263671875,
"learning_rate": 1.2950118764845607e-05,
"loss": 0.2505,
"step": 3905
},
{
"epoch": 0.8358272766139376,
"grad_norm": 4.788522720336914,
"learning_rate": 1.2938242280285036e-05,
"loss": 0.3762,
"step": 3910
},
{
"epoch": 0.8368961094484823,
"grad_norm": 5.107404708862305,
"learning_rate": 1.2926365795724467e-05,
"loss": 0.2467,
"step": 3915
},
{
"epoch": 0.837964942283027,
"grad_norm": 3.5440781116485596,
"learning_rate": 1.2914489311163895e-05,
"loss": 0.2227,
"step": 3920
},
{
"epoch": 0.8390337751175716,
"grad_norm": 5.089791774749756,
"learning_rate": 1.2902612826603326e-05,
"loss": 0.2513,
"step": 3925
},
{
"epoch": 0.8401026079521163,
"grad_norm": 5.978660583496094,
"learning_rate": 1.2890736342042755e-05,
"loss": 0.313,
"step": 3930
},
{
"epoch": 0.841171440786661,
"grad_norm": 4.347848415374756,
"learning_rate": 1.2878859857482187e-05,
"loss": 0.265,
"step": 3935
},
{
"epoch": 0.8422402736212057,
"grad_norm": 5.038461208343506,
"learning_rate": 1.2866983372921616e-05,
"loss": 0.2839,
"step": 3940
},
{
"epoch": 0.8433091064557503,
"grad_norm": 4.367410659790039,
"learning_rate": 1.2855106888361046e-05,
"loss": 0.3432,
"step": 3945
},
{
"epoch": 0.844377939290295,
"grad_norm": 4.267697334289551,
"learning_rate": 1.2843230403800475e-05,
"loss": 0.2168,
"step": 3950
},
{
"epoch": 0.8454467721248397,
"grad_norm": 4.99351167678833,
"learning_rate": 1.2831353919239906e-05,
"loss": 0.3083,
"step": 3955
},
{
"epoch": 0.8465156049593844,
"grad_norm": 3.725167751312256,
"learning_rate": 1.2819477434679335e-05,
"loss": 0.3362,
"step": 3960
},
{
"epoch": 0.847584437793929,
"grad_norm": 4.825465679168701,
"learning_rate": 1.2807600950118765e-05,
"loss": 0.2897,
"step": 3965
},
{
"epoch": 0.8486532706284737,
"grad_norm": 4.231856822967529,
"learning_rate": 1.2795724465558198e-05,
"loss": 0.299,
"step": 3970
},
{
"epoch": 0.8497221034630184,
"grad_norm": 3.8439395427703857,
"learning_rate": 1.2783847980997626e-05,
"loss": 0.3421,
"step": 3975
},
{
"epoch": 0.8507909362975631,
"grad_norm": 4.338144779205322,
"learning_rate": 1.2771971496437057e-05,
"loss": 0.2886,
"step": 3980
},
{
"epoch": 0.8518597691321077,
"grad_norm": 5.123786449432373,
"learning_rate": 1.2760095011876486e-05,
"loss": 0.3563,
"step": 3985
},
{
"epoch": 0.8529286019666524,
"grad_norm": 5.506287574768066,
"learning_rate": 1.2748218527315916e-05,
"loss": 0.3204,
"step": 3990
},
{
"epoch": 0.853997434801197,
"grad_norm": 3.644973039627075,
"learning_rate": 1.2736342042755345e-05,
"loss": 0.3025,
"step": 3995
},
{
"epoch": 0.8550662676357418,
"grad_norm": 5.109133720397949,
"learning_rate": 1.2724465558194776e-05,
"loss": 0.2813,
"step": 4000
},
{
"epoch": 0.8561351004702864,
"grad_norm": 5.544173717498779,
"learning_rate": 1.2712589073634205e-05,
"loss": 0.2787,
"step": 4005
},
{
"epoch": 0.8572039333048311,
"grad_norm": 5.382670879364014,
"learning_rate": 1.2700712589073637e-05,
"loss": 0.2643,
"step": 4010
},
{
"epoch": 0.8582727661393758,
"grad_norm": 5.406363010406494,
"learning_rate": 1.2688836104513064e-05,
"loss": 0.291,
"step": 4015
},
{
"epoch": 0.8593415989739205,
"grad_norm": 3.5062954425811768,
"learning_rate": 1.2676959619952496e-05,
"loss": 0.2467,
"step": 4020
},
{
"epoch": 0.8604104318084651,
"grad_norm": 5.817686080932617,
"learning_rate": 1.2665083135391925e-05,
"loss": 0.3489,
"step": 4025
},
{
"epoch": 0.8614792646430098,
"grad_norm": 3.931792974472046,
"learning_rate": 1.2653206650831356e-05,
"loss": 0.2613,
"step": 4030
},
{
"epoch": 0.8625480974775546,
"grad_norm": 4.279338359832764,
"learning_rate": 1.2641330166270785e-05,
"loss": 0.3007,
"step": 4035
},
{
"epoch": 0.8636169303120992,
"grad_norm": 3.9646289348602295,
"learning_rate": 1.2629453681710215e-05,
"loss": 0.2685,
"step": 4040
},
{
"epoch": 0.8646857631466439,
"grad_norm": 5.029911518096924,
"learning_rate": 1.2617577197149644e-05,
"loss": 0.2984,
"step": 4045
},
{
"epoch": 0.8657545959811885,
"grad_norm": 4.78744649887085,
"learning_rate": 1.2605700712589074e-05,
"loss": 0.2321,
"step": 4050
},
{
"epoch": 0.8668234288157333,
"grad_norm": 3.825188636779785,
"learning_rate": 1.2593824228028503e-05,
"loss": 0.2417,
"step": 4055
},
{
"epoch": 0.8678922616502779,
"grad_norm": 4.478353500366211,
"learning_rate": 1.2581947743467936e-05,
"loss": 0.3164,
"step": 4060
},
{
"epoch": 0.8689610944848226,
"grad_norm": 5.523867607116699,
"learning_rate": 1.2570071258907364e-05,
"loss": 0.3769,
"step": 4065
},
{
"epoch": 0.8700299273193672,
"grad_norm": 6.190155506134033,
"learning_rate": 1.2558194774346795e-05,
"loss": 0.3385,
"step": 4070
},
{
"epoch": 0.871098760153912,
"grad_norm": 4.058770179748535,
"learning_rate": 1.2546318289786224e-05,
"loss": 0.3135,
"step": 4075
},
{
"epoch": 0.8721675929884566,
"grad_norm": 5.607039928436279,
"learning_rate": 1.2534441805225654e-05,
"loss": 0.3295,
"step": 4080
},
{
"epoch": 0.8732364258230013,
"grad_norm": 4.902414321899414,
"learning_rate": 1.2522565320665083e-05,
"loss": 0.2992,
"step": 4085
},
{
"epoch": 0.8743052586575459,
"grad_norm": 4.188961505889893,
"learning_rate": 1.2510688836104514e-05,
"loss": 0.2723,
"step": 4090
},
{
"epoch": 0.8753740914920907,
"grad_norm": 4.536145210266113,
"learning_rate": 1.2498812351543943e-05,
"loss": 0.2805,
"step": 4095
},
{
"epoch": 0.8764429243266353,
"grad_norm": 3.7727832794189453,
"learning_rate": 1.2486935866983373e-05,
"loss": 0.2171,
"step": 4100
},
{
"epoch": 0.87751175716118,
"grad_norm": 4.528228759765625,
"learning_rate": 1.2475059382422802e-05,
"loss": 0.2618,
"step": 4105
},
{
"epoch": 0.8785805899957246,
"grad_norm": 4.920950412750244,
"learning_rate": 1.2463182897862234e-05,
"loss": 0.2994,
"step": 4110
},
{
"epoch": 0.8796494228302694,
"grad_norm": 4.851797580718994,
"learning_rate": 1.2451306413301665e-05,
"loss": 0.2866,
"step": 4115
},
{
"epoch": 0.880718255664814,
"grad_norm": 3.021509885787964,
"learning_rate": 1.2439429928741094e-05,
"loss": 0.2091,
"step": 4120
},
{
"epoch": 0.8817870884993587,
"grad_norm": 5.19913911819458,
"learning_rate": 1.2427553444180524e-05,
"loss": 0.3285,
"step": 4125
},
{
"epoch": 0.8828559213339033,
"grad_norm": 4.311760902404785,
"learning_rate": 1.2415676959619953e-05,
"loss": 0.2854,
"step": 4130
},
{
"epoch": 0.8839247541684481,
"grad_norm": 5.5093994140625,
"learning_rate": 1.2403800475059384e-05,
"loss": 0.3004,
"step": 4135
},
{
"epoch": 0.8849935870029927,
"grad_norm": 3.5908706188201904,
"learning_rate": 1.2391923990498813e-05,
"loss": 0.2335,
"step": 4140
},
{
"epoch": 0.8860624198375374,
"grad_norm": 3.561647653579712,
"learning_rate": 1.2380047505938245e-05,
"loss": 0.2919,
"step": 4145
},
{
"epoch": 0.887131252672082,
"grad_norm": 3.1781160831451416,
"learning_rate": 1.2368171021377674e-05,
"loss": 0.2786,
"step": 4150
},
{
"epoch": 0.8882000855066268,
"grad_norm": 4.471413612365723,
"learning_rate": 1.2356294536817104e-05,
"loss": 0.3312,
"step": 4155
},
{
"epoch": 0.8892689183411714,
"grad_norm": 5.232965469360352,
"learning_rate": 1.2344418052256533e-05,
"loss": 0.2677,
"step": 4160
},
{
"epoch": 0.8903377511757161,
"grad_norm": 4.883133888244629,
"learning_rate": 1.2332541567695964e-05,
"loss": 0.2774,
"step": 4165
},
{
"epoch": 0.8914065840102608,
"grad_norm": 4.092249393463135,
"learning_rate": 1.2320665083135392e-05,
"loss": 0.2649,
"step": 4170
},
{
"epoch": 0.8924754168448055,
"grad_norm": 3.5607283115386963,
"learning_rate": 1.2308788598574823e-05,
"loss": 0.3119,
"step": 4175
},
{
"epoch": 0.8935442496793502,
"grad_norm": 4.573966026306152,
"learning_rate": 1.2296912114014252e-05,
"loss": 0.243,
"step": 4180
},
{
"epoch": 0.8946130825138948,
"grad_norm": 4.2962775230407715,
"learning_rate": 1.2285035629453684e-05,
"loss": 0.292,
"step": 4185
},
{
"epoch": 0.8956819153484396,
"grad_norm": 4.585544109344482,
"learning_rate": 1.2273159144893111e-05,
"loss": 0.352,
"step": 4190
},
{
"epoch": 0.8967507481829842,
"grad_norm": 4.529600143432617,
"learning_rate": 1.2261282660332543e-05,
"loss": 0.2422,
"step": 4195
},
{
"epoch": 0.8978195810175289,
"grad_norm": 2.9587581157684326,
"learning_rate": 1.2249406175771972e-05,
"loss": 0.2427,
"step": 4200
},
{
"epoch": 0.8988884138520735,
"grad_norm": 4.409660339355469,
"learning_rate": 1.2237529691211403e-05,
"loss": 0.2246,
"step": 4205
},
{
"epoch": 0.8999572466866183,
"grad_norm": 3.328666925430298,
"learning_rate": 1.2225653206650832e-05,
"loss": 0.2275,
"step": 4210
},
{
"epoch": 0.9010260795211629,
"grad_norm": 4.411447048187256,
"learning_rate": 1.2213776722090262e-05,
"loss": 0.3766,
"step": 4215
},
{
"epoch": 0.9020949123557076,
"grad_norm": 3.3779454231262207,
"learning_rate": 1.2201900237529691e-05,
"loss": 0.2748,
"step": 4220
},
{
"epoch": 0.9031637451902522,
"grad_norm": 5.558443069458008,
"learning_rate": 1.2190023752969122e-05,
"loss": 0.2941,
"step": 4225
},
{
"epoch": 0.904232578024797,
"grad_norm": 3.7313380241394043,
"learning_rate": 1.217814726840855e-05,
"loss": 0.2693,
"step": 4230
},
{
"epoch": 0.9053014108593416,
"grad_norm": 3.5401077270507812,
"learning_rate": 1.2166270783847983e-05,
"loss": 0.3058,
"step": 4235
},
{
"epoch": 0.9063702436938863,
"grad_norm": 3.6305854320526123,
"learning_rate": 1.2154394299287412e-05,
"loss": 0.2167,
"step": 4240
},
{
"epoch": 0.9074390765284309,
"grad_norm": 4.208883285522461,
"learning_rate": 1.2142517814726842e-05,
"loss": 0.2371,
"step": 4245
},
{
"epoch": 0.9085079093629757,
"grad_norm": 4.586354732513428,
"learning_rate": 1.2130641330166273e-05,
"loss": 0.2199,
"step": 4250
},
{
"epoch": 0.9095767421975203,
"grad_norm": 3.673724889755249,
"learning_rate": 1.2118764845605702e-05,
"loss": 0.2803,
"step": 4255
},
{
"epoch": 0.910645575032065,
"grad_norm": 4.0301337242126465,
"learning_rate": 1.2106888361045132e-05,
"loss": 0.2765,
"step": 4260
},
{
"epoch": 0.9117144078666096,
"grad_norm": 4.114202976226807,
"learning_rate": 1.2095011876484561e-05,
"loss": 0.2734,
"step": 4265
},
{
"epoch": 0.9127832407011544,
"grad_norm": 6.415131568908691,
"learning_rate": 1.2083135391923993e-05,
"loss": 0.3345,
"step": 4270
},
{
"epoch": 0.913852073535699,
"grad_norm": 4.800512790679932,
"learning_rate": 1.207125890736342e-05,
"loss": 0.2873,
"step": 4275
},
{
"epoch": 0.9149209063702437,
"grad_norm": 4.536464214324951,
"learning_rate": 1.2059382422802853e-05,
"loss": 0.2506,
"step": 4280
},
{
"epoch": 0.9159897392047883,
"grad_norm": 4.594064235687256,
"learning_rate": 1.2047505938242281e-05,
"loss": 0.2335,
"step": 4285
},
{
"epoch": 0.917058572039333,
"grad_norm": 5.493027687072754,
"learning_rate": 1.2035629453681712e-05,
"loss": 0.3218,
"step": 4290
},
{
"epoch": 0.9181274048738777,
"grad_norm": 4.560657501220703,
"learning_rate": 1.2023752969121141e-05,
"loss": 0.2971,
"step": 4295
},
{
"epoch": 0.9191962377084224,
"grad_norm": 3.5777430534362793,
"learning_rate": 1.2011876484560571e-05,
"loss": 0.2296,
"step": 4300
},
{
"epoch": 0.9202650705429671,
"grad_norm": 4.112082481384277,
"learning_rate": 1.2e-05,
"loss": 0.3087,
"step": 4305
},
{
"epoch": 0.9213339033775118,
"grad_norm": 3.815093994140625,
"learning_rate": 1.1988123515439431e-05,
"loss": 0.3353,
"step": 4310
},
{
"epoch": 0.9224027362120565,
"grad_norm": 5.078567028045654,
"learning_rate": 1.197624703087886e-05,
"loss": 0.3046,
"step": 4315
},
{
"epoch": 0.9234715690466011,
"grad_norm": 3.549429178237915,
"learning_rate": 1.1964370546318292e-05,
"loss": 0.3431,
"step": 4320
},
{
"epoch": 0.9245404018811458,
"grad_norm": 4.466531276702881,
"learning_rate": 1.195249406175772e-05,
"loss": 0.2707,
"step": 4325
},
{
"epoch": 0.9256092347156905,
"grad_norm": 5.423553943634033,
"learning_rate": 1.1940617577197151e-05,
"loss": 0.284,
"step": 4330
},
{
"epoch": 0.9266780675502352,
"grad_norm": 4.436051845550537,
"learning_rate": 1.192874109263658e-05,
"loss": 0.2714,
"step": 4335
},
{
"epoch": 0.9277469003847798,
"grad_norm": 4.404295444488525,
"learning_rate": 1.191686460807601e-05,
"loss": 0.2751,
"step": 4340
},
{
"epoch": 0.9288157332193245,
"grad_norm": 4.390391826629639,
"learning_rate": 1.190498812351544e-05,
"loss": 0.3047,
"step": 4345
},
{
"epoch": 0.9298845660538692,
"grad_norm": 4.6937479972839355,
"learning_rate": 1.189311163895487e-05,
"loss": 0.2867,
"step": 4350
},
{
"epoch": 0.9309533988884139,
"grad_norm": 4.352549076080322,
"learning_rate": 1.1881235154394299e-05,
"loss": 0.2895,
"step": 4355
},
{
"epoch": 0.9320222317229585,
"grad_norm": 4.013473033905029,
"learning_rate": 1.186935866983373e-05,
"loss": 0.2749,
"step": 4360
},
{
"epoch": 0.9330910645575032,
"grad_norm": 3.603860378265381,
"learning_rate": 1.1857482185273158e-05,
"loss": 0.2549,
"step": 4365
},
{
"epoch": 0.9341598973920479,
"grad_norm": 5.079062461853027,
"learning_rate": 1.184560570071259e-05,
"loss": 0.2648,
"step": 4370
},
{
"epoch": 0.9352287302265926,
"grad_norm": 6.029326438903809,
"learning_rate": 1.183372921615202e-05,
"loss": 0.3001,
"step": 4375
},
{
"epoch": 0.9362975630611372,
"grad_norm": 4.8559041023254395,
"learning_rate": 1.182185273159145e-05,
"loss": 0.3198,
"step": 4380
},
{
"epoch": 0.9373663958956819,
"grad_norm": 4.295980453491211,
"learning_rate": 1.1809976247030879e-05,
"loss": 0.2583,
"step": 4385
},
{
"epoch": 0.9384352287302266,
"grad_norm": 6.648914337158203,
"learning_rate": 1.179809976247031e-05,
"loss": 0.2894,
"step": 4390
},
{
"epoch": 0.9395040615647713,
"grad_norm": 5.454647064208984,
"learning_rate": 1.178622327790974e-05,
"loss": 0.3017,
"step": 4395
},
{
"epoch": 0.9405728943993159,
"grad_norm": 5.520369529724121,
"learning_rate": 1.1774346793349169e-05,
"loss": 0.2754,
"step": 4400
},
{
"epoch": 0.9416417272338606,
"grad_norm": 3.847935914993286,
"learning_rate": 1.1762470308788601e-05,
"loss": 0.3289,
"step": 4405
},
{
"epoch": 0.9427105600684053,
"grad_norm": 4.063333988189697,
"learning_rate": 1.175059382422803e-05,
"loss": 0.2787,
"step": 4410
},
{
"epoch": 0.94377939290295,
"grad_norm": 4.977645397186279,
"learning_rate": 1.173871733966746e-05,
"loss": 0.2406,
"step": 4415
},
{
"epoch": 0.9448482257374946,
"grad_norm": 4.375988483428955,
"learning_rate": 1.172684085510689e-05,
"loss": 0.3144,
"step": 4420
},
{
"epoch": 0.9459170585720393,
"grad_norm": 4.656064987182617,
"learning_rate": 1.171496437054632e-05,
"loss": 0.3237,
"step": 4425
},
{
"epoch": 0.946985891406584,
"grad_norm": 4.027129650115967,
"learning_rate": 1.1703087885985749e-05,
"loss": 0.2641,
"step": 4430
},
{
"epoch": 0.9480547242411287,
"grad_norm": 4.126834869384766,
"learning_rate": 1.169121140142518e-05,
"loss": 0.2875,
"step": 4435
},
{
"epoch": 0.9491235570756734,
"grad_norm": 3.4707841873168945,
"learning_rate": 1.1679334916864608e-05,
"loss": 0.3211,
"step": 4440
},
{
"epoch": 0.950192389910218,
"grad_norm": 2.8617501258850098,
"learning_rate": 1.166745843230404e-05,
"loss": 0.2403,
"step": 4445
},
{
"epoch": 0.9512612227447628,
"grad_norm": 4.50408935546875,
"learning_rate": 1.1655581947743468e-05,
"loss": 0.3018,
"step": 4450
},
{
"epoch": 0.9523300555793074,
"grad_norm": 3.976015329360962,
"learning_rate": 1.16437054631829e-05,
"loss": 0.2531,
"step": 4455
},
{
"epoch": 0.9533988884138521,
"grad_norm": 6.214652061462402,
"learning_rate": 1.1631828978622329e-05,
"loss": 0.349,
"step": 4460
},
{
"epoch": 0.9544677212483967,
"grad_norm": 3.969996929168701,
"learning_rate": 1.161995249406176e-05,
"loss": 0.238,
"step": 4465
},
{
"epoch": 0.9555365540829415,
"grad_norm": 3.9902470111846924,
"learning_rate": 1.1608076009501188e-05,
"loss": 0.2768,
"step": 4470
},
{
"epoch": 0.9566053869174861,
"grad_norm": 4.20414924621582,
"learning_rate": 1.1596199524940619e-05,
"loss": 0.2944,
"step": 4475
},
{
"epoch": 0.9576742197520308,
"grad_norm": 3.5199337005615234,
"learning_rate": 1.1584323040380048e-05,
"loss": 0.3043,
"step": 4480
},
{
"epoch": 0.9587430525865754,
"grad_norm": 3.7765684127807617,
"learning_rate": 1.1572446555819478e-05,
"loss": 0.2434,
"step": 4485
},
{
"epoch": 0.9598118854211202,
"grad_norm": 3.9338152408599854,
"learning_rate": 1.1560570071258907e-05,
"loss": 0.2451,
"step": 4490
},
{
"epoch": 0.9608807182556648,
"grad_norm": 2.86897873878479,
"learning_rate": 1.154869358669834e-05,
"loss": 0.213,
"step": 4495
},
{
"epoch": 0.9619495510902095,
"grad_norm": 4.536627292633057,
"learning_rate": 1.1536817102137768e-05,
"loss": 0.26,
"step": 4500
},
{
"epoch": 0.9630183839247541,
"grad_norm": 5.863621234893799,
"learning_rate": 1.1524940617577199e-05,
"loss": 0.3173,
"step": 4505
},
{
"epoch": 0.9640872167592989,
"grad_norm": 5.156888008117676,
"learning_rate": 1.1513064133016627e-05,
"loss": 0.2745,
"step": 4510
},
{
"epoch": 0.9651560495938435,
"grad_norm": 3.947845220565796,
"learning_rate": 1.1501187648456058e-05,
"loss": 0.2824,
"step": 4515
},
{
"epoch": 0.9662248824283882,
"grad_norm": 3.6855573654174805,
"learning_rate": 1.1489311163895487e-05,
"loss": 0.2769,
"step": 4520
},
{
"epoch": 0.9672937152629328,
"grad_norm": 3.929898977279663,
"learning_rate": 1.1477434679334917e-05,
"loss": 0.2464,
"step": 4525
},
{
"epoch": 0.9683625480974776,
"grad_norm": 3.9288270473480225,
"learning_rate": 1.146555819477435e-05,
"loss": 0.3213,
"step": 4530
},
{
"epoch": 0.9694313809320222,
"grad_norm": 5.536011219024658,
"learning_rate": 1.1453681710213777e-05,
"loss": 0.3606,
"step": 4535
},
{
"epoch": 0.9705002137665669,
"grad_norm": 3.3420379161834717,
"learning_rate": 1.1441805225653209e-05,
"loss": 0.2183,
"step": 4540
},
{
"epoch": 0.9715690466011115,
"grad_norm": 3.492932081222534,
"learning_rate": 1.1429928741092638e-05,
"loss": 0.2567,
"step": 4545
},
{
"epoch": 0.9726378794356563,
"grad_norm": 5.132521629333496,
"learning_rate": 1.1418052256532068e-05,
"loss": 0.2521,
"step": 4550
},
{
"epoch": 0.9737067122702009,
"grad_norm": 4.512472152709961,
"learning_rate": 1.1406175771971497e-05,
"loss": 0.2696,
"step": 4555
},
{
"epoch": 0.9747755451047456,
"grad_norm": 5.246362686157227,
"learning_rate": 1.1394299287410928e-05,
"loss": 0.3409,
"step": 4560
},
{
"epoch": 0.9758443779392902,
"grad_norm": 4.033038139343262,
"learning_rate": 1.1382422802850357e-05,
"loss": 0.2732,
"step": 4565
},
{
"epoch": 0.976913210773835,
"grad_norm": 4.162726879119873,
"learning_rate": 1.1370546318289787e-05,
"loss": 0.3003,
"step": 4570
},
{
"epoch": 0.9779820436083797,
"grad_norm": 5.6553730964660645,
"learning_rate": 1.1358669833729216e-05,
"loss": 0.3426,
"step": 4575
},
{
"epoch": 0.9790508764429243,
"grad_norm": 3.857776403427124,
"learning_rate": 1.1346793349168648e-05,
"loss": 0.2873,
"step": 4580
},
{
"epoch": 0.980119709277469,
"grad_norm": 4.109443187713623,
"learning_rate": 1.1334916864608077e-05,
"loss": 0.3,
"step": 4585
},
{
"epoch": 0.9811885421120137,
"grad_norm": 3.3073673248291016,
"learning_rate": 1.1323040380047508e-05,
"loss": 0.2074,
"step": 4590
},
{
"epoch": 0.9822573749465584,
"grad_norm": 3.0706233978271484,
"learning_rate": 1.1311163895486937e-05,
"loss": 0.2521,
"step": 4595
},
{
"epoch": 0.983326207781103,
"grad_norm": 5.8296356201171875,
"learning_rate": 1.1299287410926367e-05,
"loss": 0.3123,
"step": 4600
},
{
"epoch": 0.9843950406156478,
"grad_norm": 3.409862995147705,
"learning_rate": 1.1287410926365796e-05,
"loss": 0.2492,
"step": 4605
},
{
"epoch": 0.9854638734501924,
"grad_norm": 5.090631008148193,
"learning_rate": 1.1275534441805227e-05,
"loss": 0.3012,
"step": 4610
},
{
"epoch": 0.9865327062847371,
"grad_norm": 6.443350315093994,
"learning_rate": 1.1263657957244655e-05,
"loss": 0.2516,
"step": 4615
},
{
"epoch": 0.9876015391192817,
"grad_norm": 4.340301513671875,
"learning_rate": 1.1251781472684088e-05,
"loss": 0.3629,
"step": 4620
},
{
"epoch": 0.9886703719538265,
"grad_norm": 4.117158889770508,
"learning_rate": 1.1239904988123515e-05,
"loss": 0.2484,
"step": 4625
},
{
"epoch": 0.9897392047883711,
"grad_norm": 4.39588737487793,
"learning_rate": 1.1228028503562947e-05,
"loss": 0.2749,
"step": 4630
},
{
"epoch": 0.9908080376229158,
"grad_norm": 4.059388637542725,
"learning_rate": 1.1216152019002376e-05,
"loss": 0.2064,
"step": 4635
},
{
"epoch": 0.9918768704574604,
"grad_norm": 3.4412331581115723,
"learning_rate": 1.1204275534441806e-05,
"loss": 0.3089,
"step": 4640
},
{
"epoch": 0.9929457032920052,
"grad_norm": 4.691385746002197,
"learning_rate": 1.1192399049881235e-05,
"loss": 0.3145,
"step": 4645
},
{
"epoch": 0.9940145361265498,
"grad_norm": 3.472172737121582,
"learning_rate": 1.1180522565320666e-05,
"loss": 0.2357,
"step": 4650
},
{
"epoch": 0.9950833689610945,
"grad_norm": 4.1867289543151855,
"learning_rate": 1.1168646080760095e-05,
"loss": 0.2803,
"step": 4655
},
{
"epoch": 0.9961522017956391,
"grad_norm": 4.0518083572387695,
"learning_rate": 1.1156769596199525e-05,
"loss": 0.2437,
"step": 4660
},
{
"epoch": 0.9972210346301839,
"grad_norm": 3.507197141647339,
"learning_rate": 1.1144893111638954e-05,
"loss": 0.2708,
"step": 4665
},
{
"epoch": 0.9982898674647285,
"grad_norm": 5.1572585105896,
"learning_rate": 1.1133016627078386e-05,
"loss": 0.253,
"step": 4670
},
{
"epoch": 0.9993587002992732,
"grad_norm": 4.823436737060547,
"learning_rate": 1.1121140142517817e-05,
"loss": 0.2219,
"step": 4675
},
{
"epoch": 1.0,
"eval_loss": 0.1271175593137741,
"eval_mrr": 0.9770190895741555,
"eval_runtime": 313.9716,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 0.905,
"step": 4678
},
{
"epoch": 1.000427533133818,
"grad_norm": 5.583497047424316,
"learning_rate": 1.1109263657957246e-05,
"loss": 0.2621,
"step": 4680
},
{
"epoch": 1.0014963659683624,
"grad_norm": 4.658013343811035,
"learning_rate": 1.1097387173396676e-05,
"loss": 0.382,
"step": 4685
},
{
"epoch": 1.0025651988029072,
"grad_norm": 3.0044312477111816,
"learning_rate": 1.1085510688836105e-05,
"loss": 0.3026,
"step": 4690
},
{
"epoch": 1.003634031637452,
"grad_norm": 4.063423156738281,
"learning_rate": 1.1073634204275536e-05,
"loss": 0.3643,
"step": 4695
},
{
"epoch": 1.0047028644719966,
"grad_norm": 4.625239372253418,
"learning_rate": 1.1061757719714965e-05,
"loss": 0.382,
"step": 4700
},
{
"epoch": 1.0057716973065411,
"grad_norm": 3.8251540660858154,
"learning_rate": 1.1049881235154397e-05,
"loss": 0.3082,
"step": 4705
},
{
"epoch": 1.0068405301410859,
"grad_norm": 4.241628170013428,
"learning_rate": 1.1038004750593824e-05,
"loss": 0.3411,
"step": 4710
},
{
"epoch": 1.0079093629756306,
"grad_norm": 5.6527276039123535,
"learning_rate": 1.1026128266033256e-05,
"loss": 0.317,
"step": 4715
},
{
"epoch": 1.0089781958101753,
"grad_norm": 5.0404052734375,
"learning_rate": 1.1014251781472685e-05,
"loss": 0.4396,
"step": 4720
},
{
"epoch": 1.01004702864472,
"grad_norm": 4.585846900939941,
"learning_rate": 1.1002375296912116e-05,
"loss": 0.4034,
"step": 4725
},
{
"epoch": 1.0111158614792646,
"grad_norm": 4.704357624053955,
"learning_rate": 1.0990498812351544e-05,
"loss": 0.2875,
"step": 4730
},
{
"epoch": 1.0121846943138093,
"grad_norm": 5.956788063049316,
"learning_rate": 1.0978622327790975e-05,
"loss": 0.4919,
"step": 4735
},
{
"epoch": 1.013253527148354,
"grad_norm": 4.240102291107178,
"learning_rate": 1.0966745843230404e-05,
"loss": 0.3118,
"step": 4740
},
{
"epoch": 1.0143223599828988,
"grad_norm": 4.7897515296936035,
"learning_rate": 1.0954869358669834e-05,
"loss": 0.3976,
"step": 4745
},
{
"epoch": 1.0153911928174433,
"grad_norm": 3.1631078720092773,
"learning_rate": 1.0942992874109263e-05,
"loss": 0.2919,
"step": 4750
},
{
"epoch": 1.016460025651988,
"grad_norm": 4.258396148681641,
"learning_rate": 1.0931116389548696e-05,
"loss": 0.5247,
"step": 4755
},
{
"epoch": 1.0175288584865327,
"grad_norm": 3.010542392730713,
"learning_rate": 1.0919239904988124e-05,
"loss": 0.2126,
"step": 4760
},
{
"epoch": 1.0185976913210775,
"grad_norm": 3.0874409675598145,
"learning_rate": 1.0907363420427555e-05,
"loss": 0.3455,
"step": 4765
},
{
"epoch": 1.019666524155622,
"grad_norm": 4.446132183074951,
"learning_rate": 1.0895486935866984e-05,
"loss": 0.3498,
"step": 4770
},
{
"epoch": 1.0207353569901667,
"grad_norm": 4.1357502937316895,
"learning_rate": 1.0883610451306414e-05,
"loss": 0.29,
"step": 4775
},
{
"epoch": 1.0218041898247114,
"grad_norm": 6.850640296936035,
"learning_rate": 1.0871733966745843e-05,
"loss": 0.3684,
"step": 4780
},
{
"epoch": 1.0228730226592562,
"grad_norm": 3.9681396484375,
"learning_rate": 1.0859857482185274e-05,
"loss": 0.3033,
"step": 4785
},
{
"epoch": 1.0239418554938007,
"grad_norm": 3.521563768386841,
"learning_rate": 1.0847980997624703e-05,
"loss": 0.2747,
"step": 4790
},
{
"epoch": 1.0250106883283454,
"grad_norm": 4.060203552246094,
"learning_rate": 1.0836104513064135e-05,
"loss": 0.2813,
"step": 4795
},
{
"epoch": 1.0260795211628901,
"grad_norm": 3.187224864959717,
"learning_rate": 1.0824228028503562e-05,
"loss": 0.2945,
"step": 4800
},
{
"epoch": 1.0271483539974349,
"grad_norm": 3.6413896083831787,
"learning_rate": 1.0812351543942994e-05,
"loss": 0.421,
"step": 4805
},
{
"epoch": 1.0282171868319794,
"grad_norm": 4.686298847198486,
"learning_rate": 1.0800475059382423e-05,
"loss": 0.3365,
"step": 4810
},
{
"epoch": 1.029286019666524,
"grad_norm": 5.890400409698486,
"learning_rate": 1.0788598574821854e-05,
"loss": 0.3708,
"step": 4815
},
{
"epoch": 1.0303548525010688,
"grad_norm": 3.566652774810791,
"learning_rate": 1.0776722090261284e-05,
"loss": 0.4345,
"step": 4820
},
{
"epoch": 1.0314236853356136,
"grad_norm": 5.6578826904296875,
"learning_rate": 1.0764845605700713e-05,
"loss": 0.3728,
"step": 4825
},
{
"epoch": 1.032492518170158,
"grad_norm": 4.193053245544434,
"learning_rate": 1.0752969121140144e-05,
"loss": 0.3173,
"step": 4830
},
{
"epoch": 1.0335613510047028,
"grad_norm": 4.646356105804443,
"learning_rate": 1.0741092636579572e-05,
"loss": 0.2574,
"step": 4835
},
{
"epoch": 1.0346301838392475,
"grad_norm": 3.941087245941162,
"learning_rate": 1.0729216152019005e-05,
"loss": 0.3128,
"step": 4840
},
{
"epoch": 1.0356990166737923,
"grad_norm": 4.5648884773254395,
"learning_rate": 1.0717339667458434e-05,
"loss": 0.2542,
"step": 4845
},
{
"epoch": 1.036767849508337,
"grad_norm": 3.661923408508301,
"learning_rate": 1.0705463182897864e-05,
"loss": 0.1649,
"step": 4850
},
{
"epoch": 1.0378366823428815,
"grad_norm": 5.052914619445801,
"learning_rate": 1.0693586698337293e-05,
"loss": 0.4091,
"step": 4855
},
{
"epoch": 1.0389055151774262,
"grad_norm": 5.769303321838379,
"learning_rate": 1.0681710213776724e-05,
"loss": 0.3553,
"step": 4860
},
{
"epoch": 1.039974348011971,
"grad_norm": 8.323318481445312,
"learning_rate": 1.0669833729216152e-05,
"loss": 0.5474,
"step": 4865
},
{
"epoch": 1.0410431808465157,
"grad_norm": 5.351403713226318,
"learning_rate": 1.0657957244655583e-05,
"loss": 0.3586,
"step": 4870
},
{
"epoch": 1.0421120136810602,
"grad_norm": 3.5083069801330566,
"learning_rate": 1.0646080760095012e-05,
"loss": 0.2589,
"step": 4875
},
{
"epoch": 1.043180846515605,
"grad_norm": 3.8574445247650146,
"learning_rate": 1.0634204275534444e-05,
"loss": 0.3143,
"step": 4880
},
{
"epoch": 1.0442496793501497,
"grad_norm": 3.950756311416626,
"learning_rate": 1.0622327790973871e-05,
"loss": 0.3248,
"step": 4885
},
{
"epoch": 1.0453185121846944,
"grad_norm": 5.606834411621094,
"learning_rate": 1.0610451306413303e-05,
"loss": 0.4631,
"step": 4890
},
{
"epoch": 1.046387345019239,
"grad_norm": 4.092567443847656,
"learning_rate": 1.0598574821852732e-05,
"loss": 0.2977,
"step": 4895
},
{
"epoch": 1.0474561778537836,
"grad_norm": 5.365922451019287,
"learning_rate": 1.0586698337292163e-05,
"loss": 0.2487,
"step": 4900
},
{
"epoch": 1.0485250106883284,
"grad_norm": 5.173450946807861,
"learning_rate": 1.0574821852731592e-05,
"loss": 0.3554,
"step": 4905
},
{
"epoch": 1.049593843522873,
"grad_norm": 5.21553373336792,
"learning_rate": 1.0562945368171022e-05,
"loss": 0.3579,
"step": 4910
},
{
"epoch": 1.0506626763574176,
"grad_norm": 4.973548889160156,
"learning_rate": 1.0551068883610451e-05,
"loss": 0.3562,
"step": 4915
},
{
"epoch": 1.0517315091919623,
"grad_norm": 6.216787815093994,
"learning_rate": 1.0539192399049882e-05,
"loss": 0.4625,
"step": 4920
},
{
"epoch": 1.052800342026507,
"grad_norm": 4.355904579162598,
"learning_rate": 1.052731591448931e-05,
"loss": 0.2834,
"step": 4925
},
{
"epoch": 1.0538691748610518,
"grad_norm": 4.44107723236084,
"learning_rate": 1.0515439429928743e-05,
"loss": 0.4072,
"step": 4930
},
{
"epoch": 1.0549380076955963,
"grad_norm": 5.2141289710998535,
"learning_rate": 1.0503562945368172e-05,
"loss": 0.2831,
"step": 4935
},
{
"epoch": 1.056006840530141,
"grad_norm": 4.4729228019714355,
"learning_rate": 1.0491686460807602e-05,
"loss": 0.2642,
"step": 4940
},
{
"epoch": 1.0570756733646858,
"grad_norm": 4.615827560424805,
"learning_rate": 1.0479809976247031e-05,
"loss": 0.2887,
"step": 4945
},
{
"epoch": 1.0581445061992305,
"grad_norm": 4.060108661651611,
"learning_rate": 1.0467933491686462e-05,
"loss": 0.4394,
"step": 4950
},
{
"epoch": 1.059213339033775,
"grad_norm": 3.323357582092285,
"learning_rate": 1.0456057007125892e-05,
"loss": 0.2957,
"step": 4955
},
{
"epoch": 1.0602821718683197,
"grad_norm": 3.9010369777679443,
"learning_rate": 1.0444180522565321e-05,
"loss": 0.298,
"step": 4960
},
{
"epoch": 1.0613510047028645,
"grad_norm": 4.847980976104736,
"learning_rate": 1.0432304038004753e-05,
"loss": 0.2643,
"step": 4965
},
{
"epoch": 1.0624198375374092,
"grad_norm": 4.916622638702393,
"learning_rate": 1.0420427553444182e-05,
"loss": 0.3147,
"step": 4970
},
{
"epoch": 1.063488670371954,
"grad_norm": 6.059121131896973,
"learning_rate": 1.0408551068883613e-05,
"loss": 0.3433,
"step": 4975
},
{
"epoch": 1.0645575032064984,
"grad_norm": 4.212458610534668,
"learning_rate": 1.0396674584323041e-05,
"loss": 0.2797,
"step": 4980
},
{
"epoch": 1.0656263360410432,
"grad_norm": 3.9374332427978516,
"learning_rate": 1.0384798099762472e-05,
"loss": 0.2958,
"step": 4985
},
{
"epoch": 1.066695168875588,
"grad_norm": 7.207469940185547,
"learning_rate": 1.0372921615201901e-05,
"loss": 0.3972,
"step": 4990
},
{
"epoch": 1.0677640017101326,
"grad_norm": 5.122316360473633,
"learning_rate": 1.0361045130641331e-05,
"loss": 0.3361,
"step": 4995
},
{
"epoch": 1.0688328345446771,
"grad_norm": 3.4103052616119385,
"learning_rate": 1.034916864608076e-05,
"loss": 0.3174,
"step": 5000
},
{
"epoch": 1.0699016673792219,
"grad_norm": 4.129265308380127,
"learning_rate": 1.033729216152019e-05,
"loss": 0.3346,
"step": 5005
},
{
"epoch": 1.0709705002137666,
"grad_norm": 4.027009963989258,
"learning_rate": 1.032541567695962e-05,
"loss": 0.326,
"step": 5010
},
{
"epoch": 1.0720393330483113,
"grad_norm": 3.362579822540283,
"learning_rate": 1.0313539192399052e-05,
"loss": 0.2975,
"step": 5015
},
{
"epoch": 1.0731081658828558,
"grad_norm": 5.225454330444336,
"learning_rate": 1.030166270783848e-05,
"loss": 0.3303,
"step": 5020
},
{
"epoch": 1.0741769987174006,
"grad_norm": 3.756742000579834,
"learning_rate": 1.0289786223277911e-05,
"loss": 0.2285,
"step": 5025
},
{
"epoch": 1.0752458315519453,
"grad_norm": 4.867086887359619,
"learning_rate": 1.027790973871734e-05,
"loss": 0.3798,
"step": 5030
},
{
"epoch": 1.07631466438649,
"grad_norm": 4.204124927520752,
"learning_rate": 1.026603325415677e-05,
"loss": 0.2882,
"step": 5035
},
{
"epoch": 1.0773834972210345,
"grad_norm": 4.995541095733643,
"learning_rate": 1.02541567695962e-05,
"loss": 0.4249,
"step": 5040
},
{
"epoch": 1.0784523300555793,
"grad_norm": 4.921726226806641,
"learning_rate": 1.024228028503563e-05,
"loss": 0.4139,
"step": 5045
},
{
"epoch": 1.079521162890124,
"grad_norm": 5.5460734367370605,
"learning_rate": 1.0230403800475059e-05,
"loss": 0.4502,
"step": 5050
},
{
"epoch": 1.0805899957246687,
"grad_norm": 4.828423023223877,
"learning_rate": 1.0218527315914491e-05,
"loss": 0.3521,
"step": 5055
},
{
"epoch": 1.0816588285592132,
"grad_norm": 3.87648344039917,
"learning_rate": 1.0206650831353918e-05,
"loss": 0.342,
"step": 5060
},
{
"epoch": 1.082727661393758,
"grad_norm": 4.833287715911865,
"learning_rate": 1.019477434679335e-05,
"loss": 0.294,
"step": 5065
},
{
"epoch": 1.0837964942283027,
"grad_norm": 4.559665679931641,
"learning_rate": 1.018289786223278e-05,
"loss": 0.2994,
"step": 5070
},
{
"epoch": 1.0848653270628474,
"grad_norm": 4.908376216888428,
"learning_rate": 1.017102137767221e-05,
"loss": 0.3467,
"step": 5075
},
{
"epoch": 1.085934159897392,
"grad_norm": 6.717689514160156,
"learning_rate": 1.0159144893111639e-05,
"loss": 0.443,
"step": 5080
},
{
"epoch": 1.0870029927319367,
"grad_norm": 3.5398693084716797,
"learning_rate": 1.014726840855107e-05,
"loss": 0.2759,
"step": 5085
},
{
"epoch": 1.0880718255664814,
"grad_norm": 4.501621246337891,
"learning_rate": 1.0135391923990498e-05,
"loss": 0.2614,
"step": 5090
},
{
"epoch": 1.0891406584010261,
"grad_norm": 5.194151401519775,
"learning_rate": 1.0123515439429929e-05,
"loss": 0.3649,
"step": 5095
},
{
"epoch": 1.0902094912355706,
"grad_norm": 4.68430757522583,
"learning_rate": 1.0111638954869361e-05,
"loss": 0.3699,
"step": 5100
},
{
"epoch": 1.0912783240701154,
"grad_norm": 6.266822814941406,
"learning_rate": 1.009976247030879e-05,
"loss": 0.3785,
"step": 5105
},
{
"epoch": 1.09234715690466,
"grad_norm": 5.040585517883301,
"learning_rate": 1.008788598574822e-05,
"loss": 0.325,
"step": 5110
},
{
"epoch": 1.0934159897392048,
"grad_norm": 6.65608024597168,
"learning_rate": 1.007600950118765e-05,
"loss": 0.3954,
"step": 5115
},
{
"epoch": 1.0944848225737496,
"grad_norm": 4.3081488609313965,
"learning_rate": 1.006413301662708e-05,
"loss": 0.2887,
"step": 5120
},
{
"epoch": 1.095553655408294,
"grad_norm": 3.5742883682250977,
"learning_rate": 1.0052256532066509e-05,
"loss": 0.2565,
"step": 5125
},
{
"epoch": 1.0966224882428388,
"grad_norm": 4.272683620452881,
"learning_rate": 1.004038004750594e-05,
"loss": 0.2661,
"step": 5130
},
{
"epoch": 1.0976913210773835,
"grad_norm": 3.7064707279205322,
"learning_rate": 1.0028503562945368e-05,
"loss": 0.286,
"step": 5135
},
{
"epoch": 1.0987601539119283,
"grad_norm": 6.0004963874816895,
"learning_rate": 1.00166270783848e-05,
"loss": 0.3969,
"step": 5140
},
{
"epoch": 1.0998289867464728,
"grad_norm": 6.301701068878174,
"learning_rate": 1.0004750593824228e-05,
"loss": 0.2451,
"step": 5145
},
{
"epoch": 1.1008978195810175,
"grad_norm": 5.471083164215088,
"learning_rate": 9.992874109263658e-06,
"loss": 0.2472,
"step": 5150
},
{
"epoch": 1.1019666524155622,
"grad_norm": 6.456992149353027,
"learning_rate": 9.980997624703089e-06,
"loss": 0.286,
"step": 5155
},
{
"epoch": 1.103035485250107,
"grad_norm": 6.616683006286621,
"learning_rate": 9.969121140142518e-06,
"loss": 0.3109,
"step": 5160
},
{
"epoch": 1.1041043180846515,
"grad_norm": 5.748746871948242,
"learning_rate": 9.95724465558195e-06,
"loss": 0.441,
"step": 5165
},
{
"epoch": 1.1051731509191962,
"grad_norm": 4.254424571990967,
"learning_rate": 9.945368171021379e-06,
"loss": 0.3494,
"step": 5170
},
{
"epoch": 1.106241983753741,
"grad_norm": 6.022365093231201,
"learning_rate": 9.93349168646081e-06,
"loss": 0.345,
"step": 5175
},
{
"epoch": 1.1073108165882857,
"grad_norm": 3.26804518699646,
"learning_rate": 9.921615201900238e-06,
"loss": 0.2503,
"step": 5180
},
{
"epoch": 1.1083796494228302,
"grad_norm": 3.100945234298706,
"learning_rate": 9.909738717339669e-06,
"loss": 0.3922,
"step": 5185
},
{
"epoch": 1.109448482257375,
"grad_norm": 4.631006717681885,
"learning_rate": 9.897862232779099e-06,
"loss": 0.3429,
"step": 5190
},
{
"epoch": 1.1105173150919196,
"grad_norm": 4.623953819274902,
"learning_rate": 9.885985748218528e-06,
"loss": 0.3437,
"step": 5195
},
{
"epoch": 1.1115861479264644,
"grad_norm": 3.877652406692505,
"learning_rate": 9.874109263657959e-06,
"loss": 0.2751,
"step": 5200
},
{
"epoch": 1.1126549807610089,
"grad_norm": 4.4313225746154785,
"learning_rate": 9.862232779097387e-06,
"loss": 0.3634,
"step": 5205
},
{
"epoch": 1.1137238135955536,
"grad_norm": 5.426332473754883,
"learning_rate": 9.850356294536818e-06,
"loss": 0.2685,
"step": 5210
},
{
"epoch": 1.1147926464300983,
"grad_norm": 3.7707009315490723,
"learning_rate": 9.838479809976248e-06,
"loss": 0.2121,
"step": 5215
},
{
"epoch": 1.115861479264643,
"grad_norm": 3.5573911666870117,
"learning_rate": 9.826603325415677e-06,
"loss": 0.4083,
"step": 5220
},
{
"epoch": 1.1169303120991876,
"grad_norm": 3.2365455627441406,
"learning_rate": 9.814726840855108e-06,
"loss": 0.2497,
"step": 5225
},
{
"epoch": 1.1179991449337323,
"grad_norm": 3.604321241378784,
"learning_rate": 9.802850356294538e-06,
"loss": 0.3521,
"step": 5230
},
{
"epoch": 1.119067977768277,
"grad_norm": 4.779599666595459,
"learning_rate": 9.790973871733967e-06,
"loss": 0.3048,
"step": 5235
},
{
"epoch": 1.1201368106028218,
"grad_norm": 3.685837745666504,
"learning_rate": 9.779097387173398e-06,
"loss": 0.2622,
"step": 5240
},
{
"epoch": 1.1212056434373663,
"grad_norm": 4.687803268432617,
"learning_rate": 9.767220902612827e-06,
"loss": 0.2651,
"step": 5245
},
{
"epoch": 1.122274476271911,
"grad_norm": 3.861872911453247,
"learning_rate": 9.755344418052257e-06,
"loss": 0.2971,
"step": 5250
},
{
"epoch": 1.1233433091064557,
"grad_norm": 5.285345077514648,
"learning_rate": 9.743467933491688e-06,
"loss": 0.3186,
"step": 5255
},
{
"epoch": 1.1244121419410005,
"grad_norm": 4.946507930755615,
"learning_rate": 9.731591448931117e-06,
"loss": 0.3277,
"step": 5260
},
{
"epoch": 1.1254809747755452,
"grad_norm": 3.527979612350464,
"learning_rate": 9.719714964370547e-06,
"loss": 0.2521,
"step": 5265
},
{
"epoch": 1.1265498076100897,
"grad_norm": 4.42695426940918,
"learning_rate": 9.707838479809976e-06,
"loss": 0.3114,
"step": 5270
},
{
"epoch": 1.1276186404446344,
"grad_norm": 2.8614661693573,
"learning_rate": 9.695961995249407e-06,
"loss": 0.3582,
"step": 5275
},
{
"epoch": 1.1286874732791792,
"grad_norm": 4.813942909240723,
"learning_rate": 9.684085510688837e-06,
"loss": 0.3617,
"step": 5280
},
{
"epoch": 1.129756306113724,
"grad_norm": 4.115063667297363,
"learning_rate": 9.672209026128266e-06,
"loss": 0.328,
"step": 5285
},
{
"epoch": 1.1308251389482684,
"grad_norm": 2.9611902236938477,
"learning_rate": 9.660332541567697e-06,
"loss": 0.2627,
"step": 5290
},
{
"epoch": 1.1318939717828131,
"grad_norm": 4.242338180541992,
"learning_rate": 9.648456057007125e-06,
"loss": 0.3054,
"step": 5295
},
{
"epoch": 1.1329628046173579,
"grad_norm": 3.4355380535125732,
"learning_rate": 9.636579572446556e-06,
"loss": 0.5342,
"step": 5300
},
{
"epoch": 1.1340316374519026,
"grad_norm": 3.823155641555786,
"learning_rate": 9.624703087885987e-06,
"loss": 0.2956,
"step": 5305
},
{
"epoch": 1.1351004702864471,
"grad_norm": 3.815985679626465,
"learning_rate": 9.612826603325417e-06,
"loss": 0.3008,
"step": 5310
},
{
"epoch": 1.1361693031209918,
"grad_norm": 6.562064170837402,
"learning_rate": 9.600950118764848e-06,
"loss": 0.4844,
"step": 5315
},
{
"epoch": 1.1372381359555366,
"grad_norm": 4.454050540924072,
"learning_rate": 9.589073634204276e-06,
"loss": 0.2941,
"step": 5320
},
{
"epoch": 1.1383069687900813,
"grad_norm": 4.194582462310791,
"learning_rate": 9.577197149643707e-06,
"loss": 0.3928,
"step": 5325
},
{
"epoch": 1.1393758016246258,
"grad_norm": 5.349386215209961,
"learning_rate": 9.565320665083136e-06,
"loss": 0.2804,
"step": 5330
},
{
"epoch": 1.1404446344591705,
"grad_norm": 4.539825916290283,
"learning_rate": 9.553444180522566e-06,
"loss": 0.2605,
"step": 5335
},
{
"epoch": 1.1415134672937153,
"grad_norm": 4.817893028259277,
"learning_rate": 9.541567695961997e-06,
"loss": 0.2871,
"step": 5340
},
{
"epoch": 1.14258230012826,
"grad_norm": 3.5281782150268555,
"learning_rate": 9.529691211401426e-06,
"loss": 0.2106,
"step": 5345
},
{
"epoch": 1.1436511329628045,
"grad_norm": 5.409219264984131,
"learning_rate": 9.517814726840856e-06,
"loss": 0.3053,
"step": 5350
},
{
"epoch": 1.1447199657973492,
"grad_norm": 4.795240879058838,
"learning_rate": 9.505938242280285e-06,
"loss": 0.3719,
"step": 5355
},
{
"epoch": 1.145788798631894,
"grad_norm": 6.551200866699219,
"learning_rate": 9.494061757719716e-06,
"loss": 0.396,
"step": 5360
},
{
"epoch": 1.1468576314664387,
"grad_norm": 4.7790703773498535,
"learning_rate": 9.482185273159146e-06,
"loss": 0.4105,
"step": 5365
},
{
"epoch": 1.1479264643009834,
"grad_norm": 5.062493801116943,
"learning_rate": 9.470308788598575e-06,
"loss": 0.3882,
"step": 5370
},
{
"epoch": 1.148995297135528,
"grad_norm": 4.342947006225586,
"learning_rate": 9.458432304038006e-06,
"loss": 0.2815,
"step": 5375
},
{
"epoch": 1.1500641299700727,
"grad_norm": 4.391014099121094,
"learning_rate": 9.446555819477435e-06,
"loss": 0.2477,
"step": 5380
},
{
"epoch": 1.1511329628046174,
"grad_norm": 3.2322447299957275,
"learning_rate": 9.434679334916865e-06,
"loss": 0.2798,
"step": 5385
},
{
"epoch": 1.152201795639162,
"grad_norm": 3.8520939350128174,
"learning_rate": 9.422802850356296e-06,
"loss": 0.2758,
"step": 5390
},
{
"epoch": 1.1532706284737066,
"grad_norm": 3.970700740814209,
"learning_rate": 9.410926365795725e-06,
"loss": 0.2938,
"step": 5395
},
{
"epoch": 1.1543394613082514,
"grad_norm": 4.378193378448486,
"learning_rate": 9.399049881235155e-06,
"loss": 0.3946,
"step": 5400
},
{
"epoch": 1.155408294142796,
"grad_norm": 3.779149293899536,
"learning_rate": 9.387173396674586e-06,
"loss": 0.2348,
"step": 5405
},
{
"epoch": 1.1564771269773408,
"grad_norm": 3.6311495304107666,
"learning_rate": 9.375296912114015e-06,
"loss": 0.2701,
"step": 5410
},
{
"epoch": 1.1575459598118853,
"grad_norm": 4.026009559631348,
"learning_rate": 9.363420427553445e-06,
"loss": 0.3154,
"step": 5415
},
{
"epoch": 1.15861479264643,
"grad_norm": 3.796111583709717,
"learning_rate": 9.351543942992874e-06,
"loss": 0.2617,
"step": 5420
},
{
"epoch": 1.1596836254809748,
"grad_norm": 4.301056861877441,
"learning_rate": 9.339667458432304e-06,
"loss": 0.3529,
"step": 5425
},
{
"epoch": 1.1607524583155195,
"grad_norm": 4.392220973968506,
"learning_rate": 9.327790973871735e-06,
"loss": 0.2858,
"step": 5430
},
{
"epoch": 1.161821291150064,
"grad_norm": 3.698474168777466,
"learning_rate": 9.315914489311164e-06,
"loss": 0.42,
"step": 5435
},
{
"epoch": 1.1628901239846088,
"grad_norm": 4.409991264343262,
"learning_rate": 9.304038004750594e-06,
"loss": 0.3894,
"step": 5440
},
{
"epoch": 1.1639589568191535,
"grad_norm": 2.799488067626953,
"learning_rate": 9.292161520190025e-06,
"loss": 0.3073,
"step": 5445
},
{
"epoch": 1.1650277896536982,
"grad_norm": 3.6285009384155273,
"learning_rate": 9.280285035629456e-06,
"loss": 0.2824,
"step": 5450
},
{
"epoch": 1.1660966224882428,
"grad_norm": 4.096553802490234,
"learning_rate": 9.268408551068884e-06,
"loss": 0.3139,
"step": 5455
},
{
"epoch": 1.1671654553227875,
"grad_norm": 6.436227798461914,
"learning_rate": 9.256532066508315e-06,
"loss": 0.4651,
"step": 5460
},
{
"epoch": 1.1682342881573322,
"grad_norm": 4.163245677947998,
"learning_rate": 9.244655581947744e-06,
"loss": 0.2611,
"step": 5465
},
{
"epoch": 1.169303120991877,
"grad_norm": 4.249100208282471,
"learning_rate": 9.232779097387174e-06,
"loss": 0.2663,
"step": 5470
},
{
"epoch": 1.1703719538264215,
"grad_norm": 4.877579212188721,
"learning_rate": 9.220902612826605e-06,
"loss": 0.252,
"step": 5475
},
{
"epoch": 1.1714407866609662,
"grad_norm": 4.387494087219238,
"learning_rate": 9.209026128266034e-06,
"loss": 0.4996,
"step": 5480
},
{
"epoch": 1.172509619495511,
"grad_norm": 3.6732518672943115,
"learning_rate": 9.197149643705464e-06,
"loss": 0.2786,
"step": 5485
},
{
"epoch": 1.1735784523300556,
"grad_norm": 4.684414386749268,
"learning_rate": 9.185273159144895e-06,
"loss": 0.2628,
"step": 5490
},
{
"epoch": 1.1746472851646002,
"grad_norm": 5.551144599914551,
"learning_rate": 9.173396674584324e-06,
"loss": 0.3326,
"step": 5495
},
{
"epoch": 1.1757161179991449,
"grad_norm": 3.942741632461548,
"learning_rate": 9.161520190023754e-06,
"loss": 0.3294,
"step": 5500
},
{
"epoch": 1.1767849508336896,
"grad_norm": 4.97520637512207,
"learning_rate": 9.149643705463183e-06,
"loss": 0.341,
"step": 5505
},
{
"epoch": 1.1778537836682343,
"grad_norm": 4.264441967010498,
"learning_rate": 9.137767220902614e-06,
"loss": 0.2878,
"step": 5510
},
{
"epoch": 1.178922616502779,
"grad_norm": 5.5287299156188965,
"learning_rate": 9.125890736342044e-06,
"loss": 0.354,
"step": 5515
},
{
"epoch": 1.1799914493373236,
"grad_norm": 2.997340679168701,
"learning_rate": 9.114014251781473e-06,
"loss": 0.2667,
"step": 5520
},
{
"epoch": 1.1810602821718683,
"grad_norm": 4.381051540374756,
"learning_rate": 9.102137767220904e-06,
"loss": 0.2932,
"step": 5525
},
{
"epoch": 1.182129115006413,
"grad_norm": 3.4648494720458984,
"learning_rate": 9.090261282660332e-06,
"loss": 0.2608,
"step": 5530
},
{
"epoch": 1.1831979478409576,
"grad_norm": 4.567250728607178,
"learning_rate": 9.078384798099763e-06,
"loss": 0.3078,
"step": 5535
},
{
"epoch": 1.1842667806755023,
"grad_norm": 4.373274326324463,
"learning_rate": 9.066508313539194e-06,
"loss": 0.4028,
"step": 5540
},
{
"epoch": 1.185335613510047,
"grad_norm": 4.338989734649658,
"learning_rate": 9.054631828978622e-06,
"loss": 0.3429,
"step": 5545
},
{
"epoch": 1.1864044463445917,
"grad_norm": 4.9778008460998535,
"learning_rate": 9.042755344418053e-06,
"loss": 0.3481,
"step": 5550
},
{
"epoch": 1.1874732791791365,
"grad_norm": 4.068686008453369,
"learning_rate": 9.030878859857482e-06,
"loss": 0.2931,
"step": 5555
},
{
"epoch": 1.188542112013681,
"grad_norm": 3.909130096435547,
"learning_rate": 9.019002375296912e-06,
"loss": 0.2719,
"step": 5560
},
{
"epoch": 1.1896109448482257,
"grad_norm": 4.785898208618164,
"learning_rate": 9.007125890736343e-06,
"loss": 0.3968,
"step": 5565
},
{
"epoch": 1.1906797776827704,
"grad_norm": 5.576188087463379,
"learning_rate": 8.995249406175772e-06,
"loss": 0.4653,
"step": 5570
},
{
"epoch": 1.1917486105173152,
"grad_norm": 3.010072946548462,
"learning_rate": 8.983372921615202e-06,
"loss": 0.2727,
"step": 5575
},
{
"epoch": 1.1928174433518597,
"grad_norm": 4.709297180175781,
"learning_rate": 8.971496437054633e-06,
"loss": 0.4521,
"step": 5580
},
{
"epoch": 1.1938862761864044,
"grad_norm": 5.573824405670166,
"learning_rate": 8.959619952494063e-06,
"loss": 0.3042,
"step": 5585
},
{
"epoch": 1.1949551090209491,
"grad_norm": 4.321738243103027,
"learning_rate": 8.947743467933492e-06,
"loss": 0.2687,
"step": 5590
},
{
"epoch": 1.1960239418554939,
"grad_norm": 5.602605819702148,
"learning_rate": 8.935866983372923e-06,
"loss": 0.2892,
"step": 5595
},
{
"epoch": 1.1970927746900384,
"grad_norm": 3.6464884281158447,
"learning_rate": 8.923990498812353e-06,
"loss": 0.2515,
"step": 5600
},
{
"epoch": 1.1981616075245831,
"grad_norm": 3.9809868335723877,
"learning_rate": 8.912114014251782e-06,
"loss": 0.3022,
"step": 5605
},
{
"epoch": 1.1992304403591278,
"grad_norm": 4.83494758605957,
"learning_rate": 8.900237529691213e-06,
"loss": 0.4018,
"step": 5610
},
{
"epoch": 1.2002992731936726,
"grad_norm": 3.961460590362549,
"learning_rate": 8.888361045130642e-06,
"loss": 0.2532,
"step": 5615
},
{
"epoch": 1.2013681060282173,
"grad_norm": 2.4498984813690186,
"learning_rate": 8.876484560570072e-06,
"loss": 0.2285,
"step": 5620
},
{
"epoch": 1.2024369388627618,
"grad_norm": 3.654311418533325,
"learning_rate": 8.864608076009503e-06,
"loss": 0.3307,
"step": 5625
},
{
"epoch": 1.2035057716973065,
"grad_norm": 4.238831996917725,
"learning_rate": 8.852731591448932e-06,
"loss": 0.2353,
"step": 5630
},
{
"epoch": 1.2045746045318513,
"grad_norm": 3.811962842941284,
"learning_rate": 8.840855106888362e-06,
"loss": 0.3137,
"step": 5635
},
{
"epoch": 1.2056434373663958,
"grad_norm": 3.8361501693725586,
"learning_rate": 8.828978622327791e-06,
"loss": 0.2549,
"step": 5640
},
{
"epoch": 1.2067122702009405,
"grad_norm": 4.136886119842529,
"learning_rate": 8.817102137767222e-06,
"loss": 0.2901,
"step": 5645
},
{
"epoch": 1.2077811030354852,
"grad_norm": 4.573363304138184,
"learning_rate": 8.805225653206652e-06,
"loss": 0.4423,
"step": 5650
},
{
"epoch": 1.20884993587003,
"grad_norm": 4.777524948120117,
"learning_rate": 8.793349168646081e-06,
"loss": 0.2959,
"step": 5655
},
{
"epoch": 1.2099187687045747,
"grad_norm": 4.250500679016113,
"learning_rate": 8.781472684085511e-06,
"loss": 0.2523,
"step": 5660
},
{
"epoch": 1.2109876015391192,
"grad_norm": 4.024094581604004,
"learning_rate": 8.769596199524942e-06,
"loss": 0.1746,
"step": 5665
},
{
"epoch": 1.212056434373664,
"grad_norm": 4.290604591369629,
"learning_rate": 8.757719714964371e-06,
"loss": 0.3541,
"step": 5670
},
{
"epoch": 1.2131252672082087,
"grad_norm": 3.597705125808716,
"learning_rate": 8.745843230403801e-06,
"loss": 0.2801,
"step": 5675
},
{
"epoch": 1.2141941000427534,
"grad_norm": 5.059614181518555,
"learning_rate": 8.73396674584323e-06,
"loss": 0.2846,
"step": 5680
},
{
"epoch": 1.215262932877298,
"grad_norm": 3.8920083045959473,
"learning_rate": 8.722090261282661e-06,
"loss": 0.3503,
"step": 5685
},
{
"epoch": 1.2163317657118426,
"grad_norm": 4.512190818786621,
"learning_rate": 8.710213776722091e-06,
"loss": 0.2831,
"step": 5690
},
{
"epoch": 1.2174005985463874,
"grad_norm": 4.729888916015625,
"learning_rate": 8.69833729216152e-06,
"loss": 0.2708,
"step": 5695
},
{
"epoch": 1.218469431380932,
"grad_norm": 4.533064365386963,
"learning_rate": 8.68646080760095e-06,
"loss": 0.2846,
"step": 5700
},
{
"epoch": 1.2195382642154766,
"grad_norm": 3.9406075477600098,
"learning_rate": 8.67458432304038e-06,
"loss": 0.3136,
"step": 5705
},
{
"epoch": 1.2206070970500214,
"grad_norm": 6.5291924476623535,
"learning_rate": 8.66270783847981e-06,
"loss": 0.3377,
"step": 5710
},
{
"epoch": 1.221675929884566,
"grad_norm": 4.291172981262207,
"learning_rate": 8.65083135391924e-06,
"loss": 0.4648,
"step": 5715
},
{
"epoch": 1.2227447627191108,
"grad_norm": 5.999503135681152,
"learning_rate": 8.63895486935867e-06,
"loss": 0.371,
"step": 5720
},
{
"epoch": 1.2238135955536553,
"grad_norm": 3.673821449279785,
"learning_rate": 8.6270783847981e-06,
"loss": 0.3419,
"step": 5725
},
{
"epoch": 1.2248824283882,
"grad_norm": 4.6455607414245605,
"learning_rate": 8.61520190023753e-06,
"loss": 0.3034,
"step": 5730
},
{
"epoch": 1.2259512612227448,
"grad_norm": 4.375533103942871,
"learning_rate": 8.603325415676961e-06,
"loss": 0.2912,
"step": 5735
},
{
"epoch": 1.2270200940572895,
"grad_norm": 4.1931376457214355,
"learning_rate": 8.59144893111639e-06,
"loss": 0.3251,
"step": 5740
},
{
"epoch": 1.228088926891834,
"grad_norm": 7.451878547668457,
"learning_rate": 8.57957244655582e-06,
"loss": 0.3989,
"step": 5745
},
{
"epoch": 1.2291577597263788,
"grad_norm": 5.163100242614746,
"learning_rate": 8.567695961995251e-06,
"loss": 0.3193,
"step": 5750
},
{
"epoch": 1.2302265925609235,
"grad_norm": 6.099165439605713,
"learning_rate": 8.55581947743468e-06,
"loss": 0.3586,
"step": 5755
},
{
"epoch": 1.2312954253954682,
"grad_norm": 3.8234498500823975,
"learning_rate": 8.54394299287411e-06,
"loss": 0.2832,
"step": 5760
},
{
"epoch": 1.232364258230013,
"grad_norm": 4.173794269561768,
"learning_rate": 8.53206650831354e-06,
"loss": 0.389,
"step": 5765
},
{
"epoch": 1.2334330910645575,
"grad_norm": 4.987196922302246,
"learning_rate": 8.52019002375297e-06,
"loss": 0.3889,
"step": 5770
},
{
"epoch": 1.2345019238991022,
"grad_norm": 3.354900360107422,
"learning_rate": 8.5083135391924e-06,
"loss": 0.2243,
"step": 5775
},
{
"epoch": 1.235570756733647,
"grad_norm": 4.882574558258057,
"learning_rate": 8.49643705463183e-06,
"loss": 0.2416,
"step": 5780
},
{
"epoch": 1.2366395895681914,
"grad_norm": 4.3282790184021,
"learning_rate": 8.48456057007126e-06,
"loss": 0.3066,
"step": 5785
},
{
"epoch": 1.2377084224027362,
"grad_norm": 5.309357166290283,
"learning_rate": 8.472684085510689e-06,
"loss": 0.3227,
"step": 5790
},
{
"epoch": 1.2387772552372809,
"grad_norm": 3.708139181137085,
"learning_rate": 8.46080760095012e-06,
"loss": 0.3194,
"step": 5795
},
{
"epoch": 1.2398460880718256,
"grad_norm": 5.823927879333496,
"learning_rate": 8.44893111638955e-06,
"loss": 0.3851,
"step": 5800
},
{
"epoch": 1.2409149209063703,
"grad_norm": 5.825521945953369,
"learning_rate": 8.437054631828979e-06,
"loss": 0.2793,
"step": 5805
},
{
"epoch": 1.2419837537409149,
"grad_norm": 4.350478172302246,
"learning_rate": 8.42517814726841e-06,
"loss": 0.2482,
"step": 5810
},
{
"epoch": 1.2430525865754596,
"grad_norm": 4.824470043182373,
"learning_rate": 8.413301662707838e-06,
"loss": 0.3311,
"step": 5815
},
{
"epoch": 1.2441214194100043,
"grad_norm": 4.695113182067871,
"learning_rate": 8.401425178147269e-06,
"loss": 0.2456,
"step": 5820
},
{
"epoch": 1.245190252244549,
"grad_norm": 5.539307594299316,
"learning_rate": 8.3895486935867e-06,
"loss": 0.3267,
"step": 5825
},
{
"epoch": 1.2462590850790936,
"grad_norm": 4.055349826812744,
"learning_rate": 8.377672209026128e-06,
"loss": 0.2496,
"step": 5830
},
{
"epoch": 1.2473279179136383,
"grad_norm": 4.012608051300049,
"learning_rate": 8.365795724465559e-06,
"loss": 0.2896,
"step": 5835
},
{
"epoch": 1.248396750748183,
"grad_norm": 4.369838714599609,
"learning_rate": 8.35391923990499e-06,
"loss": 0.2717,
"step": 5840
},
{
"epoch": 1.2494655835827277,
"grad_norm": 7.311318874359131,
"learning_rate": 8.342042755344418e-06,
"loss": 0.344,
"step": 5845
},
{
"epoch": 1.2505344164172723,
"grad_norm": 3.8691282272338867,
"learning_rate": 8.330166270783849e-06,
"loss": 0.1997,
"step": 5850
},
{
"epoch": 1.251603249251817,
"grad_norm": 4.140939712524414,
"learning_rate": 8.318289786223278e-06,
"loss": 0.2454,
"step": 5855
},
{
"epoch": 1.2526720820863617,
"grad_norm": 4.034096717834473,
"learning_rate": 8.306413301662708e-06,
"loss": 0.2923,
"step": 5860
},
{
"epoch": 1.2537409149209064,
"grad_norm": 4.175270080566406,
"learning_rate": 8.294536817102139e-06,
"loss": 0.268,
"step": 5865
},
{
"epoch": 1.2548097477554512,
"grad_norm": 5.182862758636475,
"learning_rate": 8.28266033254157e-06,
"loss": 0.2469,
"step": 5870
},
{
"epoch": 1.2558785805899957,
"grad_norm": 3.4455058574676514,
"learning_rate": 8.270783847980998e-06,
"loss": 0.2488,
"step": 5875
},
{
"epoch": 1.2569474134245404,
"grad_norm": 3.5229389667510986,
"learning_rate": 8.258907363420429e-06,
"loss": 0.2809,
"step": 5880
},
{
"epoch": 1.2580162462590851,
"grad_norm": 5.2068071365356445,
"learning_rate": 8.247030878859859e-06,
"loss": 0.2967,
"step": 5885
},
{
"epoch": 1.2590850790936297,
"grad_norm": 5.500560283660889,
"learning_rate": 8.235154394299288e-06,
"loss": 0.3366,
"step": 5890
},
{
"epoch": 1.2601539119281744,
"grad_norm": 3.9053938388824463,
"learning_rate": 8.223277909738719e-06,
"loss": 0.2968,
"step": 5895
},
{
"epoch": 1.2612227447627191,
"grad_norm": 3.7163820266723633,
"learning_rate": 8.211401425178147e-06,
"loss": 0.2,
"step": 5900
},
{
"epoch": 1.2622915775972638,
"grad_norm": 4.347673416137695,
"learning_rate": 8.199524940617578e-06,
"loss": 0.2761,
"step": 5905
},
{
"epoch": 1.2633604104318086,
"grad_norm": 3.297481060028076,
"learning_rate": 8.187648456057008e-06,
"loss": 0.2177,
"step": 5910
},
{
"epoch": 1.264429243266353,
"grad_norm": 5.587257385253906,
"learning_rate": 8.175771971496437e-06,
"loss": 0.2721,
"step": 5915
},
{
"epoch": 1.2654980761008978,
"grad_norm": 3.562802791595459,
"learning_rate": 8.163895486935868e-06,
"loss": 0.2592,
"step": 5920
},
{
"epoch": 1.2665669089354425,
"grad_norm": 5.265760898590088,
"learning_rate": 8.152019002375298e-06,
"loss": 0.453,
"step": 5925
},
{
"epoch": 1.267635741769987,
"grad_norm": 4.091883182525635,
"learning_rate": 8.140142517814727e-06,
"loss": 0.1952,
"step": 5930
},
{
"epoch": 1.2687045746045318,
"grad_norm": 4.552518844604492,
"learning_rate": 8.128266033254158e-06,
"loss": 0.3269,
"step": 5935
},
{
"epoch": 1.2697734074390765,
"grad_norm": 4.755618572235107,
"learning_rate": 8.116389548693587e-06,
"loss": 0.2776,
"step": 5940
},
{
"epoch": 1.2708422402736212,
"grad_norm": 4.392646312713623,
"learning_rate": 8.104513064133017e-06,
"loss": 0.2627,
"step": 5945
},
{
"epoch": 1.271911073108166,
"grad_norm": 2.6964704990386963,
"learning_rate": 8.092636579572448e-06,
"loss": 0.2524,
"step": 5950
},
{
"epoch": 1.2729799059427105,
"grad_norm": 3.914213180541992,
"learning_rate": 8.080760095011877e-06,
"loss": 0.2713,
"step": 5955
},
{
"epoch": 1.2740487387772552,
"grad_norm": 3.009427785873413,
"learning_rate": 8.068883610451307e-06,
"loss": 0.2618,
"step": 5960
},
{
"epoch": 1.2751175716118,
"grad_norm": 4.362711429595947,
"learning_rate": 8.057007125890736e-06,
"loss": 0.2735,
"step": 5965
},
{
"epoch": 1.2761864044463445,
"grad_norm": 5.038128852844238,
"learning_rate": 8.045130641330167e-06,
"loss": 0.327,
"step": 5970
},
{
"epoch": 1.2772552372808892,
"grad_norm": 5.867886543273926,
"learning_rate": 8.033254156769597e-06,
"loss": 0.3294,
"step": 5975
},
{
"epoch": 1.278324070115434,
"grad_norm": 3.8101038932800293,
"learning_rate": 8.021377672209026e-06,
"loss": 0.3299,
"step": 5980
},
{
"epoch": 1.2793929029499786,
"grad_norm": 4.082939624786377,
"learning_rate": 8.009501187648457e-06,
"loss": 0.2328,
"step": 5985
},
{
"epoch": 1.2804617357845234,
"grad_norm": 5.352798938751221,
"learning_rate": 7.997624703087885e-06,
"loss": 0.2554,
"step": 5990
},
{
"epoch": 1.281530568619068,
"grad_norm": 2.7532148361206055,
"learning_rate": 7.985748218527316e-06,
"loss": 0.3285,
"step": 5995
},
{
"epoch": 1.2825994014536126,
"grad_norm": 4.2501349449157715,
"learning_rate": 7.973871733966747e-06,
"loss": 0.2884,
"step": 6000
},
{
"epoch": 1.2836682342881574,
"grad_norm": 3.0817322731018066,
"learning_rate": 7.961995249406177e-06,
"loss": 0.3201,
"step": 6005
},
{
"epoch": 1.284737067122702,
"grad_norm": 4.214169502258301,
"learning_rate": 7.950118764845608e-06,
"loss": 0.3529,
"step": 6010
},
{
"epoch": 1.2858058999572468,
"grad_norm": 4.896885871887207,
"learning_rate": 7.938242280285036e-06,
"loss": 0.3113,
"step": 6015
},
{
"epoch": 1.2868747327917913,
"grad_norm": 4.869765758514404,
"learning_rate": 7.926365795724467e-06,
"loss": 0.386,
"step": 6020
},
{
"epoch": 1.287943565626336,
"grad_norm": 4.720851421356201,
"learning_rate": 7.914489311163896e-06,
"loss": 0.2453,
"step": 6025
},
{
"epoch": 1.2890123984608808,
"grad_norm": 4.764908790588379,
"learning_rate": 7.902612826603326e-06,
"loss": 0.3845,
"step": 6030
},
{
"epoch": 1.2900812312954253,
"grad_norm": 4.5335845947265625,
"learning_rate": 7.890736342042757e-06,
"loss": 0.3311,
"step": 6035
},
{
"epoch": 1.29115006412997,
"grad_norm": 5.650118350982666,
"learning_rate": 7.878859857482186e-06,
"loss": 0.3556,
"step": 6040
},
{
"epoch": 1.2922188969645148,
"grad_norm": 4.7145209312438965,
"learning_rate": 7.866983372921616e-06,
"loss": 0.2491,
"step": 6045
},
{
"epoch": 1.2932877297990595,
"grad_norm": 5.045220851898193,
"learning_rate": 7.855106888361045e-06,
"loss": 0.3478,
"step": 6050
},
{
"epoch": 1.2943565626336042,
"grad_norm": 3.746929407119751,
"learning_rate": 7.843230403800476e-06,
"loss": 0.175,
"step": 6055
},
{
"epoch": 1.2954253954681487,
"grad_norm": 3.4932451248168945,
"learning_rate": 7.831353919239906e-06,
"loss": 0.2475,
"step": 6060
},
{
"epoch": 1.2964942283026935,
"grad_norm": 4.507287502288818,
"learning_rate": 7.819477434679335e-06,
"loss": 0.2793,
"step": 6065
},
{
"epoch": 1.2975630611372382,
"grad_norm": 3.872846841812134,
"learning_rate": 7.807600950118766e-06,
"loss": 0.3745,
"step": 6070
},
{
"epoch": 1.2986318939717827,
"grad_norm": 3.80639910697937,
"learning_rate": 7.795724465558195e-06,
"loss": 0.2619,
"step": 6075
},
{
"epoch": 1.2997007268063274,
"grad_norm": 4.278339862823486,
"learning_rate": 7.783847980997625e-06,
"loss": 0.2882,
"step": 6080
},
{
"epoch": 1.3007695596408722,
"grad_norm": 3.2503674030303955,
"learning_rate": 7.771971496437056e-06,
"loss": 0.2651,
"step": 6085
},
{
"epoch": 1.3018383924754169,
"grad_norm": 3.709991216659546,
"learning_rate": 7.760095011876485e-06,
"loss": 0.3257,
"step": 6090
},
{
"epoch": 1.3029072253099616,
"grad_norm": 4.797738075256348,
"learning_rate": 7.748218527315915e-06,
"loss": 0.2626,
"step": 6095
},
{
"epoch": 1.3039760581445061,
"grad_norm": 3.289095163345337,
"learning_rate": 7.736342042755346e-06,
"loss": 0.3391,
"step": 6100
},
{
"epoch": 1.3050448909790509,
"grad_norm": 5.237732410430908,
"learning_rate": 7.724465558194774e-06,
"loss": 0.2868,
"step": 6105
},
{
"epoch": 1.3061137238135956,
"grad_norm": 3.3352086544036865,
"learning_rate": 7.712589073634205e-06,
"loss": 0.2358,
"step": 6110
},
{
"epoch": 1.30718255664814,
"grad_norm": 4.8291168212890625,
"learning_rate": 7.700712589073634e-06,
"loss": 0.3505,
"step": 6115
},
{
"epoch": 1.308251389482685,
"grad_norm": 6.421624183654785,
"learning_rate": 7.688836104513064e-06,
"loss": 0.3643,
"step": 6120
},
{
"epoch": 1.3093202223172296,
"grad_norm": 2.7074790000915527,
"learning_rate": 7.676959619952495e-06,
"loss": 0.2729,
"step": 6125
},
{
"epoch": 1.3103890551517743,
"grad_norm": 4.26420783996582,
"learning_rate": 7.665083135391924e-06,
"loss": 0.2363,
"step": 6130
},
{
"epoch": 1.311457887986319,
"grad_norm": 6.1749773025512695,
"learning_rate": 7.653206650831354e-06,
"loss": 0.3173,
"step": 6135
},
{
"epoch": 1.3125267208208635,
"grad_norm": 3.3525917530059814,
"learning_rate": 7.641330166270783e-06,
"loss": 0.2797,
"step": 6140
},
{
"epoch": 1.3135955536554083,
"grad_norm": 3.1302783489227295,
"learning_rate": 7.629453681710216e-06,
"loss": 0.3054,
"step": 6145
},
{
"epoch": 1.314664386489953,
"grad_norm": 3.0552220344543457,
"learning_rate": 7.617577197149645e-06,
"loss": 0.2596,
"step": 6150
},
{
"epoch": 1.3157332193244977,
"grad_norm": 5.424324035644531,
"learning_rate": 7.605700712589075e-06,
"loss": 0.3871,
"step": 6155
},
{
"epoch": 1.3168020521590424,
"grad_norm": 4.735466480255127,
"learning_rate": 7.593824228028505e-06,
"loss": 0.2798,
"step": 6160
},
{
"epoch": 1.317870884993587,
"grad_norm": 5.158178806304932,
"learning_rate": 7.581947743467934e-06,
"loss": 0.2532,
"step": 6165
},
{
"epoch": 1.3189397178281317,
"grad_norm": 5.720581531524658,
"learning_rate": 7.570071258907364e-06,
"loss": 0.2649,
"step": 6170
},
{
"epoch": 1.3200085506626764,
"grad_norm": 4.740435600280762,
"learning_rate": 7.5581947743467946e-06,
"loss": 0.3457,
"step": 6175
},
{
"epoch": 1.321077383497221,
"grad_norm": 4.528372287750244,
"learning_rate": 7.546318289786224e-06,
"loss": 0.4062,
"step": 6180
},
{
"epoch": 1.3221462163317657,
"grad_norm": 5.7430243492126465,
"learning_rate": 7.534441805225654e-06,
"loss": 0.4318,
"step": 6185
},
{
"epoch": 1.3232150491663104,
"grad_norm": 3.7349984645843506,
"learning_rate": 7.522565320665084e-06,
"loss": 0.2305,
"step": 6190
},
{
"epoch": 1.3242838820008551,
"grad_norm": 3.384366273880005,
"learning_rate": 7.510688836104514e-06,
"loss": 0.218,
"step": 6195
},
{
"epoch": 1.3253527148353998,
"grad_norm": 4.311688423156738,
"learning_rate": 7.498812351543944e-06,
"loss": 0.2738,
"step": 6200
},
{
"epoch": 1.3264215476699444,
"grad_norm": 3.9737985134124756,
"learning_rate": 7.486935866983374e-06,
"loss": 0.3043,
"step": 6205
},
{
"epoch": 1.327490380504489,
"grad_norm": 3.2927355766296387,
"learning_rate": 7.475059382422803e-06,
"loss": 0.2055,
"step": 6210
},
{
"epoch": 1.3285592133390338,
"grad_norm": 4.364592552185059,
"learning_rate": 7.463182897862233e-06,
"loss": 0.2528,
"step": 6215
},
{
"epoch": 1.3296280461735783,
"grad_norm": 4.896527290344238,
"learning_rate": 7.451306413301664e-06,
"loss": 0.3514,
"step": 6220
},
{
"epoch": 1.330696879008123,
"grad_norm": 3.7543258666992188,
"learning_rate": 7.439429928741093e-06,
"loss": 0.3199,
"step": 6225
},
{
"epoch": 1.3317657118426678,
"grad_norm": 4.389688491821289,
"learning_rate": 7.427553444180523e-06,
"loss": 0.2453,
"step": 6230
},
{
"epoch": 1.3328345446772125,
"grad_norm": 5.297595500946045,
"learning_rate": 7.415676959619953e-06,
"loss": 0.3237,
"step": 6235
},
{
"epoch": 1.3339033775117572,
"grad_norm": 4.290585041046143,
"learning_rate": 7.403800475059383e-06,
"loss": 0.3409,
"step": 6240
},
{
"epoch": 1.3349722103463018,
"grad_norm": 3.8684494495391846,
"learning_rate": 7.391923990498813e-06,
"loss": 0.268,
"step": 6245
},
{
"epoch": 1.3360410431808465,
"grad_norm": 7.344365119934082,
"learning_rate": 7.380047505938243e-06,
"loss": 0.4072,
"step": 6250
},
{
"epoch": 1.3371098760153912,
"grad_norm": 4.403175354003906,
"learning_rate": 7.368171021377672e-06,
"loss": 0.3601,
"step": 6255
},
{
"epoch": 1.338178708849936,
"grad_norm": 4.6706414222717285,
"learning_rate": 7.356294536817102e-06,
"loss": 0.3997,
"step": 6260
},
{
"epoch": 1.3392475416844807,
"grad_norm": 3.4723129272460938,
"learning_rate": 7.344418052256533e-06,
"loss": 0.2062,
"step": 6265
},
{
"epoch": 1.3403163745190252,
"grad_norm": 3.8669190406799316,
"learning_rate": 7.332541567695962e-06,
"loss": 0.2703,
"step": 6270
},
{
"epoch": 1.34138520735357,
"grad_norm": 4.620151519775391,
"learning_rate": 7.320665083135392e-06,
"loss": 0.2498,
"step": 6275
},
{
"epoch": 1.3424540401881147,
"grad_norm": 4.765347480773926,
"learning_rate": 7.308788598574822e-06,
"loss": 0.3418,
"step": 6280
},
{
"epoch": 1.3435228730226592,
"grad_norm": 3.9806559085845947,
"learning_rate": 7.296912114014253e-06,
"loss": 0.2046,
"step": 6285
},
{
"epoch": 1.344591705857204,
"grad_norm": 6.489411354064941,
"learning_rate": 7.285035629453683e-06,
"loss": 0.3267,
"step": 6290
},
{
"epoch": 1.3456605386917486,
"grad_norm": 4.385682582855225,
"learning_rate": 7.2731591448931125e-06,
"loss": 0.2756,
"step": 6295
},
{
"epoch": 1.3467293715262934,
"grad_norm": 5.30741548538208,
"learning_rate": 7.261282660332542e-06,
"loss": 0.2808,
"step": 6300
},
{
"epoch": 1.347798204360838,
"grad_norm": 3.52230167388916,
"learning_rate": 7.249406175771973e-06,
"loss": 0.3037,
"step": 6305
},
{
"epoch": 1.3488670371953826,
"grad_norm": 3.3302509784698486,
"learning_rate": 7.2375296912114025e-06,
"loss": 0.3837,
"step": 6310
},
{
"epoch": 1.3499358700299273,
"grad_norm": 4.349034309387207,
"learning_rate": 7.225653206650832e-06,
"loss": 0.2496,
"step": 6315
},
{
"epoch": 1.351004702864472,
"grad_norm": 3.651261329650879,
"learning_rate": 7.213776722090262e-06,
"loss": 0.3131,
"step": 6320
},
{
"epoch": 1.3520735356990166,
"grad_norm": 4.3042144775390625,
"learning_rate": 7.201900237529692e-06,
"loss": 0.3038,
"step": 6325
},
{
"epoch": 1.3531423685335613,
"grad_norm": 4.746523380279541,
"learning_rate": 7.190023752969122e-06,
"loss": 0.2872,
"step": 6330
},
{
"epoch": 1.354211201368106,
"grad_norm": 3.058163642883301,
"learning_rate": 7.178147268408552e-06,
"loss": 0.3548,
"step": 6335
},
{
"epoch": 1.3552800342026508,
"grad_norm": 4.4561309814453125,
"learning_rate": 7.1662707838479815e-06,
"loss": 0.1994,
"step": 6340
},
{
"epoch": 1.3563488670371955,
"grad_norm": 3.580275535583496,
"learning_rate": 7.154394299287411e-06,
"loss": 0.2383,
"step": 6345
},
{
"epoch": 1.35741769987174,
"grad_norm": 4.0294671058654785,
"learning_rate": 7.142517814726842e-06,
"loss": 0.3412,
"step": 6350
},
{
"epoch": 1.3584865327062847,
"grad_norm": 4.032179355621338,
"learning_rate": 7.1306413301662715e-06,
"loss": 0.2392,
"step": 6355
},
{
"epoch": 1.3595553655408295,
"grad_norm": 3.6529910564422607,
"learning_rate": 7.118764845605701e-06,
"loss": 0.2946,
"step": 6360
},
{
"epoch": 1.360624198375374,
"grad_norm": 5.5632781982421875,
"learning_rate": 7.106888361045131e-06,
"loss": 0.3075,
"step": 6365
},
{
"epoch": 1.3616930312099187,
"grad_norm": 4.6378865242004395,
"learning_rate": 7.0950118764845614e-06,
"loss": 0.2695,
"step": 6370
},
{
"epoch": 1.3627618640444634,
"grad_norm": 4.01276969909668,
"learning_rate": 7.083135391923991e-06,
"loss": 0.2289,
"step": 6375
},
{
"epoch": 1.3638306968790082,
"grad_norm": 2.731029748916626,
"learning_rate": 7.071258907363421e-06,
"loss": 0.339,
"step": 6380
},
{
"epoch": 1.3648995297135529,
"grad_norm": 6.142641544342041,
"learning_rate": 7.0593824228028505e-06,
"loss": 0.3048,
"step": 6385
},
{
"epoch": 1.3659683625480974,
"grad_norm": 4.8854289054870605,
"learning_rate": 7.04750593824228e-06,
"loss": 0.3437,
"step": 6390
},
{
"epoch": 1.3670371953826421,
"grad_norm": 4.592909336090088,
"learning_rate": 7.035629453681711e-06,
"loss": 0.2587,
"step": 6395
},
{
"epoch": 1.3681060282171869,
"grad_norm": 4.572000026702881,
"learning_rate": 7.0237529691211405e-06,
"loss": 0.3156,
"step": 6400
},
{
"epoch": 1.3691748610517316,
"grad_norm": 6.196121692657471,
"learning_rate": 7.01187648456057e-06,
"loss": 0.2827,
"step": 6405
},
{
"epoch": 1.3702436938862763,
"grad_norm": 3.967109441757202,
"learning_rate": 7e-06,
"loss": 0.2337,
"step": 6410
},
{
"epoch": 1.3713125267208208,
"grad_norm": 3.1756539344787598,
"learning_rate": 6.98812351543943e-06,
"loss": 0.265,
"step": 6415
},
{
"epoch": 1.3723813595553656,
"grad_norm": 3.4292986392974854,
"learning_rate": 6.97624703087886e-06,
"loss": 0.2822,
"step": 6420
},
{
"epoch": 1.3734501923899103,
"grad_norm": 4.521055698394775,
"learning_rate": 6.964370546318291e-06,
"loss": 0.2697,
"step": 6425
},
{
"epoch": 1.3745190252244548,
"grad_norm": 3.9092273712158203,
"learning_rate": 6.95249406175772e-06,
"loss": 0.2136,
"step": 6430
},
{
"epoch": 1.3755878580589995,
"grad_norm": 3.5216240882873535,
"learning_rate": 6.940617577197151e-06,
"loss": 0.2549,
"step": 6435
},
{
"epoch": 1.3766566908935443,
"grad_norm": 5.987946510314941,
"learning_rate": 6.928741092636581e-06,
"loss": 0.2631,
"step": 6440
},
{
"epoch": 1.377725523728089,
"grad_norm": 5.098079681396484,
"learning_rate": 6.91686460807601e-06,
"loss": 0.324,
"step": 6445
},
{
"epoch": 1.3787943565626337,
"grad_norm": 4.314655303955078,
"learning_rate": 6.90498812351544e-06,
"loss": 0.3722,
"step": 6450
},
{
"epoch": 1.3798631893971782,
"grad_norm": 5.151162147521973,
"learning_rate": 6.893111638954871e-06,
"loss": 0.453,
"step": 6455
},
{
"epoch": 1.380932022231723,
"grad_norm": 4.187003135681152,
"learning_rate": 6.8812351543943e-06,
"loss": 0.2798,
"step": 6460
},
{
"epoch": 1.3820008550662677,
"grad_norm": 5.253510475158691,
"learning_rate": 6.86935866983373e-06,
"loss": 0.2605,
"step": 6465
},
{
"epoch": 1.3830696879008122,
"grad_norm": 2.9405324459075928,
"learning_rate": 6.85748218527316e-06,
"loss": 0.3834,
"step": 6470
},
{
"epoch": 1.384138520735357,
"grad_norm": 3.8434178829193115,
"learning_rate": 6.845605700712589e-06,
"loss": 0.2683,
"step": 6475
},
{
"epoch": 1.3852073535699017,
"grad_norm": 4.633339881896973,
"learning_rate": 6.83372921615202e-06,
"loss": 0.243,
"step": 6480
},
{
"epoch": 1.3862761864044464,
"grad_norm": 4.103108882904053,
"learning_rate": 6.82185273159145e-06,
"loss": 0.3287,
"step": 6485
},
{
"epoch": 1.3873450192389911,
"grad_norm": 4.187243938446045,
"learning_rate": 6.809976247030879e-06,
"loss": 0.2754,
"step": 6490
},
{
"epoch": 1.3884138520735356,
"grad_norm": 5.196486949920654,
"learning_rate": 6.798099762470309e-06,
"loss": 0.3701,
"step": 6495
},
{
"epoch": 1.3894826849080804,
"grad_norm": 4.622681140899658,
"learning_rate": 6.78622327790974e-06,
"loss": 0.2762,
"step": 6500
},
{
"epoch": 1.390551517742625,
"grad_norm": 2.859978675842285,
"learning_rate": 6.774346793349169e-06,
"loss": 0.266,
"step": 6505
},
{
"epoch": 1.3916203505771696,
"grad_norm": 5.8184332847595215,
"learning_rate": 6.762470308788599e-06,
"loss": 0.2958,
"step": 6510
},
{
"epoch": 1.3926891834117143,
"grad_norm": 3.787079334259033,
"learning_rate": 6.750593824228029e-06,
"loss": 0.2754,
"step": 6515
},
{
"epoch": 1.393758016246259,
"grad_norm": 4.132429599761963,
"learning_rate": 6.7387173396674584e-06,
"loss": 0.3634,
"step": 6520
},
{
"epoch": 1.3948268490808038,
"grad_norm": 5.011837005615234,
"learning_rate": 6.726840855106889e-06,
"loss": 0.2974,
"step": 6525
},
{
"epoch": 1.3958956819153485,
"grad_norm": 5.0287957191467285,
"learning_rate": 6.714964370546319e-06,
"loss": 0.3158,
"step": 6530
},
{
"epoch": 1.396964514749893,
"grad_norm": 3.846284866333008,
"learning_rate": 6.703087885985748e-06,
"loss": 0.2718,
"step": 6535
},
{
"epoch": 1.3980333475844378,
"grad_norm": 5.715949058532715,
"learning_rate": 6.691211401425178e-06,
"loss": 0.2586,
"step": 6540
},
{
"epoch": 1.3991021804189825,
"grad_norm": 2.0641372203826904,
"learning_rate": 6.679334916864609e-06,
"loss": 0.2071,
"step": 6545
},
{
"epoch": 1.4001710132535272,
"grad_norm": 3.989108085632324,
"learning_rate": 6.667458432304038e-06,
"loss": 0.3308,
"step": 6550
},
{
"epoch": 1.401239846088072,
"grad_norm": 5.488873481750488,
"learning_rate": 6.655581947743468e-06,
"loss": 0.2517,
"step": 6555
},
{
"epoch": 1.4023086789226165,
"grad_norm": 4.91823673248291,
"learning_rate": 6.643705463182898e-06,
"loss": 0.234,
"step": 6560
},
{
"epoch": 1.4033775117571612,
"grad_norm": 5.4402289390563965,
"learning_rate": 6.631828978622329e-06,
"loss": 0.3025,
"step": 6565
},
{
"epoch": 1.404446344591706,
"grad_norm": 5.417737007141113,
"learning_rate": 6.619952494061759e-06,
"loss": 0.2619,
"step": 6570
},
{
"epoch": 1.4055151774262504,
"grad_norm": 3.603675127029419,
"learning_rate": 6.6080760095011885e-06,
"loss": 0.2521,
"step": 6575
},
{
"epoch": 1.4065840102607952,
"grad_norm": 4.426266670227051,
"learning_rate": 6.596199524940618e-06,
"loss": 0.3113,
"step": 6580
},
{
"epoch": 1.40765284309534,
"grad_norm": 4.535027980804443,
"learning_rate": 6.584323040380049e-06,
"loss": 0.3537,
"step": 6585
},
{
"epoch": 1.4087216759298846,
"grad_norm": 3.585488796234131,
"learning_rate": 6.5724465558194785e-06,
"loss": 0.2386,
"step": 6590
},
{
"epoch": 1.4097905087644294,
"grad_norm": 5.358974456787109,
"learning_rate": 6.560570071258908e-06,
"loss": 0.3081,
"step": 6595
},
{
"epoch": 1.4108593415989739,
"grad_norm": 3.859417200088501,
"learning_rate": 6.548693586698338e-06,
"loss": 0.189,
"step": 6600
},
{
"epoch": 1.4119281744335186,
"grad_norm": 3.350184679031372,
"learning_rate": 6.536817102137768e-06,
"loss": 0.2643,
"step": 6605
},
{
"epoch": 1.4129970072680633,
"grad_norm": 3.4859519004821777,
"learning_rate": 6.524940617577198e-06,
"loss": 0.2206,
"step": 6610
},
{
"epoch": 1.4140658401026078,
"grad_norm": 6.238532543182373,
"learning_rate": 6.513064133016628e-06,
"loss": 0.3687,
"step": 6615
},
{
"epoch": 1.4151346729371526,
"grad_norm": 6.955577850341797,
"learning_rate": 6.5011876484560576e-06,
"loss": 0.3626,
"step": 6620
},
{
"epoch": 1.4162035057716973,
"grad_norm": 4.4574995040893555,
"learning_rate": 6.489311163895487e-06,
"loss": 0.2711,
"step": 6625
},
{
"epoch": 1.417272338606242,
"grad_norm": 4.533407211303711,
"learning_rate": 6.477434679334918e-06,
"loss": 0.3304,
"step": 6630
},
{
"epoch": 1.4183411714407868,
"grad_norm": 2.947624921798706,
"learning_rate": 6.4655581947743475e-06,
"loss": 0.321,
"step": 6635
},
{
"epoch": 1.4194100042753313,
"grad_norm": 4.557621955871582,
"learning_rate": 6.453681710213777e-06,
"loss": 0.319,
"step": 6640
},
{
"epoch": 1.420478837109876,
"grad_norm": 4.511264324188232,
"learning_rate": 6.441805225653207e-06,
"loss": 0.2363,
"step": 6645
},
{
"epoch": 1.4215476699444207,
"grad_norm": 4.200313568115234,
"learning_rate": 6.429928741092637e-06,
"loss": 0.2319,
"step": 6650
},
{
"epoch": 1.4226165027789655,
"grad_norm": 7.376286506652832,
"learning_rate": 6.418052256532067e-06,
"loss": 0.3526,
"step": 6655
},
{
"epoch": 1.4236853356135102,
"grad_norm": 4.415379047393799,
"learning_rate": 6.406175771971497e-06,
"loss": 0.4186,
"step": 6660
},
{
"epoch": 1.4247541684480547,
"grad_norm": 4.578277587890625,
"learning_rate": 6.394299287410927e-06,
"loss": 0.2846,
"step": 6665
},
{
"epoch": 1.4258230012825994,
"grad_norm": 4.811502456665039,
"learning_rate": 6.382422802850356e-06,
"loss": 0.3077,
"step": 6670
},
{
"epoch": 1.4268918341171442,
"grad_norm": 3.3036532402038574,
"learning_rate": 6.370546318289787e-06,
"loss": 0.3709,
"step": 6675
},
{
"epoch": 1.4279606669516887,
"grad_norm": 4.229010105133057,
"learning_rate": 6.3586698337292165e-06,
"loss": 0.3438,
"step": 6680
},
{
"epoch": 1.4290294997862334,
"grad_norm": 7.352675914764404,
"learning_rate": 6.346793349168646e-06,
"loss": 0.4159,
"step": 6685
},
{
"epoch": 1.4300983326207781,
"grad_norm": 3.935654878616333,
"learning_rate": 6.334916864608076e-06,
"loss": 0.3511,
"step": 6690
},
{
"epoch": 1.4311671654553229,
"grad_norm": 4.271127700805664,
"learning_rate": 6.323040380047506e-06,
"loss": 0.3061,
"step": 6695
},
{
"epoch": 1.4322359982898676,
"grad_norm": 4.57000207901001,
"learning_rate": 6.311163895486936e-06,
"loss": 0.2694,
"step": 6700
},
{
"epoch": 1.433304831124412,
"grad_norm": 4.243838787078857,
"learning_rate": 6.299287410926367e-06,
"loss": 0.3412,
"step": 6705
},
{
"epoch": 1.4343736639589568,
"grad_norm": 4.534287929534912,
"learning_rate": 6.2874109263657964e-06,
"loss": 0.2631,
"step": 6710
},
{
"epoch": 1.4354424967935016,
"grad_norm": 4.457566261291504,
"learning_rate": 6.275534441805227e-06,
"loss": 0.3879,
"step": 6715
},
{
"epoch": 1.436511329628046,
"grad_norm": 3.7211356163024902,
"learning_rate": 6.263657957244657e-06,
"loss": 0.2942,
"step": 6720
},
{
"epoch": 1.4375801624625908,
"grad_norm": 6.8076300621032715,
"learning_rate": 6.251781472684086e-06,
"loss": 0.3901,
"step": 6725
},
{
"epoch": 1.4386489952971355,
"grad_norm": 6.238668441772461,
"learning_rate": 6.239904988123516e-06,
"loss": 0.3192,
"step": 6730
},
{
"epoch": 1.4397178281316803,
"grad_norm": 4.374307155609131,
"learning_rate": 6.228028503562946e-06,
"loss": 0.2269,
"step": 6735
},
{
"epoch": 1.440786660966225,
"grad_norm": 5.202229976654053,
"learning_rate": 6.216152019002376e-06,
"loss": 0.372,
"step": 6740
},
{
"epoch": 1.4418554938007695,
"grad_norm": 4.483334064483643,
"learning_rate": 6.204275534441806e-06,
"loss": 0.2467,
"step": 6745
},
{
"epoch": 1.4429243266353142,
"grad_norm": 3.3366737365722656,
"learning_rate": 6.192399049881236e-06,
"loss": 0.2313,
"step": 6750
},
{
"epoch": 1.443993159469859,
"grad_norm": 6.443348407745361,
"learning_rate": 6.1805225653206655e-06,
"loss": 0.3538,
"step": 6755
},
{
"epoch": 1.4450619923044035,
"grad_norm": 3.4701974391937256,
"learning_rate": 6.168646080760096e-06,
"loss": 0.252,
"step": 6760
},
{
"epoch": 1.4461308251389482,
"grad_norm": 3.572749137878418,
"learning_rate": 6.156769596199526e-06,
"loss": 0.3049,
"step": 6765
},
{
"epoch": 1.447199657973493,
"grad_norm": 4.363938808441162,
"learning_rate": 6.144893111638955e-06,
"loss": 0.2796,
"step": 6770
},
{
"epoch": 1.4482684908080377,
"grad_norm": 3.493666172027588,
"learning_rate": 6.133016627078385e-06,
"loss": 0.2128,
"step": 6775
},
{
"epoch": 1.4493373236425824,
"grad_norm": 4.754271507263184,
"learning_rate": 6.121140142517815e-06,
"loss": 0.3407,
"step": 6780
},
{
"epoch": 1.450406156477127,
"grad_norm": 4.948278903961182,
"learning_rate": 6.109263657957245e-06,
"loss": 0.2196,
"step": 6785
},
{
"epoch": 1.4514749893116716,
"grad_norm": 4.344764709472656,
"learning_rate": 6.097387173396675e-06,
"loss": 0.2472,
"step": 6790
},
{
"epoch": 1.4525438221462164,
"grad_norm": 4.455203056335449,
"learning_rate": 6.085510688836105e-06,
"loss": 0.2788,
"step": 6795
},
{
"epoch": 1.453612654980761,
"grad_norm": 5.69878625869751,
"learning_rate": 6.0736342042755345e-06,
"loss": 0.305,
"step": 6800
},
{
"epoch": 1.4546814878153058,
"grad_norm": 4.746001243591309,
"learning_rate": 6.061757719714965e-06,
"loss": 0.3072,
"step": 6805
},
{
"epoch": 1.4557503206498503,
"grad_norm": 3.463618755340576,
"learning_rate": 6.049881235154395e-06,
"loss": 0.2879,
"step": 6810
},
{
"epoch": 1.456819153484395,
"grad_norm": 3.4969255924224854,
"learning_rate": 6.0380047505938244e-06,
"loss": 0.4406,
"step": 6815
},
{
"epoch": 1.4578879863189398,
"grad_norm": 3.6291632652282715,
"learning_rate": 6.026128266033254e-06,
"loss": 0.3371,
"step": 6820
},
{
"epoch": 1.4589568191534843,
"grad_norm": 4.0304765701293945,
"learning_rate": 6.014251781472684e-06,
"loss": 0.2775,
"step": 6825
},
{
"epoch": 1.460025651988029,
"grad_norm": 3.6861469745635986,
"learning_rate": 6.002375296912114e-06,
"loss": 0.2872,
"step": 6830
},
{
"epoch": 1.4610944848225738,
"grad_norm": 4.720432758331299,
"learning_rate": 5.990498812351544e-06,
"loss": 0.3056,
"step": 6835
},
{
"epoch": 1.4621633176571185,
"grad_norm": 3.8419721126556396,
"learning_rate": 5.978622327790974e-06,
"loss": 0.3183,
"step": 6840
},
{
"epoch": 1.4632321504916632,
"grad_norm": 4.320315361022949,
"learning_rate": 5.9667458432304035e-06,
"loss": 0.2801,
"step": 6845
},
{
"epoch": 1.4643009833262077,
"grad_norm": 4.07327127456665,
"learning_rate": 5.954869358669835e-06,
"loss": 0.2641,
"step": 6850
},
{
"epoch": 1.4653698161607525,
"grad_norm": 5.109342098236084,
"learning_rate": 5.942992874109265e-06,
"loss": 0.2903,
"step": 6855
},
{
"epoch": 1.4664386489952972,
"grad_norm": 5.147985458374023,
"learning_rate": 5.931116389548694e-06,
"loss": 0.4097,
"step": 6860
},
{
"epoch": 1.4675074818298417,
"grad_norm": 5.812030792236328,
"learning_rate": 5.919239904988124e-06,
"loss": 0.2133,
"step": 6865
},
{
"epoch": 1.4685763146643864,
"grad_norm": 4.3751220703125,
"learning_rate": 5.9073634204275545e-06,
"loss": 0.3064,
"step": 6870
},
{
"epoch": 1.4696451474989312,
"grad_norm": 3.8219094276428223,
"learning_rate": 5.895486935866984e-06,
"loss": 0.2629,
"step": 6875
},
{
"epoch": 1.470713980333476,
"grad_norm": 3.550219774246216,
"learning_rate": 5.883610451306414e-06,
"loss": 0.1846,
"step": 6880
},
{
"epoch": 1.4717828131680206,
"grad_norm": 4.344959259033203,
"learning_rate": 5.871733966745844e-06,
"loss": 0.2552,
"step": 6885
},
{
"epoch": 1.4728516460025651,
"grad_norm": 3.7821099758148193,
"learning_rate": 5.859857482185274e-06,
"loss": 0.2812,
"step": 6890
},
{
"epoch": 1.4739204788371099,
"grad_norm": 5.074913501739502,
"learning_rate": 5.847980997624704e-06,
"loss": 0.2796,
"step": 6895
},
{
"epoch": 1.4749893116716546,
"grad_norm": 5.702268600463867,
"learning_rate": 5.836104513064134e-06,
"loss": 0.3157,
"step": 6900
},
{
"epoch": 1.476058144506199,
"grad_norm": 4.769154071807861,
"learning_rate": 5.824228028503563e-06,
"loss": 0.271,
"step": 6905
},
{
"epoch": 1.4771269773407438,
"grad_norm": 3.915893077850342,
"learning_rate": 5.812351543942993e-06,
"loss": 0.2352,
"step": 6910
},
{
"epoch": 1.4781958101752886,
"grad_norm": 5.49572229385376,
"learning_rate": 5.8004750593824236e-06,
"loss": 0.3752,
"step": 6915
},
{
"epoch": 1.4792646430098333,
"grad_norm": 5.197114944458008,
"learning_rate": 5.788598574821853e-06,
"loss": 0.2811,
"step": 6920
},
{
"epoch": 1.480333475844378,
"grad_norm": 4.672935485839844,
"learning_rate": 5.776722090261283e-06,
"loss": 0.3303,
"step": 6925
},
{
"epoch": 1.4814023086789225,
"grad_norm": 3.5662314891815186,
"learning_rate": 5.764845605700713e-06,
"loss": 0.3382,
"step": 6930
},
{
"epoch": 1.4824711415134673,
"grad_norm": 3.7478342056274414,
"learning_rate": 5.752969121140143e-06,
"loss": 0.2235,
"step": 6935
},
{
"epoch": 1.483539974348012,
"grad_norm": 5.836414813995361,
"learning_rate": 5.741092636579573e-06,
"loss": 0.2446,
"step": 6940
},
{
"epoch": 1.4846088071825567,
"grad_norm": 4.945041179656982,
"learning_rate": 5.729216152019003e-06,
"loss": 0.2745,
"step": 6945
},
{
"epoch": 1.4856776400171015,
"grad_norm": 4.556496620178223,
"learning_rate": 5.717339667458432e-06,
"loss": 0.3061,
"step": 6950
},
{
"epoch": 1.486746472851646,
"grad_norm": 5.837685585021973,
"learning_rate": 5.705463182897862e-06,
"loss": 0.3059,
"step": 6955
},
{
"epoch": 1.4878153056861907,
"grad_norm": 3.4342663288116455,
"learning_rate": 5.6935866983372926e-06,
"loss": 0.2692,
"step": 6960
},
{
"epoch": 1.4888841385207354,
"grad_norm": 4.30683708190918,
"learning_rate": 5.681710213776722e-06,
"loss": 0.2853,
"step": 6965
},
{
"epoch": 1.48995297135528,
"grad_norm": 3.7401227951049805,
"learning_rate": 5.669833729216152e-06,
"loss": 0.238,
"step": 6970
},
{
"epoch": 1.4910218041898247,
"grad_norm": 3.991908311843872,
"learning_rate": 5.657957244655582e-06,
"loss": 0.3086,
"step": 6975
},
{
"epoch": 1.4920906370243694,
"grad_norm": 5.546383857727051,
"learning_rate": 5.646080760095012e-06,
"loss": 0.3016,
"step": 6980
},
{
"epoch": 1.4931594698589141,
"grad_norm": 4.429809093475342,
"learning_rate": 5.634204275534442e-06,
"loss": 0.3272,
"step": 6985
},
{
"epoch": 1.4942283026934589,
"grad_norm": 4.91778564453125,
"learning_rate": 5.6223277909738725e-06,
"loss": 0.3125,
"step": 6990
},
{
"epoch": 1.4952971355280034,
"grad_norm": 5.806905269622803,
"learning_rate": 5.610451306413302e-06,
"loss": 0.2777,
"step": 6995
},
{
"epoch": 1.496365968362548,
"grad_norm": 5.0485711097717285,
"learning_rate": 5.598574821852733e-06,
"loss": 0.3026,
"step": 7000
},
{
"epoch": 1.4974348011970928,
"grad_norm": 4.642349720001221,
"learning_rate": 5.5866983372921624e-06,
"loss": 0.2522,
"step": 7005
},
{
"epoch": 1.4985036340316373,
"grad_norm": 3.192457437515259,
"learning_rate": 5.574821852731592e-06,
"loss": 0.2487,
"step": 7010
},
{
"epoch": 1.499572466866182,
"grad_norm": 4.002120494842529,
"learning_rate": 5.562945368171022e-06,
"loss": 0.2316,
"step": 7015
},
{
"epoch": 1.5006412997007268,
"grad_norm": 4.840696334838867,
"learning_rate": 5.551068883610452e-06,
"loss": 0.2484,
"step": 7020
},
{
"epoch": 1.5017101325352715,
"grad_norm": 4.7393927574157715,
"learning_rate": 5.539192399049882e-06,
"loss": 0.2852,
"step": 7025
},
{
"epoch": 1.5027789653698163,
"grad_norm": 4.964815616607666,
"learning_rate": 5.527315914489312e-06,
"loss": 0.2944,
"step": 7030
},
{
"epoch": 1.5038477982043608,
"grad_norm": 4.7306342124938965,
"learning_rate": 5.5154394299287415e-06,
"loss": 0.3133,
"step": 7035
},
{
"epoch": 1.5049166310389055,
"grad_norm": 5.262001991271973,
"learning_rate": 5.503562945368171e-06,
"loss": 0.2557,
"step": 7040
},
{
"epoch": 1.5059854638734502,
"grad_norm": 4.136565685272217,
"learning_rate": 5.491686460807602e-06,
"loss": 0.232,
"step": 7045
},
{
"epoch": 1.5070542967079947,
"grad_norm": 3.917520046234131,
"learning_rate": 5.4798099762470315e-06,
"loss": 0.2635,
"step": 7050
},
{
"epoch": 1.5081231295425397,
"grad_norm": 5.6809210777282715,
"learning_rate": 5.467933491686461e-06,
"loss": 0.3033,
"step": 7055
},
{
"epoch": 1.5091919623770842,
"grad_norm": 3.7200369834899902,
"learning_rate": 5.456057007125891e-06,
"loss": 0.2477,
"step": 7060
},
{
"epoch": 1.510260795211629,
"grad_norm": 4.6949543952941895,
"learning_rate": 5.444180522565321e-06,
"loss": 0.2443,
"step": 7065
},
{
"epoch": 1.5113296280461737,
"grad_norm": 4.025641918182373,
"learning_rate": 5.432304038004751e-06,
"loss": 0.4329,
"step": 7070
},
{
"epoch": 1.5123984608807182,
"grad_norm": 3.7725117206573486,
"learning_rate": 5.420427553444181e-06,
"loss": 0.2682,
"step": 7075
},
{
"epoch": 1.513467293715263,
"grad_norm": 4.11836051940918,
"learning_rate": 5.4085510688836105e-06,
"loss": 0.3149,
"step": 7080
},
{
"epoch": 1.5145361265498076,
"grad_norm": 4.033612251281738,
"learning_rate": 5.39667458432304e-06,
"loss": 0.353,
"step": 7085
},
{
"epoch": 1.5156049593843521,
"grad_norm": 5.4751482009887695,
"learning_rate": 5.384798099762471e-06,
"loss": 0.2247,
"step": 7090
},
{
"epoch": 1.516673792218897,
"grad_norm": 4.203334808349609,
"learning_rate": 5.3729216152019005e-06,
"loss": 0.2862,
"step": 7095
},
{
"epoch": 1.5177426250534416,
"grad_norm": 5.31473970413208,
"learning_rate": 5.36104513064133e-06,
"loss": 0.287,
"step": 7100
},
{
"epoch": 1.5188114578879863,
"grad_norm": 4.896878719329834,
"learning_rate": 5.34916864608076e-06,
"loss": 0.3141,
"step": 7105
},
{
"epoch": 1.519880290722531,
"grad_norm": 3.62528133392334,
"learning_rate": 5.33729216152019e-06,
"loss": 0.4446,
"step": 7110
},
{
"epoch": 1.5209491235570756,
"grad_norm": 5.231464385986328,
"learning_rate": 5.32541567695962e-06,
"loss": 0.2853,
"step": 7115
},
{
"epoch": 1.5220179563916203,
"grad_norm": 3.0587196350097656,
"learning_rate": 5.31353919239905e-06,
"loss": 0.2662,
"step": 7120
},
{
"epoch": 1.523086789226165,
"grad_norm": 5.080547332763672,
"learning_rate": 5.3016627078384795e-06,
"loss": 0.2729,
"step": 7125
},
{
"epoch": 1.5241556220607098,
"grad_norm": 3.547877073287964,
"learning_rate": 5.289786223277911e-06,
"loss": 0.2376,
"step": 7130
},
{
"epoch": 1.5252244548952545,
"grad_norm": 3.9913973808288574,
"learning_rate": 5.277909738717341e-06,
"loss": 0.2434,
"step": 7135
},
{
"epoch": 1.526293287729799,
"grad_norm": 3.9852547645568848,
"learning_rate": 5.26603325415677e-06,
"loss": 0.302,
"step": 7140
},
{
"epoch": 1.5273621205643437,
"grad_norm": 3.660104274749756,
"learning_rate": 5.2541567695962e-06,
"loss": 0.2346,
"step": 7145
},
{
"epoch": 1.5284309533988885,
"grad_norm": 4.887364387512207,
"learning_rate": 5.242280285035631e-06,
"loss": 0.4036,
"step": 7150
},
{
"epoch": 1.529499786233433,
"grad_norm": 5.766690254211426,
"learning_rate": 5.23040380047506e-06,
"loss": 0.2902,
"step": 7155
},
{
"epoch": 1.530568619067978,
"grad_norm": 5.018100738525391,
"learning_rate": 5.21852731591449e-06,
"loss": 0.4254,
"step": 7160
},
{
"epoch": 1.5316374519025224,
"grad_norm": 2.8769116401672363,
"learning_rate": 5.20665083135392e-06,
"loss": 0.2863,
"step": 7165
},
{
"epoch": 1.5327062847370672,
"grad_norm": 4.766345024108887,
"learning_rate": 5.194774346793349e-06,
"loss": 0.2618,
"step": 7170
},
{
"epoch": 1.533775117571612,
"grad_norm": 4.371603012084961,
"learning_rate": 5.18289786223278e-06,
"loss": 0.3614,
"step": 7175
},
{
"epoch": 1.5348439504061564,
"grad_norm": 3.7386531829833984,
"learning_rate": 5.17102137767221e-06,
"loss": 0.3084,
"step": 7180
},
{
"epoch": 1.5359127832407011,
"grad_norm": 3.2616264820098877,
"learning_rate": 5.159144893111639e-06,
"loss": 0.2799,
"step": 7185
},
{
"epoch": 1.5369816160752459,
"grad_norm": 4.840415000915527,
"learning_rate": 5.147268408551069e-06,
"loss": 0.2843,
"step": 7190
},
{
"epoch": 1.5380504489097904,
"grad_norm": 2.643326997756958,
"learning_rate": 5.1353919239905e-06,
"loss": 0.255,
"step": 7195
},
{
"epoch": 1.5391192817443353,
"grad_norm": 3.9539496898651123,
"learning_rate": 5.123515439429929e-06,
"loss": 0.2278,
"step": 7200
},
{
"epoch": 1.5401881145788798,
"grad_norm": 4.173327922821045,
"learning_rate": 5.111638954869359e-06,
"loss": 0.3137,
"step": 7205
},
{
"epoch": 1.5412569474134246,
"grad_norm": 4.327914237976074,
"learning_rate": 5.099762470308789e-06,
"loss": 0.3365,
"step": 7210
},
{
"epoch": 1.5423257802479693,
"grad_norm": 2.9048960208892822,
"learning_rate": 5.087885985748218e-06,
"loss": 0.1981,
"step": 7215
},
{
"epoch": 1.5433946130825138,
"grad_norm": 4.26038932800293,
"learning_rate": 5.076009501187649e-06,
"loss": 0.2338,
"step": 7220
},
{
"epoch": 1.5444634459170585,
"grad_norm": 5.362328052520752,
"learning_rate": 5.064133016627079e-06,
"loss": 0.2692,
"step": 7225
},
{
"epoch": 1.5455322787516033,
"grad_norm": 4.408464431762695,
"learning_rate": 5.052256532066508e-06,
"loss": 0.2129,
"step": 7230
},
{
"epoch": 1.5466011115861478,
"grad_norm": 5.237843990325928,
"learning_rate": 5.040380047505938e-06,
"loss": 0.2395,
"step": 7235
},
{
"epoch": 1.5476699444206927,
"grad_norm": 6.1017045974731445,
"learning_rate": 5.028503562945369e-06,
"loss": 0.485,
"step": 7240
},
{
"epoch": 1.5487387772552372,
"grad_norm": 5.8066582679748535,
"learning_rate": 5.016627078384798e-06,
"loss": 0.2166,
"step": 7245
},
{
"epoch": 1.549807610089782,
"grad_norm": 6.7323899269104,
"learning_rate": 5.004750593824228e-06,
"loss": 0.2799,
"step": 7250
},
{
"epoch": 1.5508764429243267,
"grad_norm": 4.477848052978516,
"learning_rate": 4.9928741092636586e-06,
"loss": 0.2856,
"step": 7255
},
{
"epoch": 1.5519452757588712,
"grad_norm": 3.282881498336792,
"learning_rate": 4.980997624703088e-06,
"loss": 0.272,
"step": 7260
},
{
"epoch": 1.5530141085934162,
"grad_norm": 4.757537364959717,
"learning_rate": 4.969121140142518e-06,
"loss": 0.299,
"step": 7265
},
{
"epoch": 1.5540829414279607,
"grad_norm": 6.090857028961182,
"learning_rate": 4.9572446555819485e-06,
"loss": 0.3309,
"step": 7270
},
{
"epoch": 1.5551517742625054,
"grad_norm": 3.326892137527466,
"learning_rate": 4.945368171021378e-06,
"loss": 0.223,
"step": 7275
},
{
"epoch": 1.5562206070970501,
"grad_norm": 3.5346665382385254,
"learning_rate": 4.933491686460808e-06,
"loss": 0.2351,
"step": 7280
},
{
"epoch": 1.5572894399315946,
"grad_norm": 3.1125802993774414,
"learning_rate": 4.921615201900238e-06,
"loss": 0.2177,
"step": 7285
},
{
"epoch": 1.5583582727661394,
"grad_norm": 3.7614200115203857,
"learning_rate": 4.909738717339667e-06,
"loss": 0.2606,
"step": 7290
},
{
"epoch": 1.559427105600684,
"grad_norm": 3.761014223098755,
"learning_rate": 4.897862232779098e-06,
"loss": 0.3972,
"step": 7295
},
{
"epoch": 1.5604959384352286,
"grad_norm": 3.6661438941955566,
"learning_rate": 4.885985748218528e-06,
"loss": 0.2594,
"step": 7300
},
{
"epoch": 1.5615647712697736,
"grad_norm": 4.455360412597656,
"learning_rate": 4.874109263657958e-06,
"loss": 0.2934,
"step": 7305
},
{
"epoch": 1.562633604104318,
"grad_norm": 4.19691801071167,
"learning_rate": 4.862232779097388e-06,
"loss": 0.4105,
"step": 7310
},
{
"epoch": 1.5637024369388628,
"grad_norm": 4.041048049926758,
"learning_rate": 4.8503562945368175e-06,
"loss": 0.1971,
"step": 7315
},
{
"epoch": 1.5647712697734075,
"grad_norm": 3.2611756324768066,
"learning_rate": 4.838479809976247e-06,
"loss": 0.2107,
"step": 7320
},
{
"epoch": 1.565840102607952,
"grad_norm": 3.419591188430786,
"learning_rate": 4.826603325415678e-06,
"loss": 0.2441,
"step": 7325
},
{
"epoch": 1.5669089354424968,
"grad_norm": 4.567037105560303,
"learning_rate": 4.8147268408551075e-06,
"loss": 0.2413,
"step": 7330
},
{
"epoch": 1.5679777682770415,
"grad_norm": 3.887484550476074,
"learning_rate": 4.802850356294537e-06,
"loss": 0.2619,
"step": 7335
},
{
"epoch": 1.569046601111586,
"grad_norm": 4.95120906829834,
"learning_rate": 4.790973871733967e-06,
"loss": 0.3098,
"step": 7340
},
{
"epoch": 1.570115433946131,
"grad_norm": 4.205053806304932,
"learning_rate": 4.779097387173397e-06,
"loss": 0.3024,
"step": 7345
},
{
"epoch": 1.5711842667806755,
"grad_norm": 6.198763847351074,
"learning_rate": 4.767220902612827e-06,
"loss": 0.2548,
"step": 7350
},
{
"epoch": 1.5722530996152202,
"grad_norm": 4.158599853515625,
"learning_rate": 4.755344418052257e-06,
"loss": 0.2925,
"step": 7355
},
{
"epoch": 1.573321932449765,
"grad_norm": 3.3105695247650146,
"learning_rate": 4.7434679334916866e-06,
"loss": 0.2245,
"step": 7360
},
{
"epoch": 1.5743907652843094,
"grad_norm": 2.852360963821411,
"learning_rate": 4.731591448931116e-06,
"loss": 0.2612,
"step": 7365
},
{
"epoch": 1.5754595981188542,
"grad_norm": 5.082930564880371,
"learning_rate": 4.719714964370547e-06,
"loss": 0.4075,
"step": 7370
},
{
"epoch": 1.576528430953399,
"grad_norm": 3.626047372817993,
"learning_rate": 4.7078384798099765e-06,
"loss": 0.2457,
"step": 7375
},
{
"epoch": 1.5775972637879434,
"grad_norm": 3.2513113021850586,
"learning_rate": 4.695961995249407e-06,
"loss": 0.214,
"step": 7380
},
{
"epoch": 1.5786660966224884,
"grad_norm": 4.396987438201904,
"learning_rate": 4.684085510688837e-06,
"loss": 0.2761,
"step": 7385
},
{
"epoch": 1.5797349294570329,
"grad_norm": 4.177000045776367,
"learning_rate": 4.6722090261282665e-06,
"loss": 0.278,
"step": 7390
},
{
"epoch": 1.5808037622915776,
"grad_norm": 6.472886562347412,
"learning_rate": 4.660332541567696e-06,
"loss": 0.4008,
"step": 7395
},
{
"epoch": 1.5818725951261223,
"grad_norm": 5.244050979614258,
"learning_rate": 4.648456057007127e-06,
"loss": 0.3222,
"step": 7400
},
{
"epoch": 1.5829414279606668,
"grad_norm": 3.3180673122406006,
"learning_rate": 4.636579572446556e-06,
"loss": 0.2645,
"step": 7405
},
{
"epoch": 1.5840102607952118,
"grad_norm": 4.317756652832031,
"learning_rate": 4.624703087885986e-06,
"loss": 0.2121,
"step": 7410
},
{
"epoch": 1.5850790936297563,
"grad_norm": 5.13472843170166,
"learning_rate": 4.612826603325416e-06,
"loss": 0.2679,
"step": 7415
},
{
"epoch": 1.586147926464301,
"grad_norm": 4.850220680236816,
"learning_rate": 4.6009501187648455e-06,
"loss": 0.342,
"step": 7420
},
{
"epoch": 1.5872167592988458,
"grad_norm": 3.7907469272613525,
"learning_rate": 4.589073634204276e-06,
"loss": 0.2312,
"step": 7425
},
{
"epoch": 1.5882855921333903,
"grad_norm": 5.306363582611084,
"learning_rate": 4.577197149643706e-06,
"loss": 0.3332,
"step": 7430
},
{
"epoch": 1.589354424967935,
"grad_norm": 4.227755069732666,
"learning_rate": 4.5653206650831355e-06,
"loss": 0.2628,
"step": 7435
},
{
"epoch": 1.5904232578024797,
"grad_norm": 4.175191879272461,
"learning_rate": 4.553444180522565e-06,
"loss": 0.2824,
"step": 7440
},
{
"epoch": 1.5914920906370242,
"grad_norm": 4.70232629776001,
"learning_rate": 4.541567695961996e-06,
"loss": 0.3249,
"step": 7445
},
{
"epoch": 1.5925609234715692,
"grad_norm": 5.078143119812012,
"learning_rate": 4.5296912114014254e-06,
"loss": 0.3738,
"step": 7450
},
{
"epoch": 1.5936297563061137,
"grad_norm": 3.0150363445281982,
"learning_rate": 4.517814726840856e-06,
"loss": 0.3357,
"step": 7455
},
{
"epoch": 1.5946985891406584,
"grad_norm": 6.010279655456543,
"learning_rate": 4.505938242280286e-06,
"loss": 0.2563,
"step": 7460
},
{
"epoch": 1.5957674219752032,
"grad_norm": 4.169801712036133,
"learning_rate": 4.494061757719715e-06,
"loss": 0.2091,
"step": 7465
},
{
"epoch": 1.5968362548097477,
"grad_norm": 5.483653545379639,
"learning_rate": 4.482185273159145e-06,
"loss": 0.2425,
"step": 7470
},
{
"epoch": 1.5979050876442924,
"grad_norm": 3.874551773071289,
"learning_rate": 4.470308788598575e-06,
"loss": 0.2699,
"step": 7475
},
{
"epoch": 1.5989739204788371,
"grad_norm": 5.686993598937988,
"learning_rate": 4.458432304038005e-06,
"loss": 0.3409,
"step": 7480
},
{
"epoch": 1.6000427533133816,
"grad_norm": 4.527751922607422,
"learning_rate": 4.446555819477435e-06,
"loss": 0.2233,
"step": 7485
},
{
"epoch": 1.6011115861479266,
"grad_norm": 4.663357257843018,
"learning_rate": 4.434679334916865e-06,
"loss": 0.3269,
"step": 7490
},
{
"epoch": 1.602180418982471,
"grad_norm": 5.009659767150879,
"learning_rate": 4.4228028503562945e-06,
"loss": 0.3029,
"step": 7495
},
{
"epoch": 1.6032492518170158,
"grad_norm": 3.9787962436676025,
"learning_rate": 4.410926365795725e-06,
"loss": 0.2547,
"step": 7500
},
{
"epoch": 1.6043180846515606,
"grad_norm": 5.281296253204346,
"learning_rate": 4.399049881235155e-06,
"loss": 0.2855,
"step": 7505
},
{
"epoch": 1.605386917486105,
"grad_norm": 6.091033935546875,
"learning_rate": 4.387173396674584e-06,
"loss": 0.3106,
"step": 7510
},
{
"epoch": 1.6064557503206498,
"grad_norm": 5.57248067855835,
"learning_rate": 4.375296912114015e-06,
"loss": 0.262,
"step": 7515
},
{
"epoch": 1.6075245831551945,
"grad_norm": 4.538100242614746,
"learning_rate": 4.363420427553445e-06,
"loss": 0.3016,
"step": 7520
},
{
"epoch": 1.608593415989739,
"grad_norm": 2.859865665435791,
"learning_rate": 4.351543942992874e-06,
"loss": 0.2852,
"step": 7525
},
{
"epoch": 1.609662248824284,
"grad_norm": 4.841543197631836,
"learning_rate": 4.339667458432305e-06,
"loss": 0.3126,
"step": 7530
},
{
"epoch": 1.6107310816588285,
"grad_norm": 4.134354114532471,
"learning_rate": 4.327790973871735e-06,
"loss": 0.2779,
"step": 7535
},
{
"epoch": 1.6117999144933732,
"grad_norm": 5.4539875984191895,
"learning_rate": 4.315914489311164e-06,
"loss": 0.2811,
"step": 7540
},
{
"epoch": 1.612868747327918,
"grad_norm": 4.018299579620361,
"learning_rate": 4.304038004750594e-06,
"loss": 0.259,
"step": 7545
},
{
"epoch": 1.6139375801624625,
"grad_norm": 3.978214740753174,
"learning_rate": 4.292161520190024e-06,
"loss": 0.2602,
"step": 7550
},
{
"epoch": 1.6150064129970074,
"grad_norm": 4.782619953155518,
"learning_rate": 4.280285035629454e-06,
"loss": 0.2481,
"step": 7555
},
{
"epoch": 1.616075245831552,
"grad_norm": 4.34796142578125,
"learning_rate": 4.268408551068884e-06,
"loss": 0.2022,
"step": 7560
},
{
"epoch": 1.6171440786660967,
"grad_norm": 4.58864688873291,
"learning_rate": 4.256532066508314e-06,
"loss": 0.2943,
"step": 7565
},
{
"epoch": 1.6182129115006414,
"grad_norm": 3.2588422298431396,
"learning_rate": 4.244655581947743e-06,
"loss": 0.2144,
"step": 7570
},
{
"epoch": 1.619281744335186,
"grad_norm": 4.609071731567383,
"learning_rate": 4.232779097387174e-06,
"loss": 0.2589,
"step": 7575
},
{
"epoch": 1.6203505771697306,
"grad_norm": 3.8828067779541016,
"learning_rate": 4.220902612826604e-06,
"loss": 0.1999,
"step": 7580
},
{
"epoch": 1.6214194100042754,
"grad_norm": 5.068613052368164,
"learning_rate": 4.209026128266034e-06,
"loss": 0.3035,
"step": 7585
},
{
"epoch": 1.6224882428388199,
"grad_norm": 3.4416937828063965,
"learning_rate": 4.197149643705464e-06,
"loss": 0.2322,
"step": 7590
},
{
"epoch": 1.6235570756733648,
"grad_norm": 4.246146202087402,
"learning_rate": 4.185273159144894e-06,
"loss": 0.237,
"step": 7595
},
{
"epoch": 1.6246259085079093,
"grad_norm": 4.175546646118164,
"learning_rate": 4.173396674584323e-06,
"loss": 0.2815,
"step": 7600
},
{
"epoch": 1.625694741342454,
"grad_norm": 5.142884254455566,
"learning_rate": 4.161520190023753e-06,
"loss": 0.4136,
"step": 7605
},
{
"epoch": 1.6267635741769988,
"grad_norm": 4.261429309844971,
"learning_rate": 4.1496437054631835e-06,
"loss": 0.2474,
"step": 7610
},
{
"epoch": 1.6278324070115433,
"grad_norm": 5.0894646644592285,
"learning_rate": 4.137767220902613e-06,
"loss": 0.3143,
"step": 7615
},
{
"epoch": 1.628901239846088,
"grad_norm": 4.596246242523193,
"learning_rate": 4.125890736342043e-06,
"loss": 0.244,
"step": 7620
},
{
"epoch": 1.6299700726806328,
"grad_norm": 4.05454158782959,
"learning_rate": 4.114014251781473e-06,
"loss": 0.3134,
"step": 7625
},
{
"epoch": 1.6310389055151773,
"grad_norm": 5.604685306549072,
"learning_rate": 4.102137767220903e-06,
"loss": 0.2516,
"step": 7630
},
{
"epoch": 1.6321077383497222,
"grad_norm": 2.5428969860076904,
"learning_rate": 4.090261282660333e-06,
"loss": 0.3159,
"step": 7635
},
{
"epoch": 1.6331765711842667,
"grad_norm": 3.228505849838257,
"learning_rate": 4.078384798099763e-06,
"loss": 0.2519,
"step": 7640
},
{
"epoch": 1.6342454040188115,
"grad_norm": 5.0502753257751465,
"learning_rate": 4.066508313539192e-06,
"loss": 0.2785,
"step": 7645
},
{
"epoch": 1.6353142368533562,
"grad_norm": 3.824427366256714,
"learning_rate": 4.054631828978622e-06,
"loss": 0.2627,
"step": 7650
},
{
"epoch": 1.6363830696879007,
"grad_norm": 4.460954666137695,
"learning_rate": 4.0427553444180526e-06,
"loss": 0.2677,
"step": 7655
},
{
"epoch": 1.6374519025224454,
"grad_norm": 3.3890676498413086,
"learning_rate": 4.030878859857483e-06,
"loss": 0.1962,
"step": 7660
},
{
"epoch": 1.6385207353569902,
"grad_norm": 4.556974411010742,
"learning_rate": 4.019002375296913e-06,
"loss": 0.3779,
"step": 7665
},
{
"epoch": 1.639589568191535,
"grad_norm": 3.9803950786590576,
"learning_rate": 4.0071258907363425e-06,
"loss": 0.2731,
"step": 7670
},
{
"epoch": 1.6406584010260796,
"grad_norm": 3.7230427265167236,
"learning_rate": 3.995249406175772e-06,
"loss": 0.2806,
"step": 7675
},
{
"epoch": 1.6417272338606241,
"grad_norm": 3.6325037479400635,
"learning_rate": 3.983372921615202e-06,
"loss": 0.2582,
"step": 7680
},
{
"epoch": 1.6427960666951689,
"grad_norm": 4.024942398071289,
"learning_rate": 3.9714964370546325e-06,
"loss": 0.2064,
"step": 7685
},
{
"epoch": 1.6438648995297136,
"grad_norm": 4.7745819091796875,
"learning_rate": 3.959619952494062e-06,
"loss": 0.263,
"step": 7690
},
{
"epoch": 1.644933732364258,
"grad_norm": 3.8132996559143066,
"learning_rate": 3.947743467933492e-06,
"loss": 0.3126,
"step": 7695
},
{
"epoch": 1.646002565198803,
"grad_norm": 3.711763620376587,
"learning_rate": 3.9358669833729216e-06,
"loss": 0.2889,
"step": 7700
},
{
"epoch": 1.6470713980333476,
"grad_norm": 3.696894645690918,
"learning_rate": 3.923990498812352e-06,
"loss": 0.2372,
"step": 7705
},
{
"epoch": 1.6481402308678923,
"grad_norm": 5.242607593536377,
"learning_rate": 3.912114014251782e-06,
"loss": 0.2062,
"step": 7710
},
{
"epoch": 1.649209063702437,
"grad_norm": 3.8635284900665283,
"learning_rate": 3.9002375296912115e-06,
"loss": 0.3085,
"step": 7715
},
{
"epoch": 1.6502778965369815,
"grad_norm": 4.494617938995361,
"learning_rate": 3.888361045130641e-06,
"loss": 0.22,
"step": 7720
},
{
"epoch": 1.6513467293715263,
"grad_norm": 5.683468818664551,
"learning_rate": 3.876484560570072e-06,
"loss": 0.258,
"step": 7725
},
{
"epoch": 1.652415562206071,
"grad_norm": 7.1560845375061035,
"learning_rate": 3.8646080760095015e-06,
"loss": 0.2496,
"step": 7730
},
{
"epoch": 1.6534843950406155,
"grad_norm": 4.27496337890625,
"learning_rate": 3.852731591448932e-06,
"loss": 0.2975,
"step": 7735
},
{
"epoch": 1.6545532278751605,
"grad_norm": 5.494519233703613,
"learning_rate": 3.840855106888362e-06,
"loss": 0.2744,
"step": 7740
},
{
"epoch": 1.655622060709705,
"grad_norm": 4.088238716125488,
"learning_rate": 3.8289786223277914e-06,
"loss": 0.2255,
"step": 7745
},
{
"epoch": 1.6566908935442497,
"grad_norm": 3.627351760864258,
"learning_rate": 3.817102137767221e-06,
"loss": 0.2387,
"step": 7750
},
{
"epoch": 1.6577597263787944,
"grad_norm": 4.195761680603027,
"learning_rate": 3.8052256532066513e-06,
"loss": 0.2724,
"step": 7755
},
{
"epoch": 1.658828559213339,
"grad_norm": 4.758053779602051,
"learning_rate": 3.793349168646081e-06,
"loss": 0.282,
"step": 7760
},
{
"epoch": 1.6598973920478837,
"grad_norm": 3.427823066711426,
"learning_rate": 3.781472684085511e-06,
"loss": 0.166,
"step": 7765
},
{
"epoch": 1.6609662248824284,
"grad_norm": 4.784726142883301,
"learning_rate": 3.769596199524941e-06,
"loss": 0.2653,
"step": 7770
},
{
"epoch": 1.662035057716973,
"grad_norm": 4.018444538116455,
"learning_rate": 3.757719714964371e-06,
"loss": 0.2368,
"step": 7775
},
{
"epoch": 1.6631038905515179,
"grad_norm": 4.532012462615967,
"learning_rate": 3.7458432304038006e-06,
"loss": 0.2235,
"step": 7780
},
{
"epoch": 1.6641727233860624,
"grad_norm": 4.576938152313232,
"learning_rate": 3.7339667458432303e-06,
"loss": 0.309,
"step": 7785
},
{
"epoch": 1.665241556220607,
"grad_norm": 4.126202583312988,
"learning_rate": 3.7220902612826604e-06,
"loss": 0.3402,
"step": 7790
},
{
"epoch": 1.6663103890551518,
"grad_norm": 5.895056247711182,
"learning_rate": 3.710213776722091e-06,
"loss": 0.2741,
"step": 7795
},
{
"epoch": 1.6673792218896963,
"grad_norm": 5.252209663391113,
"learning_rate": 3.6983372921615207e-06,
"loss": 0.2282,
"step": 7800
},
{
"epoch": 1.6684480547242413,
"grad_norm": 5.411665439605713,
"learning_rate": 3.6864608076009504e-06,
"loss": 0.3233,
"step": 7805
},
{
"epoch": 1.6695168875587858,
"grad_norm": 3.801215887069702,
"learning_rate": 3.6745843230403805e-06,
"loss": 0.23,
"step": 7810
},
{
"epoch": 1.6705857203933305,
"grad_norm": 5.455605983734131,
"learning_rate": 3.6627078384798102e-06,
"loss": 0.22,
"step": 7815
},
{
"epoch": 1.6716545532278753,
"grad_norm": 3.8827927112579346,
"learning_rate": 3.6508313539192404e-06,
"loss": 0.216,
"step": 7820
},
{
"epoch": 1.6727233860624198,
"grad_norm": 3.9195375442504883,
"learning_rate": 3.63895486935867e-06,
"loss": 0.3479,
"step": 7825
},
{
"epoch": 1.6737922188969645,
"grad_norm": 4.495283603668213,
"learning_rate": 3.6270783847981e-06,
"loss": 0.2256,
"step": 7830
},
{
"epoch": 1.6748610517315092,
"grad_norm": 5.642339706420898,
"learning_rate": 3.61520190023753e-06,
"loss": 0.3205,
"step": 7835
},
{
"epoch": 1.6759298845660537,
"grad_norm": 5.5151495933532715,
"learning_rate": 3.60332541567696e-06,
"loss": 0.28,
"step": 7840
},
{
"epoch": 1.6769987174005987,
"grad_norm": 3.8195252418518066,
"learning_rate": 3.5914489311163897e-06,
"loss": 0.255,
"step": 7845
},
{
"epoch": 1.6780675502351432,
"grad_norm": 5.310424327850342,
"learning_rate": 3.5795724465558194e-06,
"loss": 0.2584,
"step": 7850
},
{
"epoch": 1.679136383069688,
"grad_norm": 5.491156101226807,
"learning_rate": 3.5676959619952495e-06,
"loss": 0.254,
"step": 7855
},
{
"epoch": 1.6802052159042327,
"grad_norm": 4.094849109649658,
"learning_rate": 3.5558194774346792e-06,
"loss": 0.2051,
"step": 7860
},
{
"epoch": 1.6812740487387772,
"grad_norm": 3.9543018341064453,
"learning_rate": 3.54394299287411e-06,
"loss": 0.2653,
"step": 7865
},
{
"epoch": 1.682342881573322,
"grad_norm": 4.145587921142578,
"learning_rate": 3.5320665083135395e-06,
"loss": 0.2882,
"step": 7870
},
{
"epoch": 1.6834117144078666,
"grad_norm": 3.4505057334899902,
"learning_rate": 3.5201900237529696e-06,
"loss": 0.2685,
"step": 7875
},
{
"epoch": 1.6844805472424111,
"grad_norm": 4.536677837371826,
"learning_rate": 3.5083135391923993e-06,
"loss": 0.2606,
"step": 7880
},
{
"epoch": 1.685549380076956,
"grad_norm": 5.157629013061523,
"learning_rate": 3.4964370546318295e-06,
"loss": 0.2266,
"step": 7885
},
{
"epoch": 1.6866182129115006,
"grad_norm": 4.595909595489502,
"learning_rate": 3.484560570071259e-06,
"loss": 0.2112,
"step": 7890
},
{
"epoch": 1.6876870457460453,
"grad_norm": 4.331202030181885,
"learning_rate": 3.4726840855106893e-06,
"loss": 0.245,
"step": 7895
},
{
"epoch": 1.68875587858059,
"grad_norm": 5.239740371704102,
"learning_rate": 3.460807600950119e-06,
"loss": 0.2144,
"step": 7900
},
{
"epoch": 1.6898247114151346,
"grad_norm": 3.1925699710845947,
"learning_rate": 3.448931116389549e-06,
"loss": 0.3334,
"step": 7905
},
{
"epoch": 1.6908935442496793,
"grad_norm": 3.5667247772216797,
"learning_rate": 3.437054631828979e-06,
"loss": 0.2754,
"step": 7910
},
{
"epoch": 1.691962377084224,
"grad_norm": 4.145174026489258,
"learning_rate": 3.4251781472684085e-06,
"loss": 0.3048,
"step": 7915
},
{
"epoch": 1.6930312099187685,
"grad_norm": 3.559020519256592,
"learning_rate": 3.4133016627078386e-06,
"loss": 0.2319,
"step": 7920
},
{
"epoch": 1.6941000427533135,
"grad_norm": 3.1762850284576416,
"learning_rate": 3.4014251781472683e-06,
"loss": 0.3505,
"step": 7925
},
{
"epoch": 1.695168875587858,
"grad_norm": 4.600183963775635,
"learning_rate": 3.3895486935866985e-06,
"loss": 0.3171,
"step": 7930
},
{
"epoch": 1.6962377084224027,
"grad_norm": 4.069181442260742,
"learning_rate": 3.3776722090261286e-06,
"loss": 0.2359,
"step": 7935
},
{
"epoch": 1.6973065412569475,
"grad_norm": 5.979001998901367,
"learning_rate": 3.3657957244655587e-06,
"loss": 0.259,
"step": 7940
},
{
"epoch": 1.698375374091492,
"grad_norm": 4.2909040451049805,
"learning_rate": 3.3539192399049884e-06,
"loss": 0.2345,
"step": 7945
},
{
"epoch": 1.699444206926037,
"grad_norm": 4.572742938995361,
"learning_rate": 3.3420427553444185e-06,
"loss": 0.2364,
"step": 7950
},
{
"epoch": 1.7005130397605814,
"grad_norm": 4.979130744934082,
"learning_rate": 3.3301662707838482e-06,
"loss": 0.3125,
"step": 7955
},
{
"epoch": 1.7015818725951262,
"grad_norm": 8.828888893127441,
"learning_rate": 3.3182897862232784e-06,
"loss": 0.3855,
"step": 7960
},
{
"epoch": 1.702650705429671,
"grad_norm": 3.5113627910614014,
"learning_rate": 3.306413301662708e-06,
"loss": 0.3085,
"step": 7965
},
{
"epoch": 1.7037195382642154,
"grad_norm": 3.138580322265625,
"learning_rate": 3.294536817102138e-06,
"loss": 0.2558,
"step": 7970
},
{
"epoch": 1.7047883710987601,
"grad_norm": 3.382124900817871,
"learning_rate": 3.282660332541568e-06,
"loss": 0.2863,
"step": 7975
},
{
"epoch": 1.7058572039333049,
"grad_norm": 4.64111328125,
"learning_rate": 3.2707838479809976e-06,
"loss": 0.2763,
"step": 7980
},
{
"epoch": 1.7069260367678494,
"grad_norm": 3.9928252696990967,
"learning_rate": 3.2589073634204277e-06,
"loss": 0.2307,
"step": 7985
},
{
"epoch": 1.7079948696023943,
"grad_norm": 4.402683258056641,
"learning_rate": 3.2470308788598574e-06,
"loss": 0.2604,
"step": 7990
},
{
"epoch": 1.7090637024369388,
"grad_norm": 4.634458541870117,
"learning_rate": 3.2351543942992876e-06,
"loss": 0.3524,
"step": 7995
},
{
"epoch": 1.7101325352714836,
"grad_norm": 3.9876441955566406,
"learning_rate": 3.2232779097387173e-06,
"loss": 0.2591,
"step": 8000
},
{
"epoch": 1.7112013681060283,
"grad_norm": 5.491477012634277,
"learning_rate": 3.211401425178148e-06,
"loss": 0.2945,
"step": 8005
},
{
"epoch": 1.7122702009405728,
"grad_norm": 3.348909378051758,
"learning_rate": 3.1995249406175775e-06,
"loss": 0.2698,
"step": 8010
},
{
"epoch": 1.7133390337751175,
"grad_norm": 2.3808627128601074,
"learning_rate": 3.1876484560570076e-06,
"loss": 0.2091,
"step": 8015
},
{
"epoch": 1.7144078666096623,
"grad_norm": 4.511120319366455,
"learning_rate": 3.1757719714964373e-06,
"loss": 0.2313,
"step": 8020
},
{
"epoch": 1.7154766994442068,
"grad_norm": 3.1614320278167725,
"learning_rate": 3.1638954869358675e-06,
"loss": 0.2935,
"step": 8025
},
{
"epoch": 1.7165455322787517,
"grad_norm": 4.708336353302002,
"learning_rate": 3.152019002375297e-06,
"loss": 0.2358,
"step": 8030
},
{
"epoch": 1.7176143651132962,
"grad_norm": 5.274806499481201,
"learning_rate": 3.1401425178147273e-06,
"loss": 0.2842,
"step": 8035
},
{
"epoch": 1.718683197947841,
"grad_norm": 4.673067569732666,
"learning_rate": 3.128266033254157e-06,
"loss": 0.26,
"step": 8040
},
{
"epoch": 1.7197520307823857,
"grad_norm": 7.412868499755859,
"learning_rate": 3.1163895486935867e-06,
"loss": 0.312,
"step": 8045
},
{
"epoch": 1.7208208636169302,
"grad_norm": 5.098508834838867,
"learning_rate": 3.104513064133017e-06,
"loss": 0.2776,
"step": 8050
},
{
"epoch": 1.721889696451475,
"grad_norm": 2.9823100566864014,
"learning_rate": 3.0926365795724465e-06,
"loss": 0.1612,
"step": 8055
},
{
"epoch": 1.7229585292860197,
"grad_norm": 3.906702995300293,
"learning_rate": 3.0807600950118767e-06,
"loss": 0.1803,
"step": 8060
},
{
"epoch": 1.7240273621205642,
"grad_norm": 4.462987899780273,
"learning_rate": 3.0688836104513064e-06,
"loss": 0.2677,
"step": 8065
},
{
"epoch": 1.7250961949551091,
"grad_norm": 3.3349108695983887,
"learning_rate": 3.0570071258907365e-06,
"loss": 0.2315,
"step": 8070
},
{
"epoch": 1.7261650277896536,
"grad_norm": 3.8888843059539795,
"learning_rate": 3.0451306413301666e-06,
"loss": 0.2583,
"step": 8075
},
{
"epoch": 1.7272338606241984,
"grad_norm": 3.5807013511657715,
"learning_rate": 3.0332541567695967e-06,
"loss": 0.2488,
"step": 8080
},
{
"epoch": 1.728302693458743,
"grad_norm": 4.443240165710449,
"learning_rate": 3.0213776722090264e-06,
"loss": 0.2379,
"step": 8085
},
{
"epoch": 1.7293715262932876,
"grad_norm": 4.572385311126709,
"learning_rate": 3.0095011876484566e-06,
"loss": 0.2637,
"step": 8090
},
{
"epoch": 1.7304403591278326,
"grad_norm": 3.8426921367645264,
"learning_rate": 2.9976247030878863e-06,
"loss": 0.2101,
"step": 8095
},
{
"epoch": 1.731509191962377,
"grad_norm": 3.6695351600646973,
"learning_rate": 2.9857482185273164e-06,
"loss": 0.283,
"step": 8100
},
{
"epoch": 1.7325780247969218,
"grad_norm": 4.494965076446533,
"learning_rate": 2.973871733966746e-06,
"loss": 0.2163,
"step": 8105
},
{
"epoch": 1.7336468576314665,
"grad_norm": 4.575949192047119,
"learning_rate": 2.961995249406176e-06,
"loss": 0.2474,
"step": 8110
},
{
"epoch": 1.734715690466011,
"grad_norm": 5.060282230377197,
"learning_rate": 2.950118764845606e-06,
"loss": 0.3346,
"step": 8115
},
{
"epoch": 1.7357845233005558,
"grad_norm": 5.1213274002075195,
"learning_rate": 2.9382422802850356e-06,
"loss": 0.2031,
"step": 8120
},
{
"epoch": 1.7368533561351005,
"grad_norm": 4.754722595214844,
"learning_rate": 2.9263657957244658e-06,
"loss": 0.2301,
"step": 8125
},
{
"epoch": 1.737922188969645,
"grad_norm": 3.7561304569244385,
"learning_rate": 2.9144893111638955e-06,
"loss": 0.3413,
"step": 8130
},
{
"epoch": 1.73899102180419,
"grad_norm": 4.434960842132568,
"learning_rate": 2.9026128266033256e-06,
"loss": 0.3121,
"step": 8135
},
{
"epoch": 1.7400598546387345,
"grad_norm": 3.5216495990753174,
"learning_rate": 2.8907363420427553e-06,
"loss": 0.2696,
"step": 8140
},
{
"epoch": 1.7411286874732792,
"grad_norm": 3.2195262908935547,
"learning_rate": 2.878859857482186e-06,
"loss": 0.1871,
"step": 8145
},
{
"epoch": 1.742197520307824,
"grad_norm": 2.6963675022125244,
"learning_rate": 2.8669833729216155e-06,
"loss": 0.2427,
"step": 8150
},
{
"epoch": 1.7432663531423684,
"grad_norm": 3.3632442951202393,
"learning_rate": 2.8551068883610457e-06,
"loss": 0.215,
"step": 8155
},
{
"epoch": 1.7443351859769132,
"grad_norm": 4.627504825592041,
"learning_rate": 2.8432304038004754e-06,
"loss": 0.2603,
"step": 8160
},
{
"epoch": 1.745404018811458,
"grad_norm": 4.896625995635986,
"learning_rate": 2.8313539192399055e-06,
"loss": 0.2149,
"step": 8165
},
{
"epoch": 1.7464728516460024,
"grad_norm": 3.6175167560577393,
"learning_rate": 2.819477434679335e-06,
"loss": 0.2961,
"step": 8170
},
{
"epoch": 1.7475416844805474,
"grad_norm": 2.9704079627990723,
"learning_rate": 2.807600950118765e-06,
"loss": 0.2363,
"step": 8175
},
{
"epoch": 1.7486105173150919,
"grad_norm": 5.211386203765869,
"learning_rate": 2.795724465558195e-06,
"loss": 0.2238,
"step": 8180
},
{
"epoch": 1.7496793501496366,
"grad_norm": 4.538329601287842,
"learning_rate": 2.7838479809976247e-06,
"loss": 0.2522,
"step": 8185
},
{
"epoch": 1.7507481829841813,
"grad_norm": 4.693541049957275,
"learning_rate": 2.771971496437055e-06,
"loss": 0.2314,
"step": 8190
},
{
"epoch": 1.7518170158187258,
"grad_norm": 6.232285499572754,
"learning_rate": 2.7600950118764846e-06,
"loss": 0.34,
"step": 8195
},
{
"epoch": 1.7528858486532708,
"grad_norm": 3.3624300956726074,
"learning_rate": 2.7482185273159147e-06,
"loss": 0.3113,
"step": 8200
},
{
"epoch": 1.7539546814878153,
"grad_norm": 4.9479193687438965,
"learning_rate": 2.7363420427553444e-06,
"loss": 0.2022,
"step": 8205
},
{
"epoch": 1.75502351432236,
"grad_norm": 2.4150469303131104,
"learning_rate": 2.7244655581947745e-06,
"loss": 0.2244,
"step": 8210
},
{
"epoch": 1.7560923471569048,
"grad_norm": 2.7240800857543945,
"learning_rate": 2.7125890736342046e-06,
"loss": 0.1794,
"step": 8215
},
{
"epoch": 1.7571611799914493,
"grad_norm": 5.584763526916504,
"learning_rate": 2.7007125890736348e-06,
"loss": 0.3058,
"step": 8220
},
{
"epoch": 1.758230012825994,
"grad_norm": 6.3999505043029785,
"learning_rate": 2.6888361045130645e-06,
"loss": 0.2495,
"step": 8225
},
{
"epoch": 1.7592988456605387,
"grad_norm": 3.8963570594787598,
"learning_rate": 2.6769596199524946e-06,
"loss": 0.3586,
"step": 8230
},
{
"epoch": 1.7603676784950832,
"grad_norm": 4.307738780975342,
"learning_rate": 2.6650831353919243e-06,
"loss": 0.2726,
"step": 8235
},
{
"epoch": 1.7614365113296282,
"grad_norm": 4.685522079467773,
"learning_rate": 2.653206650831354e-06,
"loss": 0.2774,
"step": 8240
},
{
"epoch": 1.7625053441641727,
"grad_norm": 3.685218095779419,
"learning_rate": 2.641330166270784e-06,
"loss": 0.3009,
"step": 8245
},
{
"epoch": 1.7635741769987174,
"grad_norm": 3.9087140560150146,
"learning_rate": 2.629453681710214e-06,
"loss": 0.2813,
"step": 8250
},
{
"epoch": 1.7646430098332622,
"grad_norm": 4.455633163452148,
"learning_rate": 2.617577197149644e-06,
"loss": 0.2942,
"step": 8255
},
{
"epoch": 1.7657118426678067,
"grad_norm": 3.3832907676696777,
"learning_rate": 2.6057007125890737e-06,
"loss": 0.2463,
"step": 8260
},
{
"epoch": 1.7667806755023514,
"grad_norm": 4.235377788543701,
"learning_rate": 2.5938242280285038e-06,
"loss": 0.2399,
"step": 8265
},
{
"epoch": 1.7678495083368961,
"grad_norm": 5.997225761413574,
"learning_rate": 2.5819477434679335e-06,
"loss": 0.2725,
"step": 8270
},
{
"epoch": 1.7689183411714406,
"grad_norm": 3.9668803215026855,
"learning_rate": 2.5700712589073636e-06,
"loss": 0.2171,
"step": 8275
},
{
"epoch": 1.7699871740059856,
"grad_norm": 6.379711151123047,
"learning_rate": 2.5581947743467933e-06,
"loss": 0.3037,
"step": 8280
},
{
"epoch": 1.77105600684053,
"grad_norm": 4.1840901374816895,
"learning_rate": 2.546318289786224e-06,
"loss": 0.1921,
"step": 8285
},
{
"epoch": 1.7721248396750748,
"grad_norm": 3.4607646465301514,
"learning_rate": 2.5344418052256536e-06,
"loss": 0.2519,
"step": 8290
},
{
"epoch": 1.7731936725096196,
"grad_norm": 4.899019241333008,
"learning_rate": 2.5225653206650837e-06,
"loss": 0.2644,
"step": 8295
},
{
"epoch": 1.774262505344164,
"grad_norm": 3.769134283065796,
"learning_rate": 2.5106888361045134e-06,
"loss": 0.3707,
"step": 8300
},
{
"epoch": 1.7753313381787088,
"grad_norm": 3.0456831455230713,
"learning_rate": 2.4988123515439435e-06,
"loss": 0.1496,
"step": 8305
},
{
"epoch": 1.7764001710132535,
"grad_norm": 4.198024749755859,
"learning_rate": 2.4869358669833732e-06,
"loss": 0.2251,
"step": 8310
},
{
"epoch": 1.777469003847798,
"grad_norm": 3.964083194732666,
"learning_rate": 2.475059382422803e-06,
"loss": 0.2426,
"step": 8315
},
{
"epoch": 1.778537836682343,
"grad_norm": 4.519120216369629,
"learning_rate": 2.463182897862233e-06,
"loss": 0.2853,
"step": 8320
},
{
"epoch": 1.7796066695168875,
"grad_norm": 4.322653293609619,
"learning_rate": 2.4513064133016627e-06,
"loss": 0.2156,
"step": 8325
},
{
"epoch": 1.7806755023514322,
"grad_norm": 2.6961798667907715,
"learning_rate": 2.439429928741093e-06,
"loss": 0.2293,
"step": 8330
},
{
"epoch": 1.781744335185977,
"grad_norm": 4.139772415161133,
"learning_rate": 2.4275534441805226e-06,
"loss": 0.2516,
"step": 8335
},
{
"epoch": 1.7828131680205215,
"grad_norm": 3.3040573596954346,
"learning_rate": 2.4156769596199527e-06,
"loss": 0.2272,
"step": 8340
},
{
"epoch": 1.7838820008550664,
"grad_norm": 4.51014518737793,
"learning_rate": 2.403800475059383e-06,
"loss": 0.2995,
"step": 8345
},
{
"epoch": 1.784950833689611,
"grad_norm": 3.647020101547241,
"learning_rate": 2.3919239904988125e-06,
"loss": 0.2825,
"step": 8350
},
{
"epoch": 1.7860196665241557,
"grad_norm": 3.456620931625366,
"learning_rate": 2.3800475059382427e-06,
"loss": 0.2604,
"step": 8355
},
{
"epoch": 1.7870884993587004,
"grad_norm": 5.626756191253662,
"learning_rate": 2.3681710213776724e-06,
"loss": 0.2216,
"step": 8360
},
{
"epoch": 1.788157332193245,
"grad_norm": 4.277560710906982,
"learning_rate": 2.356294536817102e-06,
"loss": 0.3034,
"step": 8365
},
{
"epoch": 1.7892261650277896,
"grad_norm": 2.8576090335845947,
"learning_rate": 2.344418052256532e-06,
"loss": 0.229,
"step": 8370
},
{
"epoch": 1.7902949978623344,
"grad_norm": 4.79686975479126,
"learning_rate": 2.3325415676959623e-06,
"loss": 0.271,
"step": 8375
},
{
"epoch": 1.7913638306968789,
"grad_norm": 5.135036945343018,
"learning_rate": 2.320665083135392e-06,
"loss": 0.2371,
"step": 8380
},
{
"epoch": 1.7924326635314238,
"grad_norm": 5.7761406898498535,
"learning_rate": 2.308788598574822e-06,
"loss": 0.2592,
"step": 8385
},
{
"epoch": 1.7935014963659683,
"grad_norm": 2.8430325984954834,
"learning_rate": 2.296912114014252e-06,
"loss": 0.206,
"step": 8390
},
{
"epoch": 1.794570329200513,
"grad_norm": 4.540223598480225,
"learning_rate": 2.285035629453682e-06,
"loss": 0.2167,
"step": 8395
},
{
"epoch": 1.7956391620350578,
"grad_norm": 4.889501094818115,
"learning_rate": 2.2731591448931117e-06,
"loss": 0.2521,
"step": 8400
},
{
"epoch": 1.7967079948696023,
"grad_norm": 3.3274142742156982,
"learning_rate": 2.261282660332542e-06,
"loss": 0.2283,
"step": 8405
},
{
"epoch": 1.797776827704147,
"grad_norm": 3.501002073287964,
"learning_rate": 2.249406175771972e-06,
"loss": 0.2161,
"step": 8410
},
{
"epoch": 1.7988456605386918,
"grad_norm": 2.9936413764953613,
"learning_rate": 2.2375296912114016e-06,
"loss": 0.2233,
"step": 8415
},
{
"epoch": 1.7999144933732363,
"grad_norm": 4.086530685424805,
"learning_rate": 2.2256532066508318e-06,
"loss": 0.2799,
"step": 8420
},
{
"epoch": 1.8009833262077812,
"grad_norm": 4.791090965270996,
"learning_rate": 2.2137767220902615e-06,
"loss": 0.2558,
"step": 8425
},
{
"epoch": 1.8020521590423257,
"grad_norm": 4.07485294342041,
"learning_rate": 2.201900237529691e-06,
"loss": 0.3093,
"step": 8430
},
{
"epoch": 1.8031209918768705,
"grad_norm": 4.454413414001465,
"learning_rate": 2.1900237529691213e-06,
"loss": 0.2751,
"step": 8435
},
{
"epoch": 1.8041898247114152,
"grad_norm": 4.849613666534424,
"learning_rate": 2.178147268408551e-06,
"loss": 0.268,
"step": 8440
},
{
"epoch": 1.8052586575459597,
"grad_norm": 4.424874782562256,
"learning_rate": 2.166270783847981e-06,
"loss": 0.2473,
"step": 8445
},
{
"epoch": 1.8063274903805044,
"grad_norm": 5.070244789123535,
"learning_rate": 2.1543942992874112e-06,
"loss": 0.3218,
"step": 8450
},
{
"epoch": 1.8073963232150492,
"grad_norm": 4.407561302185059,
"learning_rate": 2.142517814726841e-06,
"loss": 0.2602,
"step": 8455
},
{
"epoch": 1.8084651560495937,
"grad_norm": 3.2732160091400146,
"learning_rate": 2.130641330166271e-06,
"loss": 0.2118,
"step": 8460
},
{
"epoch": 1.8095339888841386,
"grad_norm": 6.757079124450684,
"learning_rate": 2.1187648456057008e-06,
"loss": 0.2526,
"step": 8465
},
{
"epoch": 1.8106028217186831,
"grad_norm": 3.9517734050750732,
"learning_rate": 2.106888361045131e-06,
"loss": 0.2797,
"step": 8470
},
{
"epoch": 1.8116716545532279,
"grad_norm": 3.6137807369232178,
"learning_rate": 2.0950118764845606e-06,
"loss": 0.2177,
"step": 8475
},
{
"epoch": 1.8127404873877726,
"grad_norm": 3.5731587409973145,
"learning_rate": 2.0831353919239907e-06,
"loss": 0.2264,
"step": 8480
},
{
"epoch": 1.8138093202223171,
"grad_norm": 4.859638690948486,
"learning_rate": 2.071258907363421e-06,
"loss": 0.253,
"step": 8485
},
{
"epoch": 1.814878153056862,
"grad_norm": 4.231696605682373,
"learning_rate": 2.0593824228028506e-06,
"loss": 0.199,
"step": 8490
},
{
"epoch": 1.8159469858914066,
"grad_norm": 3.7343459129333496,
"learning_rate": 2.0475059382422803e-06,
"loss": 0.2484,
"step": 8495
},
{
"epoch": 1.8170158187259513,
"grad_norm": 4.6749958992004395,
"learning_rate": 2.0356294536817104e-06,
"loss": 0.2666,
"step": 8500
},
{
"epoch": 1.818084651560496,
"grad_norm": 3.5160164833068848,
"learning_rate": 2.02375296912114e-06,
"loss": 0.2423,
"step": 8505
},
{
"epoch": 1.8191534843950405,
"grad_norm": 5.324501037597656,
"learning_rate": 2.01187648456057e-06,
"loss": 0.3206,
"step": 8510
},
{
"epoch": 1.8202223172295853,
"grad_norm": 4.3562092781066895,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.1885,
"step": 8515
},
{
"epoch": 1.82129115006413,
"grad_norm": 3.188398838043213,
"learning_rate": 1.98812351543943e-06,
"loss": 0.3103,
"step": 8520
},
{
"epoch": 1.8223599828986745,
"grad_norm": 3.9081082344055176,
"learning_rate": 1.97624703087886e-06,
"loss": 0.2204,
"step": 8525
},
{
"epoch": 1.8234288157332195,
"grad_norm": 4.220818519592285,
"learning_rate": 1.96437054631829e-06,
"loss": 0.2269,
"step": 8530
},
{
"epoch": 1.824497648567764,
"grad_norm": 4.2256035804748535,
"learning_rate": 1.95249406175772e-06,
"loss": 0.3602,
"step": 8535
},
{
"epoch": 1.8255664814023087,
"grad_norm": 3.092357635498047,
"learning_rate": 1.9406175771971497e-06,
"loss": 0.1735,
"step": 8540
},
{
"epoch": 1.8266353142368534,
"grad_norm": 5.8758649826049805,
"learning_rate": 1.9287410926365794e-06,
"loss": 0.3107,
"step": 8545
},
{
"epoch": 1.827704147071398,
"grad_norm": 4.43316650390625,
"learning_rate": 1.91686460807601e-06,
"loss": 0.2346,
"step": 8550
},
{
"epoch": 1.8287729799059427,
"grad_norm": 4.877310276031494,
"learning_rate": 1.9049881235154396e-06,
"loss": 0.2372,
"step": 8555
},
{
"epoch": 1.8298418127404874,
"grad_norm": 5.378355026245117,
"learning_rate": 1.8931116389548696e-06,
"loss": 0.2665,
"step": 8560
},
{
"epoch": 1.830910645575032,
"grad_norm": 4.576028347015381,
"learning_rate": 1.8812351543942995e-06,
"loss": 0.2722,
"step": 8565
},
{
"epoch": 1.8319794784095769,
"grad_norm": 2.452864646911621,
"learning_rate": 1.8693586698337294e-06,
"loss": 0.1879,
"step": 8570
},
{
"epoch": 1.8330483112441214,
"grad_norm": 4.013648509979248,
"learning_rate": 1.8574821852731593e-06,
"loss": 0.2434,
"step": 8575
},
{
"epoch": 1.834117144078666,
"grad_norm": 4.295891761779785,
"learning_rate": 1.845605700712589e-06,
"loss": 0.3408,
"step": 8580
},
{
"epoch": 1.8351859769132108,
"grad_norm": 5.399013042449951,
"learning_rate": 1.8337292161520193e-06,
"loss": 0.2819,
"step": 8585
},
{
"epoch": 1.8362548097477553,
"grad_norm": 5.267197608947754,
"learning_rate": 1.8218527315914493e-06,
"loss": 0.2828,
"step": 8590
},
{
"epoch": 1.8373236425823,
"grad_norm": 4.1791672706604,
"learning_rate": 1.809976247030879e-06,
"loss": 0.1821,
"step": 8595
},
{
"epoch": 1.8383924754168448,
"grad_norm": 4.158424377441406,
"learning_rate": 1.7980997624703089e-06,
"loss": 0.2193,
"step": 8600
},
{
"epoch": 1.8394613082513895,
"grad_norm": 3.101128101348877,
"learning_rate": 1.7862232779097388e-06,
"loss": 0.3155,
"step": 8605
},
{
"epoch": 1.8405301410859343,
"grad_norm": 3.661057233810425,
"learning_rate": 1.7743467933491687e-06,
"loss": 0.2129,
"step": 8610
},
{
"epoch": 1.8415989739204788,
"grad_norm": 3.7547378540039062,
"learning_rate": 1.7624703087885986e-06,
"loss": 0.2321,
"step": 8615
},
{
"epoch": 1.8426678067550235,
"grad_norm": 4.53202486038208,
"learning_rate": 1.7505938242280287e-06,
"loss": 0.1748,
"step": 8620
},
{
"epoch": 1.8437366395895682,
"grad_norm": 3.7189040184020996,
"learning_rate": 1.7387173396674587e-06,
"loss": 0.2355,
"step": 8625
},
{
"epoch": 1.8448054724241127,
"grad_norm": 5.827390670776367,
"learning_rate": 1.7268408551068886e-06,
"loss": 0.2226,
"step": 8630
},
{
"epoch": 1.8458743052586577,
"grad_norm": 4.365615367889404,
"learning_rate": 1.7149643705463185e-06,
"loss": 0.2812,
"step": 8635
},
{
"epoch": 1.8469431380932022,
"grad_norm": 4.593905925750732,
"learning_rate": 1.7030878859857484e-06,
"loss": 0.2542,
"step": 8640
},
{
"epoch": 1.848011970927747,
"grad_norm": 4.3599419593811035,
"learning_rate": 1.691211401425178e-06,
"loss": 0.214,
"step": 8645
},
{
"epoch": 1.8490808037622917,
"grad_norm": 5.342328071594238,
"learning_rate": 1.679334916864608e-06,
"loss": 0.2157,
"step": 8650
},
{
"epoch": 1.8501496365968362,
"grad_norm": 3.1678943634033203,
"learning_rate": 1.6674584323040384e-06,
"loss": 0.2336,
"step": 8655
},
{
"epoch": 1.851218469431381,
"grad_norm": 4.464089870452881,
"learning_rate": 1.655581947743468e-06,
"loss": 0.3409,
"step": 8660
},
{
"epoch": 1.8522873022659256,
"grad_norm": 4.1919755935668945,
"learning_rate": 1.643705463182898e-06,
"loss": 0.2902,
"step": 8665
},
{
"epoch": 1.8533561351004701,
"grad_norm": 3.814858913421631,
"learning_rate": 1.6318289786223279e-06,
"loss": 0.2329,
"step": 8670
},
{
"epoch": 1.854424967935015,
"grad_norm": 3.2706382274627686,
"learning_rate": 1.6199524940617578e-06,
"loss": 0.1849,
"step": 8675
},
{
"epoch": 1.8554938007695596,
"grad_norm": 3.6442952156066895,
"learning_rate": 1.6080760095011877e-06,
"loss": 0.2919,
"step": 8680
},
{
"epoch": 1.8565626336041043,
"grad_norm": 3.179872512817383,
"learning_rate": 1.5961995249406176e-06,
"loss": 0.2157,
"step": 8685
},
{
"epoch": 1.857631466438649,
"grad_norm": 3.71156644821167,
"learning_rate": 1.5843230403800478e-06,
"loss": 0.2286,
"step": 8690
},
{
"epoch": 1.8587002992731936,
"grad_norm": 5.000162124633789,
"learning_rate": 1.5724465558194777e-06,
"loss": 0.1957,
"step": 8695
},
{
"epoch": 1.8597691321077383,
"grad_norm": 3.7217514514923096,
"learning_rate": 1.5605700712589076e-06,
"loss": 0.2102,
"step": 8700
},
{
"epoch": 1.860837964942283,
"grad_norm": 5.23848295211792,
"learning_rate": 1.5486935866983375e-06,
"loss": 0.3172,
"step": 8705
},
{
"epoch": 1.8619067977768275,
"grad_norm": 3.95940899848938,
"learning_rate": 1.5368171021377672e-06,
"loss": 0.2619,
"step": 8710
},
{
"epoch": 1.8629756306113725,
"grad_norm": 4.389864921569824,
"learning_rate": 1.5249406175771971e-06,
"loss": 0.2898,
"step": 8715
},
{
"epoch": 1.864044463445917,
"grad_norm": 4.196899890899658,
"learning_rate": 1.513064133016627e-06,
"loss": 0.2523,
"step": 8720
},
{
"epoch": 1.8651132962804617,
"grad_norm": 4.35107946395874,
"learning_rate": 1.5011876484560572e-06,
"loss": 0.2534,
"step": 8725
},
{
"epoch": 1.8661821291150065,
"grad_norm": 5.233465194702148,
"learning_rate": 1.489311163895487e-06,
"loss": 0.2546,
"step": 8730
},
{
"epoch": 1.867250961949551,
"grad_norm": 4.285619735717773,
"learning_rate": 1.477434679334917e-06,
"loss": 0.2171,
"step": 8735
},
{
"epoch": 1.868319794784096,
"grad_norm": 5.0237579345703125,
"learning_rate": 1.465558194774347e-06,
"loss": 0.2617,
"step": 8740
},
{
"epoch": 1.8693886276186404,
"grad_norm": 3.848062753677368,
"learning_rate": 1.4536817102137768e-06,
"loss": 0.1917,
"step": 8745
},
{
"epoch": 1.8704574604531852,
"grad_norm": 3.6329150199890137,
"learning_rate": 1.4418052256532067e-06,
"loss": 0.2256,
"step": 8750
},
{
"epoch": 1.87152629328773,
"grad_norm": 4.504333019256592,
"learning_rate": 1.4299287410926366e-06,
"loss": 0.2319,
"step": 8755
},
{
"epoch": 1.8725951261222744,
"grad_norm": 6.011372089385986,
"learning_rate": 1.4180522565320668e-06,
"loss": 0.2783,
"step": 8760
},
{
"epoch": 1.8736639589568191,
"grad_norm": 4.750868320465088,
"learning_rate": 1.4061757719714967e-06,
"loss": 0.2885,
"step": 8765
},
{
"epoch": 1.8747327917913639,
"grad_norm": 3.2728309631347656,
"learning_rate": 1.3942992874109266e-06,
"loss": 0.2586,
"step": 8770
},
{
"epoch": 1.8758016246259084,
"grad_norm": 3.3371262550354004,
"learning_rate": 1.3824228028503565e-06,
"loss": 0.2009,
"step": 8775
},
{
"epoch": 1.8768704574604533,
"grad_norm": 3.7395825386047363,
"learning_rate": 1.3705463182897862e-06,
"loss": 0.273,
"step": 8780
},
{
"epoch": 1.8779392902949978,
"grad_norm": 4.672481060028076,
"learning_rate": 1.3586698337292161e-06,
"loss": 0.2502,
"step": 8785
},
{
"epoch": 1.8790081231295426,
"grad_norm": 2.957099676132202,
"learning_rate": 1.346793349168646e-06,
"loss": 0.2174,
"step": 8790
},
{
"epoch": 1.8800769559640873,
"grad_norm": 4.8943915367126465,
"learning_rate": 1.3349168646080762e-06,
"loss": 0.2723,
"step": 8795
},
{
"epoch": 1.8811457887986318,
"grad_norm": 4.067677021026611,
"learning_rate": 1.323040380047506e-06,
"loss": 0.2633,
"step": 8800
},
{
"epoch": 1.8822146216331765,
"grad_norm": 4.314869403839111,
"learning_rate": 1.311163895486936e-06,
"loss": 0.2794,
"step": 8805
},
{
"epoch": 1.8832834544677213,
"grad_norm": 4.225076675415039,
"learning_rate": 1.299287410926366e-06,
"loss": 0.2961,
"step": 8810
},
{
"epoch": 1.8843522873022658,
"grad_norm": 3.992135763168335,
"learning_rate": 1.2874109263657958e-06,
"loss": 0.2598,
"step": 8815
},
{
"epoch": 1.8854211201368107,
"grad_norm": 4.5158586502075195,
"learning_rate": 1.2755344418052257e-06,
"loss": 0.2794,
"step": 8820
},
{
"epoch": 1.8864899529713552,
"grad_norm": 4.226551055908203,
"learning_rate": 1.2636579572446556e-06,
"loss": 0.2447,
"step": 8825
},
{
"epoch": 1.8875587858059,
"grad_norm": 3.2052338123321533,
"learning_rate": 1.2517814726840858e-06,
"loss": 0.2741,
"step": 8830
},
{
"epoch": 1.8886276186404447,
"grad_norm": 3.315537929534912,
"learning_rate": 1.2399049881235155e-06,
"loss": 0.2192,
"step": 8835
},
{
"epoch": 1.8896964514749892,
"grad_norm": 4.095473289489746,
"learning_rate": 1.2280285035629456e-06,
"loss": 0.3188,
"step": 8840
},
{
"epoch": 1.890765284309534,
"grad_norm": 4.654134273529053,
"learning_rate": 1.2161520190023753e-06,
"loss": 0.294,
"step": 8845
},
{
"epoch": 1.8918341171440787,
"grad_norm": 3.982452154159546,
"learning_rate": 1.2042755344418052e-06,
"loss": 0.2961,
"step": 8850
},
{
"epoch": 1.8929029499786232,
"grad_norm": 3.594325542449951,
"learning_rate": 1.1923990498812353e-06,
"loss": 0.2288,
"step": 8855
},
{
"epoch": 1.8939717828131681,
"grad_norm": 4.437509059906006,
"learning_rate": 1.1805225653206653e-06,
"loss": 0.2796,
"step": 8860
},
{
"epoch": 1.8950406156477126,
"grad_norm": 4.6788716316223145,
"learning_rate": 1.1686460807600952e-06,
"loss": 0.2464,
"step": 8865
},
{
"epoch": 1.8961094484822574,
"grad_norm": 4.381009578704834,
"learning_rate": 1.1567695961995249e-06,
"loss": 0.2435,
"step": 8870
},
{
"epoch": 1.897178281316802,
"grad_norm": 4.203982353210449,
"learning_rate": 1.144893111638955e-06,
"loss": 0.2993,
"step": 8875
},
{
"epoch": 1.8982471141513466,
"grad_norm": 3.9560775756835938,
"learning_rate": 1.133016627078385e-06,
"loss": 0.2049,
"step": 8880
},
{
"epoch": 1.8993159469858916,
"grad_norm": 4.908998012542725,
"learning_rate": 1.1211401425178148e-06,
"loss": 0.2588,
"step": 8885
},
{
"epoch": 1.900384779820436,
"grad_norm": 2.399383544921875,
"learning_rate": 1.1092636579572447e-06,
"loss": 0.2559,
"step": 8890
},
{
"epoch": 1.9014536126549808,
"grad_norm": 5.100274085998535,
"learning_rate": 1.0973871733966747e-06,
"loss": 0.261,
"step": 8895
},
{
"epoch": 1.9025224454895255,
"grad_norm": 1.9479761123657227,
"learning_rate": 1.0855106888361046e-06,
"loss": 0.2132,
"step": 8900
},
{
"epoch": 1.90359127832407,
"grad_norm": 4.266331195831299,
"learning_rate": 1.0736342042755345e-06,
"loss": 0.2184,
"step": 8905
},
{
"epoch": 1.9046601111586148,
"grad_norm": 3.761469841003418,
"learning_rate": 1.0617577197149644e-06,
"loss": 0.2551,
"step": 8910
},
{
"epoch": 1.9057289439931595,
"grad_norm": 5.301465034484863,
"learning_rate": 1.0498812351543943e-06,
"loss": 0.2302,
"step": 8915
},
{
"epoch": 1.906797776827704,
"grad_norm": 4.8627095222473145,
"learning_rate": 1.0380047505938242e-06,
"loss": 0.2441,
"step": 8920
},
{
"epoch": 1.907866609662249,
"grad_norm": 3.7152163982391357,
"learning_rate": 1.0261282660332544e-06,
"loss": 0.2176,
"step": 8925
},
{
"epoch": 1.9089354424967935,
"grad_norm": 4.612980365753174,
"learning_rate": 1.0142517814726843e-06,
"loss": 0.3041,
"step": 8930
},
{
"epoch": 1.9100042753313382,
"grad_norm": 3.9601426124572754,
"learning_rate": 1.002375296912114e-06,
"loss": 0.2325,
"step": 8935
},
{
"epoch": 1.911073108165883,
"grad_norm": 3.773958921432495,
"learning_rate": 9.904988123515439e-07,
"loss": 0.2463,
"step": 8940
},
{
"epoch": 1.9121419410004274,
"grad_norm": 5.172873020172119,
"learning_rate": 9.78622327790974e-07,
"loss": 0.2997,
"step": 8945
},
{
"epoch": 1.9132107738349722,
"grad_norm": 3.382683038711548,
"learning_rate": 9.66745843230404e-07,
"loss": 0.2202,
"step": 8950
},
{
"epoch": 1.914279606669517,
"grad_norm": 5.699649333953857,
"learning_rate": 9.548693586698338e-07,
"loss": 0.2745,
"step": 8955
},
{
"epoch": 1.9153484395040614,
"grad_norm": 4.574731349945068,
"learning_rate": 9.429928741092638e-07,
"loss": 0.2642,
"step": 8960
},
{
"epoch": 1.9164172723386064,
"grad_norm": 7.173608303070068,
"learning_rate": 9.311163895486937e-07,
"loss": 0.2782,
"step": 8965
},
{
"epoch": 1.9174861051731509,
"grad_norm": 3.9324846267700195,
"learning_rate": 9.192399049881236e-07,
"loss": 0.2435,
"step": 8970
},
{
"epoch": 1.9185549380076956,
"grad_norm": 3.742494583129883,
"learning_rate": 9.073634204275535e-07,
"loss": 0.2492,
"step": 8975
},
{
"epoch": 1.9196237708422403,
"grad_norm": 5.236582279205322,
"learning_rate": 8.954869358669835e-07,
"loss": 0.2161,
"step": 8980
},
{
"epoch": 1.9206926036767848,
"grad_norm": 3.473259449005127,
"learning_rate": 8.836104513064133e-07,
"loss": 0.2549,
"step": 8985
},
{
"epoch": 1.9217614365113296,
"grad_norm": 3.2006514072418213,
"learning_rate": 8.717339667458432e-07,
"loss": 0.2217,
"step": 8990
},
{
"epoch": 1.9228302693458743,
"grad_norm": 3.0505008697509766,
"learning_rate": 8.598574821852733e-07,
"loss": 0.266,
"step": 8995
},
{
"epoch": 1.9238991021804188,
"grad_norm": 3.8124094009399414,
"learning_rate": 8.479809976247032e-07,
"loss": 0.2909,
"step": 9000
},
{
"epoch": 1.9249679350149638,
"grad_norm": 3.0390665531158447,
"learning_rate": 8.361045130641331e-07,
"loss": 0.2149,
"step": 9005
},
{
"epoch": 1.9260367678495083,
"grad_norm": 3.928755521774292,
"learning_rate": 8.24228028503563e-07,
"loss": 0.3099,
"step": 9010
},
{
"epoch": 1.927105600684053,
"grad_norm": 4.092939376831055,
"learning_rate": 8.12351543942993e-07,
"loss": 0.1929,
"step": 9015
},
{
"epoch": 1.9281744335185977,
"grad_norm": 4.7592573165893555,
"learning_rate": 8.004750593824228e-07,
"loss": 0.2854,
"step": 9020
},
{
"epoch": 1.9292432663531422,
"grad_norm": 3.904730796813965,
"learning_rate": 7.885985748218527e-07,
"loss": 0.1857,
"step": 9025
},
{
"epoch": 1.9303120991876872,
"grad_norm": 4.656405925750732,
"learning_rate": 7.767220902612828e-07,
"loss": 0.2445,
"step": 9030
},
{
"epoch": 1.9313809320222317,
"grad_norm": 3.890486240386963,
"learning_rate": 7.648456057007127e-07,
"loss": 0.226,
"step": 9035
},
{
"epoch": 1.9324497648567764,
"grad_norm": 4.5724334716796875,
"learning_rate": 7.529691211401426e-07,
"loss": 0.2822,
"step": 9040
},
{
"epoch": 1.9335185976913212,
"grad_norm": 4.720613479614258,
"learning_rate": 7.410926365795724e-07,
"loss": 0.2541,
"step": 9045
},
{
"epoch": 1.9345874305258657,
"grad_norm": 3.9262373447418213,
"learning_rate": 7.292161520190025e-07,
"loss": 0.2442,
"step": 9050
},
{
"epoch": 1.9356562633604104,
"grad_norm": 3.6456849575042725,
"learning_rate": 7.173396674584323e-07,
"loss": 0.2504,
"step": 9055
},
{
"epoch": 1.9367250961949551,
"grad_norm": 3.021383762359619,
"learning_rate": 7.054631828978623e-07,
"loss": 0.2073,
"step": 9060
},
{
"epoch": 1.9377939290294997,
"grad_norm": 4.671846389770508,
"learning_rate": 6.935866983372923e-07,
"loss": 0.2245,
"step": 9065
},
{
"epoch": 1.9388627618640446,
"grad_norm": 4.805634021759033,
"learning_rate": 6.817102137767222e-07,
"loss": 0.2442,
"step": 9070
},
{
"epoch": 1.9399315946985891,
"grad_norm": 3.9393720626831055,
"learning_rate": 6.698337292161521e-07,
"loss": 0.2382,
"step": 9075
},
{
"epoch": 1.9410004275331338,
"grad_norm": 5.1551408767700195,
"learning_rate": 6.579572446555819e-07,
"loss": 0.2482,
"step": 9080
},
{
"epoch": 1.9420692603676786,
"grad_norm": 5.381765365600586,
"learning_rate": 6.460807600950119e-07,
"loss": 0.2849,
"step": 9085
},
{
"epoch": 1.943138093202223,
"grad_norm": 3.842059850692749,
"learning_rate": 6.342042755344418e-07,
"loss": 0.2666,
"step": 9090
},
{
"epoch": 1.9442069260367678,
"grad_norm": 4.254835605621338,
"learning_rate": 6.223277909738719e-07,
"loss": 0.224,
"step": 9095
},
{
"epoch": 1.9452757588713125,
"grad_norm": 5.467522144317627,
"learning_rate": 6.104513064133017e-07,
"loss": 0.2961,
"step": 9100
},
{
"epoch": 1.946344591705857,
"grad_norm": 4.110438823699951,
"learning_rate": 5.985748218527317e-07,
"loss": 0.217,
"step": 9105
},
{
"epoch": 1.947413424540402,
"grad_norm": 4.675514221191406,
"learning_rate": 5.866983372921616e-07,
"loss": 0.2384,
"step": 9110
},
{
"epoch": 1.9484822573749465,
"grad_norm": 4.90285062789917,
"learning_rate": 5.748218527315915e-07,
"loss": 0.2205,
"step": 9115
},
{
"epoch": 1.9495510902094912,
"grad_norm": 4.838087558746338,
"learning_rate": 5.629453681710214e-07,
"loss": 0.2807,
"step": 9120
},
{
"epoch": 1.950619923044036,
"grad_norm": 4.49014949798584,
"learning_rate": 5.510688836104513e-07,
"loss": 0.2577,
"step": 9125
},
{
"epoch": 1.9516887558785805,
"grad_norm": 6.248046398162842,
"learning_rate": 5.391923990498813e-07,
"loss": 0.3212,
"step": 9130
},
{
"epoch": 1.9527575887131252,
"grad_norm": 2.6727161407470703,
"learning_rate": 5.273159144893112e-07,
"loss": 0.239,
"step": 9135
},
{
"epoch": 1.95382642154767,
"grad_norm": 5.567617416381836,
"learning_rate": 5.154394299287412e-07,
"loss": 0.2283,
"step": 9140
},
{
"epoch": 1.9548952543822147,
"grad_norm": 4.877483367919922,
"learning_rate": 5.03562945368171e-07,
"loss": 0.2606,
"step": 9145
},
{
"epoch": 1.9559640872167594,
"grad_norm": 4.150485515594482,
"learning_rate": 4.91686460807601e-07,
"loss": 0.2684,
"step": 9150
},
{
"epoch": 1.957032920051304,
"grad_norm": 4.878507614135742,
"learning_rate": 4.798099762470309e-07,
"loss": 0.2857,
"step": 9155
},
{
"epoch": 1.9581017528858486,
"grad_norm": 5.343387126922607,
"learning_rate": 4.6793349168646085e-07,
"loss": 0.2962,
"step": 9160
},
{
"epoch": 1.9591705857203934,
"grad_norm": 4.346437454223633,
"learning_rate": 4.560570071258908e-07,
"loss": 0.2639,
"step": 9165
},
{
"epoch": 1.9602394185549379,
"grad_norm": 5.331128120422363,
"learning_rate": 4.441805225653207e-07,
"loss": 0.2549,
"step": 9170
},
{
"epoch": 1.9613082513894828,
"grad_norm": 4.075921535491943,
"learning_rate": 4.3230403800475065e-07,
"loss": 0.2561,
"step": 9175
},
{
"epoch": 1.9623770842240273,
"grad_norm": 4.879267692565918,
"learning_rate": 4.2042755344418056e-07,
"loss": 0.2324,
"step": 9180
},
{
"epoch": 1.963445917058572,
"grad_norm": 4.325537204742432,
"learning_rate": 4.085510688836105e-07,
"loss": 0.3142,
"step": 9185
},
{
"epoch": 1.9645147498931168,
"grad_norm": 3.530134439468384,
"learning_rate": 3.966745843230404e-07,
"loss": 0.2778,
"step": 9190
},
{
"epoch": 1.9655835827276613,
"grad_norm": 2.6315903663635254,
"learning_rate": 3.8479809976247036e-07,
"loss": 0.2603,
"step": 9195
},
{
"epoch": 1.966652415562206,
"grad_norm": 4.100142002105713,
"learning_rate": 3.729216152019002e-07,
"loss": 0.2191,
"step": 9200
},
{
"epoch": 1.9677212483967508,
"grad_norm": 3.4908711910247803,
"learning_rate": 3.610451306413302e-07,
"loss": 0.2658,
"step": 9205
},
{
"epoch": 1.9687900812312953,
"grad_norm": 4.331186771392822,
"learning_rate": 3.4916864608076015e-07,
"loss": 0.2701,
"step": 9210
},
{
"epoch": 1.9698589140658402,
"grad_norm": 6.090305805206299,
"learning_rate": 3.3729216152019e-07,
"loss": 0.3,
"step": 9215
},
{
"epoch": 1.9709277469003847,
"grad_norm": 3.7345423698425293,
"learning_rate": 3.2541567695962e-07,
"loss": 0.2728,
"step": 9220
},
{
"epoch": 1.9719965797349295,
"grad_norm": 6.370054244995117,
"learning_rate": 3.135391923990499e-07,
"loss": 0.2724,
"step": 9225
},
{
"epoch": 1.9730654125694742,
"grad_norm": 3.2030200958251953,
"learning_rate": 3.0166270783847986e-07,
"loss": 0.1735,
"step": 9230
},
{
"epoch": 1.9741342454040187,
"grad_norm": 3.904633045196533,
"learning_rate": 2.897862232779098e-07,
"loss": 0.25,
"step": 9235
},
{
"epoch": 1.9752030782385634,
"grad_norm": 5.196364402770996,
"learning_rate": 2.779097387173397e-07,
"loss": 0.276,
"step": 9240
},
{
"epoch": 1.9762719110731082,
"grad_norm": 5.8324785232543945,
"learning_rate": 2.660332541567696e-07,
"loss": 0.2581,
"step": 9245
},
{
"epoch": 1.9773407439076527,
"grad_norm": 3.4866878986358643,
"learning_rate": 2.541567695961995e-07,
"loss": 0.2545,
"step": 9250
},
{
"epoch": 1.9784095767421976,
"grad_norm": 5.080046653747559,
"learning_rate": 2.422802850356295e-07,
"loss": 0.2251,
"step": 9255
},
{
"epoch": 1.9794784095767421,
"grad_norm": 4.654627799987793,
"learning_rate": 2.304038004750594e-07,
"loss": 0.269,
"step": 9260
},
{
"epoch": 1.9805472424112869,
"grad_norm": 4.3756327629089355,
"learning_rate": 2.1852731591448934e-07,
"loss": 0.2026,
"step": 9265
},
{
"epoch": 1.9816160752458316,
"grad_norm": 4.612358093261719,
"learning_rate": 2.0665083135391925e-07,
"loss": 0.2297,
"step": 9270
},
{
"epoch": 1.9826849080803761,
"grad_norm": 4.363190174102783,
"learning_rate": 1.9477434679334917e-07,
"loss": 0.2414,
"step": 9275
},
{
"epoch": 1.983753740914921,
"grad_norm": 4.239806175231934,
"learning_rate": 1.828978622327791e-07,
"loss": 0.2724,
"step": 9280
},
{
"epoch": 1.9848225737494656,
"grad_norm": 3.087779998779297,
"learning_rate": 1.7102137767220902e-07,
"loss": 0.2338,
"step": 9285
},
{
"epoch": 1.9858914065840103,
"grad_norm": 5.1465277671813965,
"learning_rate": 1.59144893111639e-07,
"loss": 0.2606,
"step": 9290
},
{
"epoch": 1.986960239418555,
"grad_norm": 3.789433240890503,
"learning_rate": 1.4726840855106888e-07,
"loss": 0.2747,
"step": 9295
},
{
"epoch": 1.9880290722530995,
"grad_norm": 3.880868673324585,
"learning_rate": 1.3539192399049882e-07,
"loss": 0.1792,
"step": 9300
},
{
"epoch": 1.9890979050876443,
"grad_norm": 4.200949668884277,
"learning_rate": 1.2351543942992876e-07,
"loss": 0.2479,
"step": 9305
},
{
"epoch": 1.990166737922189,
"grad_norm": 4.372617721557617,
"learning_rate": 1.1163895486935867e-07,
"loss": 0.2554,
"step": 9310
},
{
"epoch": 1.9912355707567335,
"grad_norm": 3.7008919715881348,
"learning_rate": 9.97624703087886e-08,
"loss": 0.25,
"step": 9315
},
{
"epoch": 1.9923044035912785,
"grad_norm": 3.9479458332061768,
"learning_rate": 8.788598574821854e-08,
"loss": 0.2814,
"step": 9320
},
{
"epoch": 1.993373236425823,
"grad_norm": 4.310093402862549,
"learning_rate": 7.600950118764846e-08,
"loss": 0.2102,
"step": 9325
},
{
"epoch": 1.9944420692603677,
"grad_norm": 3.808363199234009,
"learning_rate": 6.41330166270784e-08,
"loss": 0.2466,
"step": 9330
},
{
"epoch": 1.9955109020949124,
"grad_norm": 4.076649188995361,
"learning_rate": 5.225653206650832e-08,
"loss": 0.2547,
"step": 9335
},
{
"epoch": 1.996579734929457,
"grad_norm": 3.773390531539917,
"learning_rate": 4.0380047505938245e-08,
"loss": 0.2216,
"step": 9340
},
{
"epoch": 1.9976485677640017,
"grad_norm": 3.149965286254883,
"learning_rate": 2.8503562945368176e-08,
"loss": 0.2521,
"step": 9345
},
{
"epoch": 1.9987174005985464,
"grad_norm": 3.375763177871704,
"learning_rate": 1.66270783847981e-08,
"loss": 0.2558,
"step": 9350
},
{
"epoch": 1.999786233433091,
"grad_norm": 5.134764194488525,
"learning_rate": 4.7505938242280285e-09,
"loss": 0.2345,
"step": 9355
},
{
"epoch": 2.0,
"eval_loss": 0.12192188948392868,
"eval_mrr": 0.9798825256975033,
"eval_runtime": 315.6223,
"eval_samples_per_second": 7.192,
"eval_steps_per_second": 0.9,
"step": 9356
}
],
"logging_steps": 5,
"max_steps": 9356,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}