{
"best_global_step": 3300,
"best_metric": 0.32621017,
"best_model_checkpoint": "/mnt/shared-storage-user/mineru4s/jcwang/VPLT/outputs/checkpoints/29_lr2e-5_bs128_e1_VLT_TT_vp_ib09_1m_full/v0-20251204-195443/checkpoint-3300",
"epoch": 1.0,
"eval_steps": 100,
"global_step": 7806,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012810658467845247,
"grad_norm": 61.75,
"learning_rate": 5.115089514066497e-08,
"loss": 1.7527258396148682,
"step": 1,
"token_acc": 0.6929900475984422
},
{
"epoch": 0.0006405329233922624,
"grad_norm": 73.0,
"learning_rate": 2.5575447570332484e-07,
"loss": 1.7571117877960205,
"step": 5,
"token_acc": 0.6861388813834098
},
{
"epoch": 0.0012810658467845247,
"grad_norm": 58.5,
"learning_rate": 5.115089514066497e-07,
"loss": 1.7617622375488282,
"step": 10,
"token_acc": 0.6896253352946267
},
{
"epoch": 0.001921598770176787,
"grad_norm": 71.0,
"learning_rate": 7.672634271099745e-07,
"loss": 1.7400665283203125,
"step": 15,
"token_acc": 0.6863151530854601
},
{
"epoch": 0.0025621316935690495,
"grad_norm": 53.5,
"learning_rate": 1.0230179028132994e-06,
"loss": 1.7050804138183593,
"step": 20,
"token_acc": 0.6901353798396137
},
{
"epoch": 0.003202664616961312,
"grad_norm": 54.25,
"learning_rate": 1.2787723785166241e-06,
"loss": 1.6199134826660155,
"step": 25,
"token_acc": 0.7034818581401362
},
{
"epoch": 0.003843197540353574,
"grad_norm": 47.25,
"learning_rate": 1.534526854219949e-06,
"loss": 1.4921659469604491,
"step": 30,
"token_acc": 0.7096607764736231
},
{
"epoch": 0.004483730463745837,
"grad_norm": 43.25,
"learning_rate": 1.7902813299232737e-06,
"loss": 1.3593175888061524,
"step": 35,
"token_acc": 0.71506187745246
},
{
"epoch": 0.005124263387138099,
"grad_norm": 30.25,
"learning_rate": 2.0460358056265987e-06,
"loss": 1.1928886413574218,
"step": 40,
"token_acc": 0.7095462606514122
},
{
"epoch": 0.005764796310530361,
"grad_norm": 61.0,
"learning_rate": 2.3017902813299235e-06,
"loss": 1.000108528137207,
"step": 45,
"token_acc": 0.6974425102084677
},
{
"epoch": 0.006405329233922624,
"grad_norm": 47.75,
"learning_rate": 2.5575447570332483e-06,
"loss": 0.8234397888183593,
"step": 50,
"token_acc": 0.7172779381976468
},
{
"epoch": 0.007045862157314886,
"grad_norm": 15.0625,
"learning_rate": 2.813299232736573e-06,
"loss": 0.6950876235961914,
"step": 55,
"token_acc": 0.7412205198829402
},
{
"epoch": 0.007686395080707148,
"grad_norm": 9.6875,
"learning_rate": 3.069053708439898e-06,
"loss": 0.6445055961608886,
"step": 60,
"token_acc": 0.7602104627593048
},
{
"epoch": 0.00832692800409941,
"grad_norm": 4.46875,
"learning_rate": 3.3248081841432226e-06,
"loss": 0.6182816982269287,
"step": 65,
"token_acc": 0.7633587786259542
},
{
"epoch": 0.008967460927491674,
"grad_norm": 4.21875,
"learning_rate": 3.5805626598465474e-06,
"loss": 0.6172842979431152,
"step": 70,
"token_acc": 0.7656499417576255
},
{
"epoch": 0.009607993850883935,
"grad_norm": 3.359375,
"learning_rate": 3.836317135549873e-06,
"loss": 0.6099654197692871,
"step": 75,
"token_acc": 0.7649229712912501
},
{
"epoch": 0.010248526774276198,
"grad_norm": 3.265625,
"learning_rate": 4.092071611253197e-06,
"loss": 0.6063261985778808,
"step": 80,
"token_acc": 0.7644379511859746
},
{
"epoch": 0.01088905969766846,
"grad_norm": 6.34375,
"learning_rate": 4.347826086956522e-06,
"loss": 0.5800480842590332,
"step": 85,
"token_acc": 0.7750280729031701
},
{
"epoch": 0.011529592621060722,
"grad_norm": 4.03125,
"learning_rate": 4.603580562659847e-06,
"loss": 0.5782370567321777,
"step": 90,
"token_acc": 0.7780650721827193
},
{
"epoch": 0.012170125544452985,
"grad_norm": 21.125,
"learning_rate": 4.859335038363172e-06,
"loss": 0.5692886352539063,
"step": 95,
"token_acc": 0.7785625080745877
},
{
"epoch": 0.012810658467845248,
"grad_norm": 4.03125,
"learning_rate": 5.1150895140664966e-06,
"loss": 0.5636235237121582,
"step": 100,
"token_acc": 0.7817540539378037
},
{
"epoch": 0.012810658467845248,
"eval_loss": 0.5616942644119263,
"eval_runtime": 109.4288,
"eval_samples_per_second": 91.384,
"eval_steps_per_second": 11.423,
"eval_token_acc": 0.782156125595894,
"step": 100
},
{
"epoch": 0.013451191391237509,
"grad_norm": 4.34375,
"learning_rate": 5.370843989769821e-06,
"loss": 0.5511586189270019,
"step": 105,
"token_acc": 0.7842739323805
},
{
"epoch": 0.014091724314629772,
"grad_norm": 7.34375,
"learning_rate": 5.626598465473146e-06,
"loss": 0.5501980781555176,
"step": 110,
"token_acc": 0.7883570504527814
},
{
"epoch": 0.014732257238022035,
"grad_norm": 4.0,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5424150466918946,
"step": 115,
"token_acc": 0.7874757908327954
},
{
"epoch": 0.015372790161414296,
"grad_norm": 4.9375,
"learning_rate": 6.138107416879796e-06,
"loss": 0.5345050811767578,
"step": 120,
"token_acc": 0.7919321508524195
},
{
"epoch": 0.01601332308480656,
"grad_norm": 3.9375,
"learning_rate": 6.3938618925831205e-06,
"loss": 0.5287456512451172,
"step": 125,
"token_acc": 0.7929382311045884
},
{
"epoch": 0.01665385600819882,
"grad_norm": 4.1875,
"learning_rate": 6.649616368286445e-06,
"loss": 0.5286868572235107,
"step": 130,
"token_acc": 0.7934768540489235
},
{
"epoch": 0.017294388931591083,
"grad_norm": 6.25,
"learning_rate": 6.90537084398977e-06,
"loss": 0.5210060119628906,
"step": 135,
"token_acc": 0.7963354171157577
},
{
"epoch": 0.017934921854983348,
"grad_norm": 5.0625,
"learning_rate": 7.161125319693095e-06,
"loss": 0.5186363697052002,
"step": 140,
"token_acc": 0.7987665502221072
},
{
"epoch": 0.01857545477837561,
"grad_norm": 6.09375,
"learning_rate": 7.41687979539642e-06,
"loss": 0.5118862152099609,
"step": 145,
"token_acc": 0.7989200863930885
},
{
"epoch": 0.01921598770176787,
"grad_norm": 6.15625,
"learning_rate": 7.672634271099745e-06,
"loss": 0.5256869316101074,
"step": 150,
"token_acc": 0.795885056483828
},
{
"epoch": 0.019856520625160134,
"grad_norm": 4.25,
"learning_rate": 7.92838874680307e-06,
"loss": 0.5057379722595214,
"step": 155,
"token_acc": 0.8006816514948876
},
{
"epoch": 0.020497053548552396,
"grad_norm": 9.9375,
"learning_rate": 8.184143222506395e-06,
"loss": 0.49903292655944825,
"step": 160,
"token_acc": 0.8046234796860174
},
{
"epoch": 0.021137586471944657,
"grad_norm": 6.78125,
"learning_rate": 8.43989769820972e-06,
"loss": 0.5005066871643067,
"step": 165,
"token_acc": 0.8053670973596647
},
{
"epoch": 0.02177811939533692,
"grad_norm": 5.46875,
"learning_rate": 8.695652173913044e-06,
"loss": 0.4884012222290039,
"step": 170,
"token_acc": 0.8094040079812613
},
{
"epoch": 0.022418652318729183,
"grad_norm": 7.90625,
"learning_rate": 8.95140664961637e-06,
"loss": 0.4938325881958008,
"step": 175,
"token_acc": 0.806672997237569
},
{
"epoch": 0.023059185242121444,
"grad_norm": 9.9375,
"learning_rate": 9.207161125319694e-06,
"loss": 0.5015275478363037,
"step": 180,
"token_acc": 0.8040328474998926
},
{
"epoch": 0.02369971816551371,
"grad_norm": 6.59375,
"learning_rate": 9.462915601023019e-06,
"loss": 0.4769923686981201,
"step": 185,
"token_acc": 0.8167363295557375
},
{
"epoch": 0.02434025108890597,
"grad_norm": 8.875,
"learning_rate": 9.718670076726344e-06,
"loss": 0.48226518630981446,
"step": 190,
"token_acc": 0.8120698554714384
},
{
"epoch": 0.02498078401229823,
"grad_norm": 5.875,
"learning_rate": 9.974424552429668e-06,
"loss": 0.48815107345581055,
"step": 195,
"token_acc": 0.8090380890897353
},
{
"epoch": 0.025621316935690495,
"grad_norm": 8.1875,
"learning_rate": 1.0230179028132993e-05,
"loss": 0.4772751808166504,
"step": 200,
"token_acc": 0.816347690845466
},
{
"epoch": 0.025621316935690495,
"eval_loss": 0.47741714119911194,
"eval_runtime": 103.9123,
"eval_samples_per_second": 96.235,
"eval_steps_per_second": 12.029,
"eval_token_acc": 0.8152050539557392,
"step": 200
},
{
"epoch": 0.026261849859082757,
"grad_norm": 8.9375,
"learning_rate": 1.0485933503836318e-05,
"loss": 0.47005443572998046,
"step": 205,
"token_acc": 0.8176898432764742
},
{
"epoch": 0.026902382782475018,
"grad_norm": 10.0625,
"learning_rate": 1.0741687979539643e-05,
"loss": 0.466142463684082,
"step": 210,
"token_acc": 0.818608860541286
},
{
"epoch": 0.027542915705867282,
"grad_norm": 7.875,
"learning_rate": 1.0997442455242967e-05,
"loss": 0.4647233963012695,
"step": 215,
"token_acc": 0.8217214883881551
},
{
"epoch": 0.028183448629259544,
"grad_norm": 23.125,
"learning_rate": 1.1253196930946292e-05,
"loss": 0.46241116523742676,
"step": 220,
"token_acc": 0.8234608913240433
},
{
"epoch": 0.028823981552651805,
"grad_norm": 10.9375,
"learning_rate": 1.1508951406649617e-05,
"loss": 0.4518951416015625,
"step": 225,
"token_acc": 0.8269572375546986
},
{
"epoch": 0.02946451447604407,
"grad_norm": 16.875,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.44137048721313477,
"step": 230,
"token_acc": 0.8298306556665219
},
{
"epoch": 0.03010504739943633,
"grad_norm": 12.4375,
"learning_rate": 1.2020460358056267e-05,
"loss": 0.453232479095459,
"step": 235,
"token_acc": 0.8256595964821521
},
{
"epoch": 0.030745580322828592,
"grad_norm": 8.0,
"learning_rate": 1.2276214833759591e-05,
"loss": 0.4504352569580078,
"step": 240,
"token_acc": 0.8276514337302782
},
{
"epoch": 0.031386113246220856,
"grad_norm": 16.25,
"learning_rate": 1.2531969309462916e-05,
"loss": 0.43747830390930176,
"step": 245,
"token_acc": 0.829782636878268
},
{
"epoch": 0.03202664616961312,
"grad_norm": 6.96875,
"learning_rate": 1.2787723785166241e-05,
"loss": 0.4497882843017578,
"step": 250,
"token_acc": 0.8272813524236674
},
{
"epoch": 0.03266717909300538,
"grad_norm": 7.25,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.4439809322357178,
"step": 255,
"token_acc": 0.8280041258380608
},
{
"epoch": 0.03330771201639764,
"grad_norm": 12.375,
"learning_rate": 1.329923273657289e-05,
"loss": 0.4415604591369629,
"step": 260,
"token_acc": 0.8288237828522189
},
{
"epoch": 0.03394824493978991,
"grad_norm": 9.1875,
"learning_rate": 1.3554987212276215e-05,
"loss": 0.4370439529418945,
"step": 265,
"token_acc": 0.8311521132804119
},
{
"epoch": 0.034588777863182166,
"grad_norm": 8.5625,
"learning_rate": 1.381074168797954e-05,
"loss": 0.4249903678894043,
"step": 270,
"token_acc": 0.8351054633471646
},
{
"epoch": 0.03522931078657443,
"grad_norm": 7.65625,
"learning_rate": 1.4066496163682865e-05,
"loss": 0.42871723175048826,
"step": 275,
"token_acc": 0.8337790045717243
},
{
"epoch": 0.035869843709966695,
"grad_norm": 17.25,
"learning_rate": 1.432225063938619e-05,
"loss": 0.42778358459472654,
"step": 280,
"token_acc": 0.8345190359160092
},
{
"epoch": 0.03651037663335895,
"grad_norm": 5.40625,
"learning_rate": 1.4578005115089514e-05,
"loss": 0.42468814849853515,
"step": 285,
"token_acc": 0.8370123979437557
},
{
"epoch": 0.03715090955675122,
"grad_norm": 9.3125,
"learning_rate": 1.483375959079284e-05,
"loss": 0.42493228912353515,
"step": 290,
"token_acc": 0.8359849954727719
},
{
"epoch": 0.03779144248014348,
"grad_norm": 52.25,
"learning_rate": 1.5089514066496164e-05,
"loss": 0.42238712310791016,
"step": 295,
"token_acc": 0.8344741486934435
},
{
"epoch": 0.03843197540353574,
"grad_norm": 12.4375,
"learning_rate": 1.534526854219949e-05,
"loss": 0.4189589023590088,
"step": 300,
"token_acc": 0.8357866481946489
},
{
"epoch": 0.03843197540353574,
"eval_loss": 0.42732954025268555,
"eval_runtime": 102.1151,
"eval_samples_per_second": 97.929,
"eval_steps_per_second": 12.241,
"eval_token_acc": 0.8347027589681691,
"step": 300
},
{
"epoch": 0.039072508326928004,
"grad_norm": 8.375,
"learning_rate": 1.5601023017902815e-05,
"loss": 0.4307071685791016,
"step": 305,
"token_acc": 0.8350831713112984
},
{
"epoch": 0.03971304125032027,
"grad_norm": 9.25,
"learning_rate": 1.585677749360614e-05,
"loss": 0.41645016670227053,
"step": 310,
"token_acc": 0.8390616240458838
},
{
"epoch": 0.04035357417371253,
"grad_norm": 239.0,
"learning_rate": 1.6112531969309465e-05,
"loss": 0.42703800201416015,
"step": 315,
"token_acc": 0.8354931760451199
},
{
"epoch": 0.04099410709710479,
"grad_norm": 10.6875,
"learning_rate": 1.636828644501279e-05,
"loss": 0.41779098510742185,
"step": 320,
"token_acc": 0.8380400467067423
},
{
"epoch": 0.041634640020497056,
"grad_norm": 7.03125,
"learning_rate": 1.6624040920716114e-05,
"loss": 0.418929386138916,
"step": 325,
"token_acc": 0.8373385012919896
},
{
"epoch": 0.042275172943889314,
"grad_norm": 10.0,
"learning_rate": 1.687979539641944e-05,
"loss": 0.4039362907409668,
"step": 330,
"token_acc": 0.8435945139099208
},
{
"epoch": 0.04291570586728158,
"grad_norm": 8.375,
"learning_rate": 1.7135549872122764e-05,
"loss": 0.4099921226501465,
"step": 335,
"token_acc": 0.8431601226728866
},
{
"epoch": 0.04355623879067384,
"grad_norm": 13.5,
"learning_rate": 1.739130434782609e-05,
"loss": 0.3967348575592041,
"step": 340,
"token_acc": 0.8471802400898177
},
{
"epoch": 0.0441967717140661,
"grad_norm": 26.0,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.4075979709625244,
"step": 345,
"token_acc": 0.8455312553814363
},
{
"epoch": 0.044837304637458365,
"grad_norm": 11.4375,
"learning_rate": 1.790281329923274e-05,
"loss": 0.4026969909667969,
"step": 350,
"token_acc": 0.8446702255898054
},
{
"epoch": 0.04547783756085063,
"grad_norm": 7.28125,
"learning_rate": 1.8158567774936063e-05,
"loss": 0.4052872657775879,
"step": 355,
"token_acc": 0.8408119196717772
},
{
"epoch": 0.04611837048424289,
"grad_norm": 7.0,
"learning_rate": 1.8414322250639388e-05,
"loss": 0.3986091136932373,
"step": 360,
"token_acc": 0.8454004142216086
},
{
"epoch": 0.04675890340763515,
"grad_norm": 7.0625,
"learning_rate": 1.8670076726342713e-05,
"loss": 0.4026648044586182,
"step": 365,
"token_acc": 0.844288421778084
},
{
"epoch": 0.04739943633102742,
"grad_norm": 8.9375,
"learning_rate": 1.8925831202046038e-05,
"loss": 0.38652160167694094,
"step": 370,
"token_acc": 0.851685393258427
},
{
"epoch": 0.048039969254419675,
"grad_norm": 35.0,
"learning_rate": 1.9181585677749362e-05,
"loss": 0.3907599687576294,
"step": 375,
"token_acc": 0.8490174010908147
},
{
"epoch": 0.04868050217781194,
"grad_norm": 8.25,
"learning_rate": 1.9437340153452687e-05,
"loss": 0.39287233352661133,
"step": 380,
"token_acc": 0.8511720096518441
},
{
"epoch": 0.049321035101204204,
"grad_norm": 12.25,
"learning_rate": 1.9693094629156012e-05,
"loss": 0.3889561653137207,
"step": 385,
"token_acc": 0.848631170510886
},
{
"epoch": 0.04996156802459646,
"grad_norm": 8.9375,
"learning_rate": 1.9948849104859337e-05,
"loss": 0.38450467586517334,
"step": 390,
"token_acc": 0.8524625544956188
},
{
"epoch": 0.050602100947988726,
"grad_norm": 11.3125,
"learning_rate": 1.999998563957419e-05,
"loss": 0.39484443664550783,
"step": 395,
"token_acc": 0.8477848646785037
},
{
"epoch": 0.05124263387138099,
"grad_norm": 14.5,
"learning_rate": 1.9999927300415016e-05,
"loss": 0.3870053768157959,
"step": 400,
"token_acc": 0.8513297986982198
},
{
"epoch": 0.05124263387138099,
"eval_loss": 0.40171390771865845,
"eval_runtime": 107.3275,
"eval_samples_per_second": 93.173,
"eval_steps_per_second": 11.647,
"eval_token_acc": 0.8490457391853209,
"step": 400
},
{
"epoch": 0.05188316679477325,
"grad_norm": 7.28125,
"learning_rate": 1.9999824085257465e-05,
"loss": 0.3954286575317383,
"step": 405,
"token_acc": 0.8484640466792518
},
{
"epoch": 0.05252369971816551,
"grad_norm": 8.9375,
"learning_rate": 1.9999675994564737e-05,
"loss": 0.3951833248138428,
"step": 410,
"token_acc": 0.8472842901340667
},
{
"epoch": 0.05316423264155778,
"grad_norm": 8.5625,
"learning_rate": 1.99994830290014e-05,
"loss": 0.3877572536468506,
"step": 415,
"token_acc": 0.8527802903045183
},
{
"epoch": 0.053804765564950036,
"grad_norm": 6.3125,
"learning_rate": 1.999924518943342e-05,
"loss": 0.3790754318237305,
"step": 420,
"token_acc": 0.8536078906385188
},
{
"epoch": 0.0544452984883423,
"grad_norm": 7.21875,
"learning_rate": 1.999896247692813e-05,
"loss": 0.37895793914794923,
"step": 425,
"token_acc": 0.8564549713690786
},
{
"epoch": 0.055085831411734565,
"grad_norm": 7.625,
"learning_rate": 1.999863489275424e-05,
"loss": 0.3699763774871826,
"step": 430,
"token_acc": 0.8588138812154696
},
{
"epoch": 0.05572636433512682,
"grad_norm": 6.1875,
"learning_rate": 1.9998262438381828e-05,
"loss": 0.3807647228240967,
"step": 435,
"token_acc": 0.8515137160329013
},
{
"epoch": 0.05636689725851909,
"grad_norm": 4.875,
"learning_rate": 1.9997845115482334e-05,
"loss": 0.37743220329284666,
"step": 440,
"token_acc": 0.853497694064911
},
{
"epoch": 0.05700743018191135,
"grad_norm": 11.0,
"learning_rate": 1.9997382925928544e-05,
"loss": 0.36346435546875,
"step": 445,
"token_acc": 0.8598952244880288
},
{
"epoch": 0.05764796310530361,
"grad_norm": 6.0625,
"learning_rate": 1.99968758717946e-05,
"loss": 0.36632614135742186,
"step": 450,
"token_acc": 0.8593689131281652
},
{
"epoch": 0.058288496028695874,
"grad_norm": 22.375,
"learning_rate": 1.9996323955355972e-05,
"loss": 0.38116629123687745,
"step": 455,
"token_acc": 0.8530322580645161
},
{
"epoch": 0.05892902895208814,
"grad_norm": 8.9375,
"learning_rate": 1.9995727179089463e-05,
"loss": 0.3787653684616089,
"step": 460,
"token_acc": 0.8553301683211049
},
{
"epoch": 0.0595695618754804,
"grad_norm": 5.71875,
"learning_rate": 1.9995085545673177e-05,
"loss": 0.37586026191711425,
"step": 465,
"token_acc": 0.8558023415977961
},
{
"epoch": 0.06021009479887266,
"grad_norm": 8.5625,
"learning_rate": 1.9994399057986537e-05,
"loss": 0.36600193977355955,
"step": 470,
"token_acc": 0.8605023127134397
},
{
"epoch": 0.060850627722264926,
"grad_norm": 21.25,
"learning_rate": 1.9993667719110245e-05,
"loss": 0.37952864170074463,
"step": 475,
"token_acc": 0.8555727954486683
},
{
"epoch": 0.061491160645657184,
"grad_norm": 16.0,
"learning_rate": 1.9992891532326277e-05,
"loss": 0.379518985748291,
"step": 480,
"token_acc": 0.8553931082071851
},
{
"epoch": 0.06213169356904945,
"grad_norm": 7.0,
"learning_rate": 1.9992070501117877e-05,
"loss": 0.3733321189880371,
"step": 485,
"token_acc": 0.8571675153188919
},
{
"epoch": 0.06277222649244171,
"grad_norm": 11.5,
"learning_rate": 1.9991204629169534e-05,
"loss": 0.36601009368896487,
"step": 490,
"token_acc": 0.8613763013521103
},
{
"epoch": 0.06341275941583398,
"grad_norm": 6.59375,
"learning_rate": 1.9990293920366957e-05,
"loss": 0.3734764814376831,
"step": 495,
"token_acc": 0.8549542551355084
},
{
"epoch": 0.06405329233922624,
"grad_norm": 446.0,
"learning_rate": 1.998933837879708e-05,
"loss": 0.3742378234863281,
"step": 500,
"token_acc": 0.8572477856988551
},
{
"epoch": 0.06405329233922624,
"eval_loss": 0.3812016248703003,
"eval_runtime": 102.75,
"eval_samples_per_second": 97.324,
"eval_steps_per_second": 12.165,
"eval_token_acc": 0.8565535875445017,
"step": 500
},
{
"epoch": 0.06469382526261849,
"grad_norm": 9.875,
"learning_rate": 1.998833800874802e-05,
"loss": 0.377778148651123,
"step": 505,
"token_acc": 0.8549496860755139
},
{
"epoch": 0.06533435818601076,
"grad_norm": 8.5,
"learning_rate": 1.9987292814709064e-05,
"loss": 0.36702466011047363,
"step": 510,
"token_acc": 0.8602703052808843
},
{
"epoch": 0.06597489110940302,
"grad_norm": 8.3125,
"learning_rate": 1.9986202801370665e-05,
"loss": 0.3668750047683716,
"step": 515,
"token_acc": 0.8610121474868009
},
{
"epoch": 0.06661542403279529,
"grad_norm": 5.5,
"learning_rate": 1.9985067973624402e-05,
"loss": 0.368256139755249,
"step": 520,
"token_acc": 0.8598641210870313
},
{
"epoch": 0.06725595695618755,
"grad_norm": 6.1875,
"learning_rate": 1.9983888336562962e-05,
"loss": 0.3637028694152832,
"step": 525,
"token_acc": 0.8616238543952498
},
{
"epoch": 0.06789648987957982,
"grad_norm": 8.8125,
"learning_rate": 1.9982663895480125e-05,
"loss": 0.3575170040130615,
"step": 530,
"token_acc": 0.8624665342430262
},
{
"epoch": 0.06853702280297207,
"grad_norm": 4.46875,
"learning_rate": 1.9981394655870728e-05,
"loss": 0.3676267147064209,
"step": 535,
"token_acc": 0.8586340206185566
},
{
"epoch": 0.06917755572636433,
"grad_norm": 5.28125,
"learning_rate": 1.998008062343066e-05,
"loss": 0.3628795385360718,
"step": 540,
"token_acc": 0.8599716507022894
},
{
"epoch": 0.0698180886497566,
"grad_norm": 4.40625,
"learning_rate": 1.9978721804056806e-05,
"loss": 0.351765513420105,
"step": 545,
"token_acc": 0.8640237603305785
},
{
"epoch": 0.07045862157314886,
"grad_norm": 7.4375,
"learning_rate": 1.9977318203847056e-05,
"loss": 0.35065426826477053,
"step": 550,
"token_acc": 0.864151596435061
},
{
"epoch": 0.07109915449654113,
"grad_norm": 5.53125,
"learning_rate": 1.9975869829100248e-05,
"loss": 0.3636244535446167,
"step": 555,
"token_acc": 0.8597329888027563
},
{
"epoch": 0.07173968741993339,
"grad_norm": 5.34375,
"learning_rate": 1.9974376686316158e-05,
"loss": 0.3594621181488037,
"step": 560,
"token_acc": 0.8631669907107367
},
{
"epoch": 0.07238022034332564,
"grad_norm": 4.78125,
"learning_rate": 1.9972838782195455e-05,
"loss": 0.36011404991149903,
"step": 565,
"token_acc": 0.8637501078609026
},
{
"epoch": 0.0730207532667179,
"grad_norm": 4.90625,
"learning_rate": 1.99712561236397e-05,
"loss": 0.36135361194610593,
"step": 570,
"token_acc": 0.8631270470608515
},
{
"epoch": 0.07366128619011017,
"grad_norm": 13.3125,
"learning_rate": 1.9969628717751267e-05,
"loss": 0.3561633825302124,
"step": 575,
"token_acc": 0.8632323755285184
},
{
"epoch": 0.07430181911350243,
"grad_norm": 4.59375,
"learning_rate": 1.9967956571833375e-05,
"loss": 0.347505521774292,
"step": 580,
"token_acc": 0.8662591581046517
},
{
"epoch": 0.0749423520368947,
"grad_norm": 5.90625,
"learning_rate": 1.9966239693389982e-05,
"loss": 0.3540546417236328,
"step": 585,
"token_acc": 0.8638228055783429
},
{
"epoch": 0.07558288496028696,
"grad_norm": 6.28125,
"learning_rate": 1.9964478090125815e-05,
"loss": 0.33905773162841796,
"step": 590,
"token_acc": 0.8724406047516199
},
{
"epoch": 0.07622341788367921,
"grad_norm": 5.125,
"learning_rate": 1.9962671769946303e-05,
"loss": 0.3554720401763916,
"step": 595,
"token_acc": 0.8650370115338268
},
{
"epoch": 0.07686395080707148,
"grad_norm": 16.25,
"learning_rate": 1.9960820740957546e-05,
"loss": 0.3572436094284058,
"step": 600,
"token_acc": 0.8659878327652414
},
{
"epoch": 0.07686395080707148,
"eval_loss": 0.36843228340148926,
"eval_runtime": 102.1933,
"eval_samples_per_second": 97.854,
"eval_steps_per_second": 12.232,
"eval_token_acc": 0.8618771835602482,
"step": 600
},
{
"epoch": 0.07750448373046374,
"grad_norm": 5.59375,
"learning_rate": 1.9958925011466283e-05,
"loss": 0.3591419458389282,
"step": 605,
"token_acc": 0.8607949646490775
},
{
"epoch": 0.07814501665385601,
"grad_norm": 7.90625,
"learning_rate": 1.9956984589979846e-05,
"loss": 0.3471505165100098,
"step": 610,
"token_acc": 0.8661698828394211
},
{
"epoch": 0.07878554957724827,
"grad_norm": 6.96875,
"learning_rate": 1.9954999485206143e-05,
"loss": 0.34771528244018557,
"step": 615,
"token_acc": 0.8679774069762428
},
{
"epoch": 0.07942608250064054,
"grad_norm": 15.4375,
"learning_rate": 1.9952969706053585e-05,
"loss": 0.35360991954803467,
"step": 620,
"token_acc": 0.8646970023722235
},
{
"epoch": 0.08006661542403279,
"grad_norm": 5.59375,
"learning_rate": 1.995089526163108e-05,
"loss": 0.3377622127532959,
"step": 625,
"token_acc": 0.8723486808070356
},
{
"epoch": 0.08070714834742505,
"grad_norm": 4.25,
"learning_rate": 1.994877616124797e-05,
"loss": 0.3453701019287109,
"step": 630,
"token_acc": 0.8684722042244396
},
{
"epoch": 0.08134768127081732,
"grad_norm": 10.5625,
"learning_rate": 1.9946612414414003e-05,
"loss": 0.35302703380584716,
"step": 635,
"token_acc": 0.8653087478559177
},
{
"epoch": 0.08198821419420958,
"grad_norm": 18.875,
"learning_rate": 1.9944404030839273e-05,
"loss": 0.3411895513534546,
"step": 640,
"token_acc": 0.8682500752979648
},
{
"epoch": 0.08262874711760185,
"grad_norm": 7.09375,
"learning_rate": 1.99421510204342e-05,
"loss": 0.34150052070617676,
"step": 645,
"token_acc": 0.8713020295837633
},
{
"epoch": 0.08326928004099411,
"grad_norm": 4.09375,
"learning_rate": 1.993985339330946e-05,
"loss": 0.3460074424743652,
"step": 650,
"token_acc": 0.8699866454142076
},
{
"epoch": 0.08390981296438636,
"grad_norm": 6.5,
"learning_rate": 1.993751115977596e-05,
"loss": 0.338161039352417,
"step": 655,
"token_acc": 0.8716904276985743
},
{
"epoch": 0.08455034588777863,
"grad_norm": 4.59375,
"learning_rate": 1.993512433034479e-05,
"loss": 0.34201898574829104,
"step": 660,
"token_acc": 0.8709761050857711
},
{
"epoch": 0.08519087881117089,
"grad_norm": 4.53125,
"learning_rate": 1.993269291572716e-05,
"loss": 0.33865838050842284,
"step": 665,
"token_acc": 0.8712311015118791
},
{
"epoch": 0.08583141173456316,
"grad_norm": 5.40625,
"learning_rate": 1.9930216926834366e-05,
"loss": 0.3336113691329956,
"step": 670,
"token_acc": 0.8717793270688088
},
{
"epoch": 0.08647194465795542,
"grad_norm": 6.3125,
"learning_rate": 1.992769637477773e-05,
"loss": 0.3334836959838867,
"step": 675,
"token_acc": 0.8705119896305897
},
{
"epoch": 0.08711247758134769,
"grad_norm": 6.21875,
"learning_rate": 1.9925131270868568e-05,
"loss": 0.34505319595336914,
"step": 680,
"token_acc": 0.8677537009225488
},
{
"epoch": 0.08775301050473995,
"grad_norm": 8.375,
"learning_rate": 1.9922521626618127e-05,
"loss": 0.34624500274658204,
"step": 685,
"token_acc": 0.8689443941158708
},
{
"epoch": 0.0883935434281322,
"grad_norm": 7.125,
"learning_rate": 1.9919867453737524e-05,
"loss": 0.34455955028533936,
"step": 690,
"token_acc": 0.8656286291883521
},
{
"epoch": 0.08903407635152447,
"grad_norm": 6.0,
"learning_rate": 1.9917168764137718e-05,
"loss": 0.3470313549041748,
"step": 695,
"token_acc": 0.8690256366212908
},
{
"epoch": 0.08967460927491673,
"grad_norm": 4.6875,
"learning_rate": 1.991442556992943e-05,
"loss": 0.3429619312286377,
"step": 700,
"token_acc": 0.8695521102497846
},
{
"epoch": 0.08967460927491673,
"eval_loss": 0.35823777318000793,
"eval_runtime": 102.8638,
"eval_samples_per_second": 97.216,
"eval_steps_per_second": 12.152,
"eval_token_acc": 0.8654677732806972,
"step": 700
},
{
"epoch": 0.090315142198309,
"grad_norm": 5.3125,
"learning_rate": 1.9911637883423115e-05,
"loss": 0.3434779167175293,
"step": 705,
"token_acc": 0.8690055962117951
},
{
"epoch": 0.09095567512170126,
"grad_norm": 4.84375,
"learning_rate": 1.9908805717128876e-05,
"loss": 0.3410910129547119,
"step": 710,
"token_acc": 0.8684765305683432
},
{
"epoch": 0.09159620804509352,
"grad_norm": 4.25,
"learning_rate": 1.9905929083756442e-05,
"loss": 0.34179034233093264,
"step": 715,
"token_acc": 0.8675887624956912
},
{
"epoch": 0.09223674096848578,
"grad_norm": 10.5,
"learning_rate": 1.990300799621508e-05,
"loss": 0.34283981323242185,
"step": 720,
"token_acc": 0.8702763191873978
},
{
"epoch": 0.09287727389187804,
"grad_norm": 5.5625,
"learning_rate": 1.9900042467613562e-05,
"loss": 0.34240546226501467,
"step": 725,
"token_acc": 0.8690830636461705
},
{
"epoch": 0.0935178068152703,
"grad_norm": 5.34375,
"learning_rate": 1.9897032511260092e-05,
"loss": 0.34098148345947266,
"step": 730,
"token_acc": 0.8700211452984077
},
{
"epoch": 0.09415833973866257,
"grad_norm": 3.890625,
"learning_rate": 1.989397814066224e-05,
"loss": 0.32979321479797363,
"step": 735,
"token_acc": 0.8732680105322226
},
{
"epoch": 0.09479887266205483,
"grad_norm": 3.6875,
"learning_rate": 1.9890879369526907e-05,
"loss": 0.33590106964111327,
"step": 740,
"token_acc": 0.868538938662991
},
{
"epoch": 0.0954394055854471,
"grad_norm": 4.3125,
"learning_rate": 1.9887736211760237e-05,
"loss": 0.33802223205566406,
"step": 745,
"token_acc": 0.8701846511427711
},
{
"epoch": 0.09607993850883935,
"grad_norm": 8.125,
"learning_rate": 1.9884548681467565e-05,
"loss": 0.3298491477966309,
"step": 750,
"token_acc": 0.8728583142721505
},
{
"epoch": 0.09672047143223161,
"grad_norm": 8.1875,
"learning_rate": 1.9881316792953352e-05,
"loss": 0.34202146530151367,
"step": 755,
"token_acc": 0.8698966408268733
},
{
"epoch": 0.09736100435562388,
"grad_norm": 4.6875,
"learning_rate": 1.987804056072113e-05,
"loss": 0.3250537872314453,
"step": 760,
"token_acc": 0.8748375920311823
},
{
"epoch": 0.09800153727901614,
"grad_norm": 6.0,
"learning_rate": 1.987471999947343e-05,
"loss": 0.34002318382263186,
"step": 765,
"token_acc": 0.8696007571846498
},
{
"epoch": 0.09864207020240841,
"grad_norm": 6.71875,
"learning_rate": 1.9871355124111704e-05,
"loss": 0.3327933311462402,
"step": 770,
"token_acc": 0.8724304715840387
},
{
"epoch": 0.09928260312580067,
"grad_norm": 5.125,
"learning_rate": 1.986794594973627e-05,
"loss": 0.334125280380249,
"step": 775,
"token_acc": 0.8736641716782763
},
{
"epoch": 0.09992313604919292,
"grad_norm": 5.375,
"learning_rate": 1.986449249164626e-05,
"loss": 0.33797569274902345,
"step": 780,
"token_acc": 0.870381508850318
},
{
"epoch": 0.10056366897258519,
"grad_norm": 5.875,
"learning_rate": 1.986099476533953e-05,
"loss": 0.337173318862915,
"step": 785,
"token_acc": 0.8695614640883977
},
{
"epoch": 0.10120420189597745,
"grad_norm": 4.1875,
"learning_rate": 1.9857452786512575e-05,
"loss": 0.31865544319152833,
"step": 790,
"token_acc": 0.8768464370803553
},
{
"epoch": 0.10184473481936972,
"grad_norm": 3.640625,
"learning_rate": 1.98538665710605e-05,
"loss": 0.3357419013977051,
"step": 795,
"token_acc": 0.8707342295760083
},
{
"epoch": 0.10248526774276198,
"grad_norm": 8.375,
"learning_rate": 1.985023613507692e-05,
"loss": 0.3254246711730957,
"step": 800,
"token_acc": 0.8745312702038706
},
{
"epoch": 0.10248526774276198,
"eval_loss": 0.3556683361530304,
"eval_runtime": 102.7565,
"eval_samples_per_second": 97.317,
"eval_steps_per_second": 12.165,
"eval_token_acc": 0.866935015032307,
"step": 800
},
{
"epoch": 0.10312580066615425,
"grad_norm": 5.09375,
"learning_rate": 1.9846561494853904e-05,
"loss": 0.33404462337493895,
"step": 805,
"token_acc": 0.8705380237972065
},
{
"epoch": 0.1037663335895465,
"grad_norm": 3.53125,
"learning_rate": 1.9842842666881885e-05,
"loss": 0.3421541690826416,
"step": 810,
"token_acc": 0.8684289705566302
},
{
"epoch": 0.10440686651293876,
"grad_norm": 4.59375,
"learning_rate": 1.983907966784959e-05,
"loss": 0.3375978469848633,
"step": 815,
"token_acc": 0.869295677630446
},
{
"epoch": 0.10504739943633103,
"grad_norm": 9.0625,
"learning_rate": 1.9835272514643978e-05,
"loss": 0.3273109674453735,
"step": 820,
"token_acc": 0.8760344827586207
},
{
"epoch": 0.10568793235972329,
"grad_norm": 6.875,
"learning_rate": 1.9831421224350156e-05,
"loss": 0.3292600154876709,
"step": 825,
"token_acc": 0.874255631310952
},
{
"epoch": 0.10632846528311556,
"grad_norm": 11.5,
"learning_rate": 1.98275258142513e-05,
"loss": 0.32775206565856935,
"step": 830,
"token_acc": 0.8719834817395793
},
{
"epoch": 0.10696899820650782,
"grad_norm": 4.0,
"learning_rate": 1.9823586301828572e-05,
"loss": 0.3248668909072876,
"step": 835,
"token_acc": 0.876129143795652
},
{
"epoch": 0.10760953112990007,
"grad_norm": 16.25,
"learning_rate": 1.9819602704761066e-05,
"loss": 0.3292513132095337,
"step": 840,
"token_acc": 0.8749297722459917
},
{
"epoch": 0.10825006405329234,
"grad_norm": 8.0,
"learning_rate": 1.9815575040925693e-05,
"loss": 0.3171013116836548,
"step": 845,
"token_acc": 0.8782070696145027
},
{
"epoch": 0.1088905969766846,
"grad_norm": 2.96875,
"learning_rate": 1.9811503328397133e-05,
"loss": 0.319035267829895,
"step": 850,
"token_acc": 0.8752912747044101
},
{
"epoch": 0.10953112990007687,
"grad_norm": 5.9375,
"learning_rate": 1.9807387585447734e-05,
"loss": 0.32436022758483884,
"step": 855,
"token_acc": 0.876150555291474
},
{
"epoch": 0.11017166282346913,
"grad_norm": 4.5625,
"learning_rate": 1.9803227830547437e-05,
"loss": 0.33043532371520995,
"step": 860,
"token_acc": 0.8730488173995763
},
{
"epoch": 0.1108121957468614,
"grad_norm": 4.71875,
"learning_rate": 1.9799024082363692e-05,
"loss": 0.3189000129699707,
"step": 865,
"token_acc": 0.8785994905668523
},
{
"epoch": 0.11145272867025365,
"grad_norm": 5.65625,
"learning_rate": 1.9794776359761378e-05,
"loss": 0.32372350692749025,
"step": 870,
"token_acc": 0.8751831107281344
},
{
"epoch": 0.11209326159364591,
"grad_norm": 4.125,
"learning_rate": 1.9790484681802707e-05,
"loss": 0.3230480670928955,
"step": 875,
"token_acc": 0.8766306695464363
},
{
"epoch": 0.11273379451703817,
"grad_norm": 4.8125,
"learning_rate": 1.9786149067747163e-05,
"loss": 0.32105169296264646,
"step": 880,
"token_acc": 0.8791981030394481
},
{
"epoch": 0.11337432744043044,
"grad_norm": 5.0625,
"learning_rate": 1.9781769537051384e-05,
"loss": 0.3278522968292236,
"step": 885,
"token_acc": 0.8761810259286423
},
{
"epoch": 0.1140148603638227,
"grad_norm": 31.375,
"learning_rate": 1.9777346109369088e-05,
"loss": 0.3238049030303955,
"step": 890,
"token_acc": 0.8749892120479849
},
{
"epoch": 0.11465539328721497,
"grad_norm": 4.9375,
"learning_rate": 1.9772878804551e-05,
"loss": 0.33077249526977537,
"step": 895,
"token_acc": 0.8735443802294488
},
{
"epoch": 0.11529592621060722,
"grad_norm": 3.703125,
"learning_rate": 1.9768367642644742e-05,
"loss": 0.32166156768798826,
"step": 900,
"token_acc": 0.8777322698857513
},
{
"epoch": 0.11529592621060722,
"eval_loss": 0.34898844361305237,
"eval_runtime": 102.6281,
"eval_samples_per_second": 97.439,
"eval_steps_per_second": 12.18,
"eval_token_acc": 0.8700798954659461,
"step": 900
},
{
"epoch": 0.11593645913399948,
"grad_norm": 5.3125,
"learning_rate": 1.9763812643894743e-05,
"loss": 0.32224602699279786,
"step": 905,
"token_acc": 0.8758474759252062
},
{
"epoch": 0.11657699205739175,
"grad_norm": 4.3125,
"learning_rate": 1.975921382874217e-05,
"loss": 0.32654175758361814,
"step": 910,
"token_acc": 0.8768330968047133
},
{
"epoch": 0.11721752498078401,
"grad_norm": 3.375,
"learning_rate": 1.9754571217824815e-05,
"loss": 0.3332622528076172,
"step": 915,
"token_acc": 0.8733866804336603
},
{
"epoch": 0.11785805790417628,
"grad_norm": 3.53125,
"learning_rate": 1.974988483197701e-05,
"loss": 0.3266796112060547,
"step": 920,
"token_acc": 0.873924638678596
},
{
"epoch": 0.11849859082756854,
"grad_norm": 2.890625,
"learning_rate": 1.9745154692229524e-05,
"loss": 0.3260995388031006,
"step": 925,
"token_acc": 0.8767960079153403
},
{
"epoch": 0.1191391237509608,
"grad_norm": 5.4375,
"learning_rate": 1.9740380819809498e-05,
"loss": 0.3234872817993164,
"step": 930,
"token_acc": 0.8770484733482836
},
{
"epoch": 0.11977965667435306,
"grad_norm": 4.34375,
"learning_rate": 1.9735563236140307e-05,
"loss": 0.3268592119216919,
"step": 935,
"token_acc": 0.8750322747224374
},
{
"epoch": 0.12042018959774532,
"grad_norm": 5.71875,
"learning_rate": 1.9730701962841504e-05,
"loss": 0.3228474140167236,
"step": 940,
"token_acc": 0.8774657593246619
},
{
"epoch": 0.12106072252113759,
"grad_norm": 10.625,
"learning_rate": 1.9725797021728687e-05,
"loss": 0.32127084732055666,
"step": 945,
"token_acc": 0.8768540876164195
},
{
"epoch": 0.12170125544452985,
"grad_norm": 4.3125,
"learning_rate": 1.9720848434813437e-05,
"loss": 0.3093282222747803,
"step": 950,
"token_acc": 0.8818652849740932
},
{
"epoch": 0.12234178836792212,
"grad_norm": 3.640625,
"learning_rate": 1.9715856224303193e-05,
"loss": 0.3240875244140625,
"step": 955,
"token_acc": 0.8766339869281046
},
{
"epoch": 0.12298232129131437,
"grad_norm": 10.125,
"learning_rate": 1.9710820412601156e-05,
"loss": 0.31369385719299314,
"step": 960,
"token_acc": 0.8786259212964959
},
{
"epoch": 0.12362285421470663,
"grad_norm": 4.5,
"learning_rate": 1.97057410223062e-05,
"loss": 0.3160251617431641,
"step": 965,
"token_acc": 0.8773067116124292
},
{
"epoch": 0.1242633871380989,
"grad_norm": 6.1875,
"learning_rate": 1.9700618076212767e-05,
"loss": 0.32041115760803224,
"step": 970,
"token_acc": 0.876734235207676
},
{
"epoch": 0.12490392006149116,
"grad_norm": 6.8125,
"learning_rate": 1.969545159731075e-05,
"loss": 0.3188473224639893,
"step": 975,
"token_acc": 0.8775827114696113
},
{
"epoch": 0.12554445298488343,
"grad_norm": 5.75,
"learning_rate": 1.9690241608785404e-05,
"loss": 0.31864352226257325,
"step": 980,
"token_acc": 0.8768779140044898
},
{
"epoch": 0.12618498590827568,
"grad_norm": 6.03125,
"learning_rate": 1.9684988134017254e-05,
"loss": 0.32373876571655275,
"step": 985,
"token_acc": 0.8762113968212948
},
{
"epoch": 0.12682551883166795,
"grad_norm": 7.625,
"learning_rate": 1.9679691196581957e-05,
"loss": 0.3241652727127075,
"step": 990,
"token_acc": 0.8755324183625177
},
{
"epoch": 0.1274660517550602,
"grad_norm": 5.0625,
"learning_rate": 1.9674350820250222e-05,
"loss": 0.31421942710876466,
"step": 995,
"token_acc": 0.8811829816672432
},
{
"epoch": 0.12810658467845248,
"grad_norm": 4.25,
"learning_rate": 1.9668967028987694e-05,
"loss": 0.3193212985992432,
"step": 1000,
"token_acc": 0.8778954619822612
},
{
"epoch": 0.12810658467845248,
"eval_loss": 0.34447741508483887,
"eval_runtime": 102.4624,
"eval_samples_per_second": 97.597,
"eval_steps_per_second": 12.2,
"eval_token_acc": 0.8711014279307462,
"step": 1000
},
{
"epoch": 0.12874711760184473,
"grad_norm": 8.5,
"learning_rate": 1.966353984695485e-05,
"loss": 0.3204664707183838,
"step": 1005,
"token_acc": 0.8778589649357948
},
{
"epoch": 0.12938765052523699,
"grad_norm": 3.140625,
"learning_rate": 1.965806929850689e-05,
"loss": 0.3205430030822754,
"step": 1010,
"token_acc": 0.877480437508106
},
{
"epoch": 0.13002818344862926,
"grad_norm": 5.15625,
"learning_rate": 1.9652555408193623e-05,
"loss": 0.31981477737426756,
"step": 1015,
"token_acc": 0.8765867722363269
},
{
"epoch": 0.13066871637202152,
"grad_norm": 5.5,
"learning_rate": 1.9646998200759366e-05,
"loss": 0.31712310314178466,
"step": 1020,
"token_acc": 0.8808243342081487
},
{
"epoch": 0.1313092492954138,
"grad_norm": 5.21875,
"learning_rate": 1.9641397701142818e-05,
"loss": 0.3185598850250244,
"step": 1025,
"token_acc": 0.8790852498703096
},
{
"epoch": 0.13194978221880604,
"grad_norm": 4.625,
"learning_rate": 1.9635753934476963e-05,
"loss": 0.31679530143737794,
"step": 1030,
"token_acc": 0.8782927355033412
},
{
"epoch": 0.13259031514219832,
"grad_norm": 4.15625,
"learning_rate": 1.963006692608896e-05,
"loss": 0.3205821990966797,
"step": 1035,
"token_acc": 0.8770858420781008
},
{
"epoch": 0.13323084806559057,
"grad_norm": 2.640625,
"learning_rate": 1.9624336701500005e-05,
"loss": 0.3191715717315674,
"step": 1040,
"token_acc": 0.8761724464331813
},
{
"epoch": 0.13387138098898282,
"grad_norm": 5.78125,
"learning_rate": 1.9618563286425236e-05,
"loss": 0.3229659080505371,
"step": 1045,
"token_acc": 0.8782109398609852
},
{
"epoch": 0.1345119139123751,
"grad_norm": 3.796875,
"learning_rate": 1.9612746706773627e-05,
"loss": 0.3189516067504883,
"step": 1050,
"token_acc": 0.8770367809136881
},
{
"epoch": 0.13515244683576735,
"grad_norm": 3.453125,
"learning_rate": 1.9606886988647846e-05,
"loss": 0.31815266609191895,
"step": 1055,
"token_acc": 0.8782769920662298
},
{
"epoch": 0.13579297975915963,
"grad_norm": 2.75,
"learning_rate": 1.9600984158344153e-05,
"loss": 0.3152862548828125,
"step": 1060,
"token_acc": 0.8782785291448818
},
{
"epoch": 0.13643351268255188,
"grad_norm": 8.625,
"learning_rate": 1.9595038242352283e-05,
"loss": 0.31676223278045657,
"step": 1065,
"token_acc": 0.8806634129486459
},
{
"epoch": 0.13707404560594413,
"grad_norm": 12.3125,
"learning_rate": 1.958904926735532e-05,
"loss": 0.31054699420928955,
"step": 1070,
"token_acc": 0.8809544356230583
},
{
"epoch": 0.1377145785293364,
"grad_norm": 7.09375,
"learning_rate": 1.958301726022958e-05,
"loss": 0.30883467197418213,
"step": 1075,
"token_acc": 0.8819924033149171
},
{
"epoch": 0.13835511145272866,
"grad_norm": 4.46875,
"learning_rate": 1.9576942248044505e-05,
"loss": 0.31630141735076905,
"step": 1080,
"token_acc": 0.8776055124892335
},
{
"epoch": 0.13899564437612094,
"grad_norm": 4.65625,
"learning_rate": 1.95708242580625e-05,
"loss": 0.30964021682739257,
"step": 1085,
"token_acc": 0.8792420327304048
},
{
"epoch": 0.1396361772995132,
"grad_norm": 4.65625,
"learning_rate": 1.956466331773887e-05,
"loss": 0.3171691417694092,
"step": 1090,
"token_acc": 0.8775633293124246
},
{
"epoch": 0.14027671022290547,
"grad_norm": 3.5625,
"learning_rate": 1.9558459454721642e-05,
"loss": 0.31899094581604004,
"step": 1095,
"token_acc": 0.878079188341
},
{
"epoch": 0.14091724314629772,
"grad_norm": 3.234375,
"learning_rate": 1.955221269685148e-05,
"loss": 0.31024134159088135,
"step": 1100,
"token_acc": 0.8816972001382648
},
{
"epoch": 0.14091724314629772,
"eval_loss": 0.3438940942287445,
"eval_runtime": 103.1967,
"eval_samples_per_second": 96.902,
"eval_steps_per_second": 12.113,
"eval_token_acc": 0.8718599642325219,
"step": 1100
},
{
"epoch": 0.14155777606968997,
"grad_norm": 4.5,
"learning_rate": 1.9545923072161534e-05,
"loss": 0.31498963832855226,
"step": 1105,
"token_acc": 0.8790587219343696
},
{
"epoch": 0.14219830899308225,
"grad_norm": 3.53125,
"learning_rate": 1.9539590608877326e-05,
"loss": 0.3086799144744873,
"step": 1110,
"token_acc": 0.8817727625118442
},
{
"epoch": 0.1428388419164745,
"grad_norm": 6.46875,
"learning_rate": 1.9533215335416623e-05,
"loss": 0.3052536487579346,
"step": 1115,
"token_acc": 0.8838970651519064
},
{
"epoch": 0.14347937483986678,
"grad_norm": 4.125,
"learning_rate": 1.9526797280389314e-05,
"loss": 0.3200625658035278,
"step": 1120,
"token_acc": 0.8772527377770113
},
{
"epoch": 0.14411990776325903,
"grad_norm": 3.5625,
"learning_rate": 1.952033647259727e-05,
"loss": 0.3077129364013672,
"step": 1125,
"token_acc": 0.8831737581039887
},
{
"epoch": 0.14476044068665128,
"grad_norm": 10.125,
"learning_rate": 1.951383294103422e-05,
"loss": 0.31737732887268066,
"step": 1130,
"token_acc": 0.8796427497960411
},
{
"epoch": 0.14540097361004356,
"grad_norm": 4.40625,
"learning_rate": 1.9507286714885623e-05,
"loss": 0.31585164070129396,
"step": 1135,
"token_acc": 0.8788506342221072
},
{
"epoch": 0.1460415065334358,
"grad_norm": 3.34375,
"learning_rate": 1.9500697823528538e-05,
"loss": 0.32686147689819334,
"step": 1140,
"token_acc": 0.8751074806534824
},
{
"epoch": 0.1466820394568281,
"grad_norm": 4.9375,
"learning_rate": 1.9494066296531484e-05,
"loss": 0.3137520790100098,
"step": 1145,
"token_acc": 0.8789589348041539
},
{
"epoch": 0.14732257238022034,
"grad_norm": 4.59375,
"learning_rate": 1.948739216365432e-05,
"loss": 0.30913615226745605,
"step": 1150,
"token_acc": 0.8794063079777366
},
{
"epoch": 0.14796310530361262,
"grad_norm": 3.5625,
"learning_rate": 1.9480675454848103e-05,
"loss": 0.3166754722595215,
"step": 1155,
"token_acc": 0.8766857684518936
},
{
"epoch": 0.14860363822700487,
"grad_norm": 3.46875,
"learning_rate": 1.947391620025495e-05,
"loss": 0.3093476057052612,
"step": 1160,
"token_acc": 0.8816090465708489
},
{
"epoch": 0.14924417115039712,
"grad_norm": 3.453125,
"learning_rate": 1.9467114430207916e-05,
"loss": 0.30673789978027344,
"step": 1165,
"token_acc": 0.8819172300934499
},
{
"epoch": 0.1498847040737894,
"grad_norm": 4.78125,
"learning_rate": 1.9460270175230834e-05,
"loss": 0.314839768409729,
"step": 1170,
"token_acc": 0.8800895239734872
},
{
"epoch": 0.15052523699718165,
"grad_norm": 5.21875,
"learning_rate": 1.9453383466038218e-05,
"loss": 0.3102754592895508,
"step": 1175,
"token_acc": 0.8781979498664829
},
{
"epoch": 0.15116576992057393,
"grad_norm": 2.515625,
"learning_rate": 1.944645433353508e-05,
"loss": 0.30159687995910645,
"step": 1180,
"token_acc": 0.8842528536838464
},
{
"epoch": 0.15180630284396618,
"grad_norm": 5.96875,
"learning_rate": 1.9439482808816823e-05,
"loss": 0.31150016784667967,
"step": 1185,
"token_acc": 0.8806630308755958
},
{
"epoch": 0.15244683576735843,
"grad_norm": 2.828125,
"learning_rate": 1.9432468923169086e-05,
"loss": 0.3075159311294556,
"step": 1190,
"token_acc": 0.8814693313765269
},
{
"epoch": 0.1530873686907507,
"grad_norm": 6.21875,
"learning_rate": 1.9425412708067612e-05,
"loss": 0.3062115669250488,
"step": 1195,
"token_acc": 0.8820224719101124
},
{
"epoch": 0.15372790161414296,
"grad_norm": 4.28125,
"learning_rate": 1.94183141951781e-05,
"loss": 0.305180287361145,
"step": 1200,
"token_acc": 0.8842530282637954
},
{
"epoch": 0.15372790161414296,
"eval_loss": 0.3408718407154083,
"eval_runtime": 103.7033,
"eval_samples_per_second": 96.429,
"eval_steps_per_second": 12.054,
"eval_token_acc": 0.8721810963894779,
"step": 1200
},
{
"epoch": 0.15436843453753524,
"grad_norm": 3.9375,
"learning_rate": 1.9411173416356065e-05,
"loss": 0.30832886695861816,
"step": 1205,
"token_acc": 0.8802281368821293
},
{
"epoch": 0.1550089674609275,
"grad_norm": 5.15625,
"learning_rate": 1.9403990403646702e-05,
"loss": 0.3051230192184448,
"step": 1210,
"token_acc": 0.8818798910458732
},
{
"epoch": 0.15564950038431977,
"grad_norm": 4.40625,
"learning_rate": 1.9396765189284726e-05,
"loss": 0.3141745090484619,
"step": 1215,
"token_acc": 0.8782784418264683
},
{
"epoch": 0.15629003330771202,
"grad_norm": 4.34375,
"learning_rate": 1.938949780569425e-05,
"loss": 0.3076632976531982,
"step": 1220,
"token_acc": 0.8809513532179393
},
{
"epoch": 0.15693056623110427,
"grad_norm": 4.1875,
"learning_rate": 1.9382188285488612e-05,
"loss": 0.30438895225524903,
"step": 1225,
"token_acc": 0.8833096682586807
},
{
"epoch": 0.15757109915449655,
"grad_norm": 4.59375,
"learning_rate": 1.9374836661470263e-05,
"loss": 0.30989761352539064,
"step": 1230,
"token_acc": 0.8809441615603694
},
{
"epoch": 0.1582116320778888,
"grad_norm": 4.34375,
"learning_rate": 1.9367442966630583e-05,
"loss": 0.3067667484283447,
"step": 1235,
"token_acc": 0.8825937096079276
},
{
"epoch": 0.15885216500128108,
"grad_norm": 7.46875,
"learning_rate": 1.9360007234149756e-05,
"loss": 0.29884748458862304,
"step": 1240,
"token_acc": 0.8846866250269223
},
{
"epoch": 0.15949269792467333,
"grad_norm": 6.75,
"learning_rate": 1.9352529497396623e-05,
"loss": 0.3064408779144287,
"step": 1245,
"token_acc": 0.8819226300615345
},
{
"epoch": 0.16013323084806558,
"grad_norm": 8.625,
"learning_rate": 1.9345009789928507e-05,
"loss": 0.3079418182373047,
"step": 1250,
"token_acc": 0.8822084303077321
},
{
"epoch": 0.16077376377145786,
"grad_norm": 6.25,
"learning_rate": 1.9337448145491106e-05,
"loss": 0.3048593044281006,
"step": 1255,
"token_acc": 0.8844390623648474
},
{
"epoch": 0.1614142966948501,
"grad_norm": 4.59375,
"learning_rate": 1.9329844598018288e-05,
"loss": 0.31249561309814455,
"step": 1260,
"token_acc": 0.8813588549749957
},
{
"epoch": 0.16205482961824239,
"grad_norm": 10.125,
"learning_rate": 1.9322199181631985e-05,
"loss": 0.30511524677276614,
"step": 1265,
"token_acc": 0.8825485961123111
},
{
"epoch": 0.16269536254163464,
"grad_norm": 4.3125,
"learning_rate": 1.9314511930642017e-05,
"loss": 0.30724005699157714,
"step": 1270,
"token_acc": 0.8831095955453878
},
{
"epoch": 0.16333589546502691,
"grad_norm": 14.375,
"learning_rate": 1.930678287954594e-05,
"loss": 0.30521693229675295,
"step": 1275,
"token_acc": 0.8835102252135646
},
{
"epoch": 0.16397642838841917,
"grad_norm": 4.6875,
"learning_rate": 1.9299012063028893e-05,
"loss": 0.2963773250579834,
"step": 1280,
"token_acc": 0.8844027981690993
},
{
"epoch": 0.16461696131181142,
"grad_norm": 2.796875,
"learning_rate": 1.9291199515963445e-05,
"loss": 0.30706090927124025,
"step": 1285,
"token_acc": 0.8825376344086021
},
{
"epoch": 0.1652574942352037,
"grad_norm": 4.3125,
"learning_rate": 1.9283345273409434e-05,
"loss": 0.2986742496490479,
"step": 1290,
"token_acc": 0.8834094237229736
},
{
"epoch": 0.16589802715859595,
"grad_norm": 26.875,
"learning_rate": 1.927544937061382e-05,
"loss": 0.30177807807922363,
"step": 1295,
"token_acc": 0.8840617188173433
},
{
"epoch": 0.16653856008198822,
"grad_norm": 5.25,
"learning_rate": 1.9267511843010508e-05,
"loss": 0.3020944356918335,
"step": 1300,
"token_acc": 0.8823529411764706
},
{
"epoch": 0.16653856008198822,
"eval_loss": 0.34260889887809753,
"eval_runtime": 102.5088,
"eval_samples_per_second": 97.553,
"eval_steps_per_second": 12.194,
"eval_token_acc": 0.8718101333805803,
"step": 1300
},
{
"epoch": 0.16717909300538047,
"grad_norm": 3.734375,
"learning_rate": 1.925953272622021e-05,
"loss": 0.30364112854003905,
"step": 1305,
"token_acc": 0.8823174931129476
},
{
"epoch": 0.16781962592877273,
"grad_norm": 3.609375,
"learning_rate": 1.9251512056050257e-05,
"loss": 0.3062715768814087,
"step": 1310,
"token_acc": 0.8822084303077321
},
{
"epoch": 0.168460158852165,
"grad_norm": 6.09375,
"learning_rate": 1.9243449868494482e-05,
"loss": 0.3047629356384277,
"step": 1315,
"token_acc": 0.8817324665260258
},
{
"epoch": 0.16910069177555725,
"grad_norm": 4.34375,
"learning_rate": 1.9235346199733013e-05,
"loss": 0.30484819412231445,
"step": 1320,
"token_acc": 0.8817362039953403
},
{
"epoch": 0.16974122469894953,
"grad_norm": 3.234375,
"learning_rate": 1.9227201086132138e-05,
"loss": 0.29434518814086913,
"step": 1325,
"token_acc": 0.8883073063113726
},
{
"epoch": 0.17038175762234178,
"grad_norm": 3.25,
"learning_rate": 1.9219014564244135e-05,
"loss": 0.30238900184631345,
"step": 1330,
"token_acc": 0.8828684914946896
},
{
"epoch": 0.17102229054573406,
"grad_norm": 5.9375,
"learning_rate": 1.9210786670807103e-05,
"loss": 0.30103113651275637,
"step": 1335,
"token_acc": 0.8840491860062348
},
{
"epoch": 0.1716628234691263,
"grad_norm": 2.84375,
"learning_rate": 1.9202517442744804e-05,
"loss": 0.3020737409591675,
"step": 1340,
"token_acc": 0.8843631342768381
},
{
"epoch": 0.17230335639251856,
"grad_norm": 3.828125,
"learning_rate": 1.9194206917166496e-05,
"loss": 0.30103378295898436,
"step": 1345,
"token_acc": 0.8852183650615901
},
{
"epoch": 0.17294388931591084,
"grad_norm": 13.9375,
"learning_rate": 1.9185855131366762e-05,
"loss": 0.3041229248046875,
"step": 1350,
"token_acc": 0.884064226519337
},
{
"epoch": 0.1735844222393031,
"grad_norm": 20.25,
"learning_rate": 1.9177462122825344e-05,
"loss": 0.308376407623291,
"step": 1355,
"token_acc": 0.8811586706323549
},
{
"epoch": 0.17422495516269537,
"grad_norm": 3.390625,
"learning_rate": 1.9169027929206987e-05,
"loss": 0.3022352695465088,
"step": 1360,
"token_acc": 0.8822768434670116
},
{
"epoch": 0.17486548808608762,
"grad_norm": 6.5625,
"learning_rate": 1.916055258836125e-05,
"loss": 0.3043084621429443,
"step": 1365,
"token_acc": 0.8829251495717299
},
{
"epoch": 0.1755060210094799,
"grad_norm": 3.3125,
"learning_rate": 1.9152036138322345e-05,
"loss": 0.30508239269256593,
"step": 1370,
"token_acc": 0.882540092007395
},
{
"epoch": 0.17614655393287215,
"grad_norm": 4.1875,
"learning_rate": 1.9143478617308966e-05,
"loss": 0.3004749059677124,
"step": 1375,
"token_acc": 0.8839621418384546
},
{
"epoch": 0.1767870868562644,
"grad_norm": 6.125,
"learning_rate": 1.913488006372413e-05,
"loss": 0.3041959762573242,
"step": 1380,
"token_acc": 0.8817213611568101
},
{
"epoch": 0.17742761977965668,
"grad_norm": 11.625,
"learning_rate": 1.912624051615498e-05,
"loss": 0.3068222522735596,
"step": 1385,
"token_acc": 0.8823149463893554
},
{
"epoch": 0.17806815270304893,
"grad_norm": 6.84375,
"learning_rate": 1.9117560013372633e-05,
"loss": 0.29890620708465576,
"step": 1390,
"token_acc": 0.885037126575721
},
{
"epoch": 0.1787086856264412,
"grad_norm": 3.359375,
"learning_rate": 1.9108838594331997e-05,
"loss": 0.308072566986084,
"step": 1395,
"token_acc": 0.8794112583921501
},
{
"epoch": 0.17934921854983346,
"grad_norm": 6.3125,
"learning_rate": 1.9100076298171587e-05,
"loss": 0.29462456703186035,
"step": 1400,
"token_acc": 0.885492563052382
},
{
"epoch": 0.17934921854983346,
"eval_loss": 0.3468918800354004,
"eval_runtime": 107.9248,
"eval_samples_per_second": 92.657,
"eval_steps_per_second": 11.582,
"eval_token_acc": 0.8731887869509609,
"step": 1400
},
{
"epoch": 0.1799897514732257,
"grad_norm": 7.59375,
"learning_rate": 1.9091273164213374e-05,
"loss": 0.29882164001464845,
"step": 1405,
"token_acc": 0.8843217597584645
},
{
"epoch": 0.180630284396618,
"grad_norm": 3.078125,
"learning_rate": 1.9082429231962586e-05,
"loss": 0.29425759315490724,
"step": 1410,
"token_acc": 0.8862321968062149
},
{
"epoch": 0.18127081732001024,
"grad_norm": 6.0625,
"learning_rate": 1.9073544541107544e-05,
"loss": 0.2935910701751709,
"step": 1415,
"token_acc": 0.8873945945945946
},
{
"epoch": 0.18191135024340252,
"grad_norm": 3.3125,
"learning_rate": 1.906461913151947e-05,
"loss": 0.29699931144714353,
"step": 1420,
"token_acc": 0.8851322249978465
},
{
"epoch": 0.18255188316679477,
"grad_norm": 3.515625,
"learning_rate": 1.9055653043252324e-05,
"loss": 0.2873663902282715,
"step": 1425,
"token_acc": 0.8880069025021571
},
{
"epoch": 0.18319241609018705,
"grad_norm": 4.3125,
"learning_rate": 1.9046646316542613e-05,
"loss": 0.3060638904571533,
"step": 1430,
"token_acc": 0.8829251495717299
},
{
"epoch": 0.1838329490135793,
"grad_norm": 107.5,
"learning_rate": 1.9037598991809225e-05,
"loss": 0.3029170513153076,
"step": 1435,
"token_acc": 0.8842598563996732
},
{
"epoch": 0.18447348193697155,
"grad_norm": 3.984375,
"learning_rate": 1.9028511109653212e-05,
"loss": 0.29811413288116456,
"step": 1440,
"token_acc": 0.884185544768069
},
{
"epoch": 0.18511401486036383,
"grad_norm": 3.671875,
"learning_rate": 1.9019382710857663e-05,
"loss": 0.291642951965332,
"step": 1445,
"token_acc": 0.888984918542846
},
{
"epoch": 0.18575454778375608,
"grad_norm": 5.84375,
"learning_rate": 1.901021383638747e-05,
"loss": 0.29584593772888185,
"step": 1450,
"token_acc": 0.8839431769263882
},
{
"epoch": 0.18639508070714836,
"grad_norm": 5.125,
"learning_rate": 1.900100452738917e-05,
"loss": 0.29843716621398925,
"step": 1455,
"token_acc": 0.8849805783340526
},
{
"epoch": 0.1870356136305406,
"grad_norm": 4.28125,
"learning_rate": 1.899175482519077e-05,
"loss": 0.3069281578063965,
"step": 1460,
"token_acc": 0.8841452917886039
},
{
"epoch": 0.18767614655393286,
"grad_norm": 3.875,
"learning_rate": 1.898246477130152e-05,
"loss": 0.304925274848938,
"step": 1465,
"token_acc": 0.8836258819480296
},
{
"epoch": 0.18831667947732514,
"grad_norm": 3.21875,
"learning_rate": 1.8973134407411768e-05,
"loss": 0.29193341732025146,
"step": 1470,
"token_acc": 0.8880477570619025
},
{
"epoch": 0.1889572124007174,
"grad_norm": 3.84375,
"learning_rate": 1.8963763775392766e-05,
"loss": 0.2908176898956299,
"step": 1475,
"token_acc": 0.8886537381764782
},
{
"epoch": 0.18959774532410967,
"grad_norm": 5.75,
"learning_rate": 1.895435291729646e-05,
"loss": 0.2873049259185791,
"step": 1480,
"token_acc": 0.8900970369086814
},
{
"epoch": 0.19023827824750192,
"grad_norm": 13.0625,
"learning_rate": 1.8944901875355325e-05,
"loss": 0.29516000747680665,
"step": 1485,
"token_acc": 0.8862587849782262
},
{
"epoch": 0.1908788111708942,
"grad_norm": 4.71875,
"learning_rate": 1.8935410691982163e-05,
"loss": 0.2935316562652588,
"step": 1490,
"token_acc": 0.8873926367128491
},
{
"epoch": 0.19151934409428645,
"grad_norm": 4.625,
"learning_rate": 1.8925879409769915e-05,
"loss": 0.293272590637207,
"step": 1495,
"token_acc": 0.8866402002243894
},
{
"epoch": 0.1921598770176787,
"grad_norm": 7.3125,
"learning_rate": 1.8916308071491474e-05,
"loss": 0.28766617774963377,
"step": 1500,
"token_acc": 0.8866454279318065
},
{
"epoch": 0.1921598770176787,
"eval_loss": 0.33948463201522827,
"eval_runtime": 105.2784,
"eval_samples_per_second": 94.986,
"eval_steps_per_second": 11.873,
"eval_token_acc": 0.8743791906362293,
"step": 1500
},
{
"epoch": 0.19280040994107098,
"grad_norm": 6.8125,
"learning_rate": 1.8906696720099492e-05,
"loss": 0.2923029899597168,
"step": 1505,
"token_acc": 0.8879117089153302
},
{
"epoch": 0.19344094286446323,
"grad_norm": 3.578125,
"learning_rate": 1.8897045398726176e-05,
"loss": 0.29394724369049074,
"step": 1510,
"token_acc": 0.8863872082973207
},
{
"epoch": 0.1940814757878555,
"grad_norm": 4.28125,
"learning_rate": 1.8887354150683108e-05,
"loss": 0.2944790840148926,
"step": 1515,
"token_acc": 0.8862324459377962
},
{
"epoch": 0.19472200871124776,
"grad_norm": 3.296875,
"learning_rate": 1.8877623019461053e-05,
"loss": 0.292703914642334,
"step": 1520,
"token_acc": 0.8874542715730579
},
{
"epoch": 0.19536254163464,
"grad_norm": 5.34375,
"learning_rate": 1.886785204872975e-05,
"loss": 0.28728461265563965,
"step": 1525,
"token_acc": 0.8864913949667041
},
{
"epoch": 0.1960030745580323,
"grad_norm": 11.875,
"learning_rate": 1.885804128233772e-05,
"loss": 0.29347355365753175,
"step": 1530,
"token_acc": 0.8853349426674714
},
{
"epoch": 0.19664360748142454,
"grad_norm": 38.0,
"learning_rate": 1.884819076431208e-05,
"loss": 0.2958747148513794,
"step": 1535,
"token_acc": 0.88622210690192
},
{
"epoch": 0.19728414040481682,
"grad_norm": 4.1875,
"learning_rate": 1.8838300538858338e-05,
"loss": 0.29049015045166016,
"step": 1540,
"token_acc": 0.887807818150508
},
{
"epoch": 0.19792467332820907,
"grad_norm": 5.125,
"learning_rate": 1.8828370650360183e-05,
"loss": 0.29225118160247804,
"step": 1545,
"token_acc": 0.8890085278663106
},
{
"epoch": 0.19856520625160134,
"grad_norm": 5.71875,
"learning_rate": 1.8818401143379312e-05,
"loss": 0.2903005599975586,
"step": 1550,
"token_acc": 0.8862319467266281
},
{
"epoch": 0.1992057391749936,
"grad_norm": 4.6875,
"learning_rate": 1.8808392062655206e-05,
"loss": 0.2934314966201782,
"step": 1555,
"token_acc": 0.8883467898622684
},
{
"epoch": 0.19984627209838585,
"grad_norm": 5.3125,
"learning_rate": 1.8798343453104937e-05,
"loss": 0.2941242218017578,
"step": 1560,
"token_acc": 0.887303361127826
},
{
"epoch": 0.20048680502177813,
"grad_norm": 7.1875,
"learning_rate": 1.8788255359822975e-05,
"loss": 0.30154945850372317,
"step": 1565,
"token_acc": 0.8849016823716708
},
{
"epoch": 0.20112733794517038,
"grad_norm": 5.1875,
"learning_rate": 1.8778127828080978e-05,
"loss": 0.3002612590789795,
"step": 1570,
"token_acc": 0.8851415297255442
},
{
"epoch": 0.20176787086856265,
"grad_norm": 3.484375,
"learning_rate": 1.8767960903327575e-05,
"loss": 0.28886966705322265,
"step": 1575,
"token_acc": 0.8898769695661558
},
{
"epoch": 0.2024084037919549,
"grad_norm": 4.125,
"learning_rate": 1.87577546311882e-05,
"loss": 0.29037837982177733,
"step": 1580,
"token_acc": 0.8892447522181346
},
{
"epoch": 0.20304893671534716,
"grad_norm": 3.8125,
"learning_rate": 1.8747509057464844e-05,
"loss": 0.2931870937347412,
"step": 1585,
"token_acc": 0.8877136936625799
},
{
"epoch": 0.20368946963873943,
"grad_norm": 6.96875,
"learning_rate": 1.8737224228135883e-05,
"loss": 0.29495954513549805,
"step": 1590,
"token_acc": 0.8842078167231542
},
{
"epoch": 0.20433000256213169,
"grad_norm": 3.5625,
"learning_rate": 1.872690018935584e-05,
"loss": 0.30391521453857423,
"step": 1595,
"token_acc": 0.882081572820557
},
{
"epoch": 0.20497053548552396,
"grad_norm": 3.265625,
"learning_rate": 1.8716536987455216e-05,
"loss": 0.292464280128479,
"step": 1600,
"token_acc": 0.8882241542566928
},
{
"epoch": 0.20497053548552396,
"eval_loss": 0.3390945494174957,
"eval_runtime": 103.2443,
"eval_samples_per_second": 96.858,
"eval_steps_per_second": 12.107,
"eval_token_acc": 0.8747529220257902,
"step": 1600
},
{
"epoch": 0.20561106840891621,
"grad_norm": 4.25,
"learning_rate": 1.870613466894025e-05,
"loss": 0.29375975131988524,
"step": 1605,
"token_acc": 0.8863264779278892
},
{
"epoch": 0.2062516013323085,
"grad_norm": 6.8125,
"learning_rate": 1.8695693280492723e-05,
"loss": 0.29436321258544923,
"step": 1610,
"token_acc": 0.8866014901589215
},
{
"epoch": 0.20689213425570074,
"grad_norm": 3.875,
"learning_rate": 1.8685212868969747e-05,
"loss": 0.2911177635192871,
"step": 1615,
"token_acc": 0.8861411643482741
},
{
"epoch": 0.207532667179093,
"grad_norm": 3.125,
"learning_rate": 1.867469348140356e-05,
"loss": 0.2982187747955322,
"step": 1620,
"token_acc": 0.8837629642380685
},
{
"epoch": 0.20817320010248527,
"grad_norm": 3.25,
"learning_rate": 1.8664135165001305e-05,
"loss": 0.28237018585205076,
"step": 1625,
"token_acc": 0.8903340102838871
},
{
"epoch": 0.20881373302587752,
"grad_norm": 4.65625,
"learning_rate": 1.865353796714483e-05,
"loss": 0.2914335012435913,
"step": 1630,
"token_acc": 0.8859261186264308
},
{
"epoch": 0.2094542659492698,
"grad_norm": 3.953125,
"learning_rate": 1.8642901935390457e-05,
"loss": 0.2944057464599609,
"step": 1635,
"token_acc": 0.8854714864981451
},
{
"epoch": 0.21009479887266205,
"grad_norm": 7.78125,
"learning_rate": 1.8632227117468794e-05,
"loss": 0.2919133186340332,
"step": 1640,
"token_acc": 0.8885867219200553
},
{
"epoch": 0.2107353317960543,
"grad_norm": 5.3125,
"learning_rate": 1.86215135612845e-05,
"loss": 0.29367570877075194,
"step": 1645,
"token_acc": 0.8866028091576822
},
{
"epoch": 0.21137586471944658,
"grad_norm": 2.65625,
"learning_rate": 1.8610761314916067e-05,
"loss": 0.29374768733978274,
"step": 1650,
"token_acc": 0.8863107047356164
},
{
"epoch": 0.21201639764283883,
"grad_norm": 4.4375,
"learning_rate": 1.859997042661564e-05,
"loss": 0.286625599861145,
"step": 1655,
"token_acc": 0.8884670147128619
},
{
"epoch": 0.2126569305662311,
"grad_norm": 2.703125,
"learning_rate": 1.858914094480875e-05,
"loss": 0.2876077651977539,
"step": 1660,
"token_acc": 0.8886684208256049
},
{
"epoch": 0.21329746348962336,
"grad_norm": 5.5625,
"learning_rate": 1.8578272918094134e-05,
"loss": 0.2962442398071289,
"step": 1665,
"token_acc": 0.885100138121547
},
{
"epoch": 0.21393799641301564,
"grad_norm": 10.125,
"learning_rate": 1.85673663952435e-05,
"loss": 0.2934115886688232,
"step": 1670,
"token_acc": 0.8867126833477136
},
{
"epoch": 0.2145785293364079,
"grad_norm": 18.25,
"learning_rate": 1.855642142520132e-05,
"loss": 0.2942723274230957,
"step": 1675,
"token_acc": 0.8857585939519256
},
{
"epoch": 0.21521906225980014,
"grad_norm": 4.78125,
"learning_rate": 1.8545438057084587e-05,
"loss": 0.29166316986083984,
"step": 1680,
"token_acc": 0.8873196855834845
},
{
"epoch": 0.21585959518319242,
"grad_norm": 5.5625,
"learning_rate": 1.8534416340182625e-05,
"loss": 0.29405913352966306,
"step": 1685,
"token_acc": 0.8858929269245799
},
{
"epoch": 0.21650012810658467,
"grad_norm": 3.734375,
"learning_rate": 1.852335632395685e-05,
"loss": 0.2922976493835449,
"step": 1690,
"token_acc": 0.8865123084868308
},
{
"epoch": 0.21714066102997695,
"grad_norm": 3.765625,
"learning_rate": 1.851225805804055e-05,
"loss": 0.2907034158706665,
"step": 1695,
"token_acc": 0.8852600656644203
},
{
"epoch": 0.2177811939533692,
"grad_norm": 2.84375,
"learning_rate": 1.850112159223866e-05,
"loss": 0.29575324058532715,
"step": 1700,
"token_acc": 0.8855988654432937
},
{
"epoch": 0.2177811939533692,
"eval_loss": 0.3360104560852051,
"eval_runtime": 105.7607,
"eval_samples_per_second": 94.553,
"eval_steps_per_second": 11.819,
"eval_token_acc": 0.8750740541827463,
"step": 1700
},
{
"epoch": 0.21842172687676145,
"grad_norm": 4.46875,
"learning_rate": 1.848994697652755e-05,
"loss": 0.2958073139190674,
"step": 1705,
"token_acc": 0.884866163349348
},
{
"epoch": 0.21906225980015373,
"grad_norm": 6.5,
"learning_rate": 1.8478734261054785e-05,
"loss": 0.29183714389801024,
"step": 1710,
"token_acc": 0.8865205384880911
},
{
"epoch": 0.21970279272354598,
"grad_norm": 3.109375,
"learning_rate": 1.8467483496138913e-05,
"loss": 0.29586522579193114,
"step": 1715,
"token_acc": 0.8874040865591861
},
{
"epoch": 0.22034332564693826,
"grad_norm": 3.03125,
"learning_rate": 1.8456194732269227e-05,
"loss": 0.298976993560791,
"step": 1720,
"token_acc": 0.8849873210985516
},
{
"epoch": 0.2209838585703305,
"grad_norm": 5.25,
"learning_rate": 1.8444868020105556e-05,
"loss": 0.28900148868560793,
"step": 1725,
"token_acc": 0.8885448916408669
},
{
"epoch": 0.2216243914937228,
"grad_norm": 100.5,
"learning_rate": 1.8433503410478018e-05,
"loss": 0.2942624092102051,
"step": 1730,
"token_acc": 0.8865779615036817
},
{
"epoch": 0.22226492441711504,
"grad_norm": 3.40625,
"learning_rate": 1.8422100954386805e-05,
"loss": 0.2904630184173584,
"step": 1735,
"token_acc": 0.8851269649334945
},
{
"epoch": 0.2229054573405073,
"grad_norm": 3.75,
"learning_rate": 1.841066070300195e-05,
"loss": 0.2874864101409912,
"step": 1740,
"token_acc": 0.8891145585756882
},
{
"epoch": 0.22354599026389957,
"grad_norm": 5.75,
"learning_rate": 1.8399182707663097e-05,
"loss": 0.28712892532348633,
"step": 1745,
"token_acc": 0.8877060885369337
},
{
"epoch": 0.22418652318729182,
"grad_norm": 4.71875,
"learning_rate": 1.8387667019879267e-05,
"loss": 0.29011356830596924,
"step": 1750,
"token_acc": 0.8868864532339817
},
{
"epoch": 0.2248270561106841,
"grad_norm": 4.0,
"learning_rate": 1.8376113691328638e-05,
"loss": 0.2822575569152832,
"step": 1755,
"token_acc": 0.8909889352908253
},
{
"epoch": 0.22546758903407635,
"grad_norm": 2.828125,
"learning_rate": 1.83645227738583e-05,
"loss": 0.28959126472473146,
"step": 1760,
"token_acc": 0.8877797943133696
},
{
"epoch": 0.22610812195746863,
"grad_norm": 6.1875,
"learning_rate": 1.8352894319484028e-05,
"loss": 0.29406278133392333,
"step": 1765,
"token_acc": 0.8860137145814465
},
{
"epoch": 0.22674865488086088,
"grad_norm": 5.1875,
"learning_rate": 1.834122838039006e-05,
"loss": 0.293654203414917,
"step": 1770,
"token_acc": 0.8864692718195903
},
{
"epoch": 0.22738918780425313,
"grad_norm": 6.0,
"learning_rate": 1.8329525008928835e-05,
"loss": 0.28885598182678224,
"step": 1775,
"token_acc": 0.8875652811946998
},
{
"epoch": 0.2280297207276454,
"grad_norm": 3.140625,
"learning_rate": 1.8317784257620784e-05,
"loss": 0.286731481552124,
"step": 1780,
"token_acc": 0.8878766945859597
},
{
"epoch": 0.22867025365103766,
"grad_norm": 2.875,
"learning_rate": 1.830600617915409e-05,
"loss": 0.2842278003692627,
"step": 1785,
"token_acc": 0.8887885045603167
},
{
"epoch": 0.22931078657442994,
"grad_norm": 5.53125,
"learning_rate": 1.829419082638443e-05,
"loss": 0.2927645206451416,
"step": 1790,
"token_acc": 0.8868731372294933
},
{
"epoch": 0.2299513194978222,
"grad_norm": 2.453125,
"learning_rate": 1.828233825233477e-05,
"loss": 0.28806371688842775,
"step": 1795,
"token_acc": 0.8892287806979751
},
{
"epoch": 0.23059185242121444,
"grad_norm": 7.71875,
"learning_rate": 1.827044851019511e-05,
"loss": 0.27867727279663085,
"step": 1800,
"token_acc": 0.8907344926031664
},
{
"epoch": 0.23059185242121444,
"eval_loss": 0.3403998911380768,
"eval_runtime": 102.8463,
"eval_samples_per_second": 97.232,
"eval_steps_per_second": 12.154,
"eval_token_acc": 0.8755059215662391,
"step": 1800
},
{
"epoch": 0.23123238534460672,
"grad_norm": 3.0625,
"learning_rate": 1.8258521653322234e-05,
"loss": 0.29278562068939207,
"step": 1805,
"token_acc": 0.886281276962899
},
{
"epoch": 0.23187291826799897,
"grad_norm": 13.375,
"learning_rate": 1.8246557735239497e-05,
"loss": 0.28790295124053955,
"step": 1810,
"token_acc": 0.889273356401384
},
{
"epoch": 0.23251345119139125,
"grad_norm": 19.125,
"learning_rate": 1.8234556809636567e-05,
"loss": 0.2872922897338867,
"step": 1815,
"token_acc": 0.890285369947919
},
{
"epoch": 0.2331539841147835,
"grad_norm": 8.375,
"learning_rate": 1.8222518930369188e-05,
"loss": 0.29638094902038575,
"step": 1820,
"token_acc": 0.8846751229614289
},
{
"epoch": 0.23379451703817578,
"grad_norm": 6.46875,
"learning_rate": 1.8210444151458935e-05,
"loss": 0.2879481792449951,
"step": 1825,
"token_acc": 0.8891678933240973
},
{
"epoch": 0.23443504996156803,
"grad_norm": 3.734375,
"learning_rate": 1.819833252709298e-05,
"loss": 0.2846549034118652,
"step": 1830,
"token_acc": 0.8897593732512591
},
{
"epoch": 0.23507558288496028,
"grad_norm": 2.859375,
"learning_rate": 1.818618411162384e-05,
"loss": 0.2873443841934204,
"step": 1835,
"token_acc": 0.8899616395845007
},
{
"epoch": 0.23571611580835256,
"grad_norm": 6.125,
"learning_rate": 1.817399895956914e-05,
"loss": 0.2880409240722656,
"step": 1840,
"token_acc": 0.889992689115383
},
{
"epoch": 0.2363566487317448,
"grad_norm": 2.90625,
"learning_rate": 1.816177712561136e-05,
"loss": 0.28801445960998534,
"step": 1845,
"token_acc": 0.8874516544907607
},
{
"epoch": 0.23699718165513708,
"grad_norm": 3.625,
"learning_rate": 1.8149518664597604e-05,
"loss": 0.2893885850906372,
"step": 1850,
"token_acc": 0.887526974536038
},
{
"epoch": 0.23763771457852934,
"grad_norm": 3.421875,
"learning_rate": 1.8137223631539335e-05,
"loss": 0.28786296844482423,
"step": 1855,
"token_acc": 0.8883566373209045
},
{
"epoch": 0.2382782475019216,
"grad_norm": 3.59375,
"learning_rate": 1.8124892081612148e-05,
"loss": 0.2903712511062622,
"step": 1860,
"token_acc": 0.8874086807047701
},
{
"epoch": 0.23891878042531386,
"grad_norm": 5.625,
"learning_rate": 1.8112524070155503e-05,
"loss": 0.2792266607284546,
"step": 1865,
"token_acc": 0.8924651889125059
},
{
"epoch": 0.23955931334870612,
"grad_norm": 6.5625,
"learning_rate": 1.8100119652672488e-05,
"loss": 0.28893446922302246,
"step": 1870,
"token_acc": 0.8882462122847153
},
{
"epoch": 0.2401998462720984,
"grad_norm": 4.46875,
"learning_rate": 1.8087678884829573e-05,
"loss": 0.28021440505981443,
"step": 1875,
"token_acc": 0.8914177335229967
},
{
"epoch": 0.24084037919549064,
"grad_norm": 7.53125,
"learning_rate": 1.8075201822456353e-05,
"loss": 0.287343430519104,
"step": 1880,
"token_acc": 0.8879555440682347
},
{
"epoch": 0.24148091211888292,
"grad_norm": 3.875,
"learning_rate": 1.8062688521545294e-05,
"loss": 0.2859031677246094,
"step": 1885,
"token_acc": 0.8882400927396849
},
{
"epoch": 0.24212144504227517,
"grad_norm": 3.765625,
"learning_rate": 1.805013903825149e-05,
"loss": 0.2897067070007324,
"step": 1890,
"token_acc": 0.8861764071598016
},
{
"epoch": 0.24276197796566742,
"grad_norm": 3.3125,
"learning_rate": 1.803755342889242e-05,
"loss": 0.28570735454559326,
"step": 1895,
"token_acc": 0.8883235598482235
},
{
"epoch": 0.2434025108890597,
"grad_norm": 3.546875,
"learning_rate": 1.802493174994766e-05,
"loss": 0.2846654176712036,
"step": 1900,
"token_acc": 0.8901407234740568
},
{
"epoch": 0.2434025108890597,
"eval_loss": 0.3387065827846527,
"eval_runtime": 103.0506,
"eval_samples_per_second": 97.04,
"eval_steps_per_second": 12.13,
"eval_token_acc": 0.8753702709137318,
"step": 1900
},
{
"epoch": 0.24404304381245195,
"grad_norm": 2.6875,
"learning_rate": 1.8012274058058673e-05,
"loss": 0.276248574256897,
"step": 1905,
"token_acc": 0.8937481149554052
},
{
"epoch": 0.24468357673584423,
"grad_norm": 4.96875,
"learning_rate": 1.799958041002853e-05,
"loss": 0.2819145679473877,
"step": 1910,
"token_acc": 0.8917169974115617
},
{
"epoch": 0.24532410965923648,
"grad_norm": 5.15625,
"learning_rate": 1.7986850862821654e-05,
"loss": 0.28824849128723146,
"step": 1915,
"token_acc": 0.8893915891072044
},
{
"epoch": 0.24596464258262873,
"grad_norm": 3.375,
"learning_rate": 1.797408547356357e-05,
"loss": 0.28600053787231444,
"step": 1920,
"token_acc": 0.8900150959672202
},
{
"epoch": 0.246605175506021,
"grad_norm": 16.875,
"learning_rate": 1.7961284299540666e-05,
"loss": 0.2812356948852539,
"step": 1925,
"token_acc": 0.8895660442600276
},
{
"epoch": 0.24724570842941326,
"grad_norm": 4.09375,
"learning_rate": 1.7948447398199893e-05,
"loss": 0.2775670051574707,
"step": 1930,
"token_acc": 0.892983822129942
},
{
"epoch": 0.24788624135280554,
"grad_norm": 3.21875,
"learning_rate": 1.7935574827148554e-05,
"loss": 0.28611729145050047,
"step": 1935,
"token_acc": 0.8871710951294087
},
{
"epoch": 0.2485267742761978,
"grad_norm": 4.09375,
"learning_rate": 1.7922666644154015e-05,
"loss": 0.2785792827606201,
"step": 1940,
"token_acc": 0.8903730601305494
},
{
"epoch": 0.24916730719959007,
"grad_norm": 2.78125,
"learning_rate": 1.7909722907143456e-05,
"loss": 0.28144145011901855,
"step": 1945,
"token_acc": 0.8897014540135937
},
{
"epoch": 0.24980784012298232,
"grad_norm": 2.859375,
"learning_rate": 1.789674367420361e-05,
"loss": 0.2729172706604004,
"step": 1950,
"token_acc": 0.892122991881154
},
{
"epoch": 0.2504483730463746,
"grad_norm": 3.9375,
"learning_rate": 1.788372900358051e-05,
"loss": 0.28403098583221437,
"step": 1955,
"token_acc": 0.8906849433165223
},
{
"epoch": 0.25108890596976685,
"grad_norm": 4.0,
"learning_rate": 1.7870678953679208e-05,
"loss": 0.28090338706970214,
"step": 1960,
"token_acc": 0.8915574335977924
},
{
"epoch": 0.25172943889315913,
"grad_norm": 3.015625,
"learning_rate": 1.7857593583063533e-05,
"loss": 0.2826396942138672,
"step": 1965,
"token_acc": 0.8903665961397297
},
{
"epoch": 0.25236997181655135,
"grad_norm": 6.8125,
"learning_rate": 1.784447295045582e-05,
"loss": 0.28316607475280764,
"step": 1970,
"token_acc": 0.8914384300194091
},
{
"epoch": 0.25301050473994363,
"grad_norm": 2.96875,
"learning_rate": 1.7831317114736647e-05,
"loss": 0.27657251358032225,
"step": 1975,
"token_acc": 0.8923183631003988
},
{
"epoch": 0.2536510376633359,
"grad_norm": 3.265625,
"learning_rate": 1.7818126134944565e-05,
"loss": 0.2740725040435791,
"step": 1980,
"token_acc": 0.8953624065349872
},
{
"epoch": 0.25429157058672813,
"grad_norm": 2.890625,
"learning_rate": 1.7804900070275853e-05,
"loss": 0.2742879867553711,
"step": 1985,
"token_acc": 0.8926875593542261
},
{
"epoch": 0.2549321035101204,
"grad_norm": 3.625,
"learning_rate": 1.7791638980084217e-05,
"loss": 0.2816567897796631,
"step": 1990,
"token_acc": 0.8898188093183779
},
{
"epoch": 0.2555726364335127,
"grad_norm": 4.71875,
"learning_rate": 1.777834292388056e-05,
"loss": 0.28623175621032715,
"step": 1995,
"token_acc": 0.8866232702504634
},
{
"epoch": 0.25621316935690497,
"grad_norm": 3.46875,
"learning_rate": 1.7765011961332695e-05,
"loss": 0.287227988243103,
"step": 2000,
"token_acc": 0.8884247696150203
},
{
"epoch": 0.25621316935690497,
"eval_loss": 0.3332171142101288,
"eval_runtime": 103.5637,
"eval_samples_per_second": 96.559,
"eval_steps_per_second": 12.07,
"eval_token_acc": 0.8768624280913123,
"step": 2000
},
{
"epoch": 0.2568537022802972,
"grad_norm": 7.625,
"learning_rate": 1.7751646152265086e-05,
"loss": 0.2728090763092041,
"step": 2005,
"token_acc": 0.8931116389548693
},
{
"epoch": 0.25749423520368947,
"grad_norm": 3.453125,
"learning_rate": 1.7738245556658566e-05,
"loss": 0.28285210132598876,
"step": 2010,
"token_acc": 0.8916788698423637
},
{
"epoch": 0.25813476812708175,
"grad_norm": 5.4375,
"learning_rate": 1.7724810234650086e-05,
"loss": 0.2836940050125122,
"step": 2015,
"token_acc": 0.8885819123677963
},
{
"epoch": 0.25877530105047397,
"grad_norm": 4.5,
"learning_rate": 1.7711340246532433e-05,
"loss": 0.28231005668640136,
"step": 2020,
"token_acc": 0.8901254726710209
},
{
"epoch": 0.25941583397386625,
"grad_norm": 4.34375,
"learning_rate": 1.769783565275396e-05,
"loss": 0.27600274085998533,
"step": 2025,
"token_acc": 0.8926168707952389
},
{
"epoch": 0.26005636689725853,
"grad_norm": 2.875,
"learning_rate": 1.768429651391833e-05,
"loss": 0.28248867988586424,
"step": 2030,
"token_acc": 0.8890473720608575
},
{
"epoch": 0.2606968998206508,
"grad_norm": 3.8125,
"learning_rate": 1.767072289078421e-05,
"loss": 0.2763254642486572,
"step": 2035,
"token_acc": 0.8923250173250173
},
{
"epoch": 0.26133743274404303,
"grad_norm": 27.5,
"learning_rate": 1.7657114844265036e-05,
"loss": 0.2861664056777954,
"step": 2040,
"token_acc": 0.8899861997584958
},
{
"epoch": 0.2619779656674353,
"grad_norm": 5.46875,
"learning_rate": 1.764347243542872e-05,
"loss": 0.278385591506958,
"step": 2045,
"token_acc": 0.8922844175491679
},
{
"epoch": 0.2626184985908276,
"grad_norm": 7.46875,
"learning_rate": 1.7629795725497382e-05,
"loss": 0.28106253147125243,
"step": 2050,
"token_acc": 0.890844918865407
},
{
"epoch": 0.2632590315142198,
"grad_norm": 3.34375,
"learning_rate": 1.7616084775847064e-05,
"loss": 0.2838444709777832,
"step": 2055,
"token_acc": 0.8908127665073007
},
{
"epoch": 0.2638995644376121,
"grad_norm": 3.484375,
"learning_rate": 1.760233964800747e-05,
"loss": 0.27664880752563475,
"step": 2060,
"token_acc": 0.8921712169494445
},
{
"epoch": 0.26454009736100437,
"grad_norm": 4.03125,
"learning_rate": 1.7588560403661686e-05,
"loss": 0.2756629228591919,
"step": 2065,
"token_acc": 0.8935702272629263
},
{
"epoch": 0.26518063028439665,
"grad_norm": 25.5,
"learning_rate": 1.7574747104645894e-05,
"loss": 0.28539879322052003,
"step": 2070,
"token_acc": 0.890754132231405
},
{
"epoch": 0.26582116320778887,
"grad_norm": 3.3125,
"learning_rate": 1.7560899812949097e-05,
"loss": 0.28184425830841064,
"step": 2075,
"token_acc": 0.8913267940113577
},
{
"epoch": 0.26646169613118115,
"grad_norm": 3.375,
"learning_rate": 1.7547018590712862e-05,
"loss": 0.2689033508300781,
"step": 2080,
"token_acc": 0.895397489539749
},
{
"epoch": 0.2671022290545734,
"grad_norm": 4.59375,
"learning_rate": 1.7533103500231002e-05,
"loss": 0.2777507543563843,
"step": 2085,
"token_acc": 0.892097198843282
},
{
"epoch": 0.26774276197796565,
"grad_norm": 4.28125,
"learning_rate": 1.7519154603949332e-05,
"loss": 0.2816345691680908,
"step": 2090,
"token_acc": 0.889937781591933
},
{
"epoch": 0.2683832949013579,
"grad_norm": 4.9375,
"learning_rate": 1.750517196446538e-05,
"loss": 0.27451438903808595,
"step": 2095,
"token_acc": 0.8933247200689061
},
{
"epoch": 0.2690238278247502,
"grad_norm": 3.828125,
"learning_rate": 1.749115564452808e-05,
"loss": 0.28593323230743406,
"step": 2100,
"token_acc": 0.8909067435555365
},
{
"epoch": 0.2690238278247502,
"eval_loss": 0.3338375389575958,
"eval_runtime": 104.1317,
"eval_samples_per_second": 96.032,
"eval_steps_per_second": 12.004,
"eval_token_acc": 0.8768458178073317,
"step": 2100
},
{
"epoch": 0.26966436074814243,
"grad_norm": 2.921875,
"learning_rate": 1.747710570703753e-05,
"loss": 0.28068857192993163,
"step": 2105,
"token_acc": 0.8920412834132228
},
{
"epoch": 0.2703048936715347,
"grad_norm": 3.734375,
"learning_rate": 1.7463022215044686e-05,
"loss": 0.2719306945800781,
"step": 2110,
"token_acc": 0.8959989630141721
},
{
"epoch": 0.270945426594927,
"grad_norm": 3.609375,
"learning_rate": 1.7448905231751086e-05,
"loss": 0.27764501571655276,
"step": 2115,
"token_acc": 0.8928232144399344
},
{
"epoch": 0.27158595951831926,
"grad_norm": 2.59375,
"learning_rate": 1.743475482050856e-05,
"loss": 0.27620573043823243,
"step": 2120,
"token_acc": 0.8925562707910313
},
{
"epoch": 0.2722264924417115,
"grad_norm": 8.0,
"learning_rate": 1.7420571044818954e-05,
"loss": 0.27559990882873536,
"step": 2125,
"token_acc": 0.8930252645217016
},
{
"epoch": 0.27286702536510377,
"grad_norm": 2.71875,
"learning_rate": 1.7406353968333837e-05,
"loss": 0.2709467887878418,
"step": 2130,
"token_acc": 0.8944920546057448
},
{
"epoch": 0.27350755828849604,
"grad_norm": 3.796875,
"learning_rate": 1.7392103654854223e-05,
"loss": 0.27666122913360597,
"step": 2135,
"token_acc": 0.8931367037149159
},
{
"epoch": 0.27414809121188827,
"grad_norm": 12.5,
"learning_rate": 1.7377820168330285e-05,
"loss": 0.28263001441955565,
"step": 2140,
"token_acc": 0.8904274533413606
},
{
"epoch": 0.27478862413528055,
"grad_norm": 2.40625,
"learning_rate": 1.7363503572861066e-05,
"loss": 0.2721690654754639,
"step": 2145,
"token_acc": 0.8954508143603923
},
{
"epoch": 0.2754291570586728,
"grad_norm": 3.265625,
"learning_rate": 1.734915393269417e-05,
"loss": 0.28317282199859617,
"step": 2150,
"token_acc": 0.8910895342842413
},
{
"epoch": 0.2760696899820651,
"grad_norm": 16.75,
"learning_rate": 1.733477131222552e-05,
"loss": 0.27765982151031493,
"step": 2155,
"token_acc": 0.8932398123843539
},
{
"epoch": 0.2767102229054573,
"grad_norm": 4.90625,
"learning_rate": 1.7320355775999024e-05,
"loss": 0.2786709785461426,
"step": 2160,
"token_acc": 0.8914231613375221
},
{
"epoch": 0.2773507558288496,
"grad_norm": 6.21875,
"learning_rate": 1.7305907388706312e-05,
"loss": 0.28313846588134767,
"step": 2165,
"token_acc": 0.8894102453723634
},
{
"epoch": 0.2779912887522419,
"grad_norm": 4.96875,
"learning_rate": 1.7291426215186436e-05,
"loss": 0.27286443710327146,
"step": 2170,
"token_acc": 0.892789455547898
},
{
"epoch": 0.2786318216756341,
"grad_norm": 3.828125,
"learning_rate": 1.7276912320425584e-05,
"loss": 0.270449161529541,
"step": 2175,
"token_acc": 0.8942574600971548
},
{
"epoch": 0.2792723545990264,
"grad_norm": 3.84375,
"learning_rate": 1.726236576955678e-05,
"loss": 0.26513283252716063,
"step": 2180,
"token_acc": 0.8964592970472526
},
{
"epoch": 0.27991288752241866,
"grad_norm": 4.4375,
"learning_rate": 1.7247786627859594e-05,
"loss": 0.2790388822555542,
"step": 2185,
"token_acc": 0.8934532002752925
},
{
"epoch": 0.28055342044581094,
"grad_norm": 4.5625,
"learning_rate": 1.7233174960759855e-05,
"loss": 0.2737919807434082,
"step": 2190,
"token_acc": 0.8916871152438762
},
{
"epoch": 0.28119395336920316,
"grad_norm": 4.03125,
"learning_rate": 1.721853083382936e-05,
"loss": 0.2736166715621948,
"step": 2195,
"token_acc": 0.8931646005509641
},
{
"epoch": 0.28183448629259544,
"grad_norm": 2.703125,
"learning_rate": 1.7203854312785565e-05,
"loss": 0.26971442699432374,
"step": 2200,
"token_acc": 0.8943965517241379
},
{
"epoch": 0.28183448629259544,
"eval_loss": 0.33342963457107544,
"eval_runtime": 105.1061,
"eval_samples_per_second": 95.142,
"eval_steps_per_second": 11.893,
"eval_token_acc": 0.8773164425201123,
"step": 2200
},
{
"epoch": 0.2824750192159877,
"grad_norm": 4.46875,
"learning_rate": 1.7189145463491303e-05,
"loss": 0.271907377243042,
"step": 2205,
"token_acc": 0.8919140136992203
},
{
"epoch": 0.28311555213937994,
"grad_norm": 3.109375,
"learning_rate": 1.7174404351954485e-05,
"loss": 0.2717395782470703,
"step": 2210,
"token_acc": 0.8932424268576853
},
{
"epoch": 0.2837560850627722,
"grad_norm": 4.375,
"learning_rate": 1.7159631044327798e-05,
"loss": 0.26909971237182617,
"step": 2215,
"token_acc": 0.8958585509251253
},
{
"epoch": 0.2843966179861645,
"grad_norm": 9.0,
"learning_rate": 1.714482560690842e-05,
"loss": 0.2807865858078003,
"step": 2220,
"token_acc": 0.8910201273008773
},
{
"epoch": 0.2850371509095567,
"grad_norm": 4.71875,
"learning_rate": 1.7129988106137715e-05,
"loss": 0.2830962657928467,
"step": 2225,
"token_acc": 0.8901977644024076
},
{
"epoch": 0.285677683832949,
"grad_norm": 5.28125,
"learning_rate": 1.7115118608600925e-05,
"loss": 0.2666552782058716,
"step": 2230,
"token_acc": 0.8958189058171745
},
{
"epoch": 0.2863182167563413,
"grad_norm": 6.09375,
"learning_rate": 1.7100217181026898e-05,
"loss": 0.2754360198974609,
"step": 2235,
"token_acc": 0.8915355351893481
},
{
"epoch": 0.28695874967973356,
"grad_norm": 3.921875,
"learning_rate": 1.708528389028776e-05,
"loss": 0.2791600227355957,
"step": 2240,
"token_acc": 0.8892330193143201
},
{
"epoch": 0.2875992826031258,
"grad_norm": 5.71875,
"learning_rate": 1.707031880339863e-05,
"loss": 0.27956390380859375,
"step": 2245,
"token_acc": 0.8920872595843552
},
{
"epoch": 0.28823981552651806,
"grad_norm": 3.328125,
"learning_rate": 1.705532198751732e-05,
"loss": 0.27212765216827395,
"step": 2250,
"token_acc": 0.8932557638439992
},
{
"epoch": 0.28888034844991034,
"grad_norm": 8.1875,
"learning_rate": 1.7040293509944027e-05,
"loss": 0.27141647338867186,
"step": 2255,
"token_acc": 0.8947846000950611
},
{
"epoch": 0.28952088137330256,
"grad_norm": 3.5625,
"learning_rate": 1.7025233438121037e-05,
"loss": 0.27087936401367185,
"step": 2260,
"token_acc": 0.8955649693092418
},
{
"epoch": 0.29016141429669484,
"grad_norm": 3.59375,
"learning_rate": 1.7010141839632417e-05,
"loss": 0.27354631423950193,
"step": 2265,
"token_acc": 0.8951717573764818
},
{
"epoch": 0.2908019472200871,
"grad_norm": 3.0625,
"learning_rate": 1.699501878220371e-05,
"loss": 0.27661924362182616,
"step": 2270,
"token_acc": 0.8932264736297828
},
{
"epoch": 0.2914424801434794,
"grad_norm": 2.46875,
"learning_rate": 1.6979864333701645e-05,
"loss": 0.271943473815918,
"step": 2275,
"token_acc": 0.8923408845738943
},
{
"epoch": 0.2920830130668716,
"grad_norm": 9.75,
"learning_rate": 1.6964678562133815e-05,
"loss": 0.27072222232818605,
"step": 2280,
"token_acc": 0.8939870012482245
},
{
"epoch": 0.2927235459902639,
"grad_norm": 2.90625,
"learning_rate": 1.6949461535648377e-05,
"loss": 0.26898555755615233,
"step": 2285,
"token_acc": 0.8956255128039038
},
{
"epoch": 0.2933640789136562,
"grad_norm": 3.390625,
"learning_rate": 1.6934213322533758e-05,
"loss": 0.27256574630737307,
"step": 2290,
"token_acc": 0.8912772451743262
},
{
"epoch": 0.2940046118370484,
"grad_norm": 28.0,
"learning_rate": 1.6918933991218333e-05,
"loss": 0.28531837463378906,
"step": 2295,
"token_acc": 0.8895293813989503
},
{
"epoch": 0.2946451447604407,
"grad_norm": 8.8125,
"learning_rate": 1.6903623610270127e-05,
"loss": 0.28380842208862306,
"step": 2300,
"token_acc": 0.8899965475573969
},
{
"epoch": 0.2946451447604407,
"eval_loss": 0.32995760440826416,
"eval_runtime": 105.2266,
"eval_samples_per_second": 95.033,
"eval_steps_per_second": 11.879,
"eval_token_acc": 0.8770423728344342,
"step": 2300
},
{
"epoch": 0.29528567768383296,
"grad_norm": 33.5,
"learning_rate": 1.6888282248396498e-05,
"loss": 0.2725163459777832,
"step": 2305,
"token_acc": 0.8936618507051943
},
{
"epoch": 0.29592621060722524,
"grad_norm": 3.609375,
"learning_rate": 1.6872909974443847e-05,
"loss": 0.2721263885498047,
"step": 2310,
"token_acc": 0.8947889750215332
},
{
"epoch": 0.29656674353061746,
"grad_norm": 4.4375,
"learning_rate": 1.685750685739728e-05,
"loss": 0.27622146606445314,
"step": 2315,
"token_acc": 0.8931007685001295
},
{
"epoch": 0.29720727645400974,
"grad_norm": 2.859375,
"learning_rate": 1.6842072966380333e-05,
"loss": 0.274534273147583,
"step": 2320,
"token_acc": 0.8931854473263994
},
{
"epoch": 0.297847809377402,
"grad_norm": 3.375,
"learning_rate": 1.682660837065463e-05,
"loss": 0.2757120132446289,
"step": 2325,
"token_acc": 0.8937096079276174
},
{
"epoch": 0.29848834230079424,
"grad_norm": 3.1875,
"learning_rate": 1.6811113139619596e-05,
"loss": 0.276756739616394,
"step": 2330,
"token_acc": 0.8923500559332244
},
{
"epoch": 0.2991288752241865,
"grad_norm": 3.125,
"learning_rate": 1.6795587342812137e-05,
"loss": 0.2754298448562622,
"step": 2335,
"token_acc": 0.8925148925148925
},
{
"epoch": 0.2997694081475788,
"grad_norm": 3.484375,
"learning_rate": 1.6780031049906317e-05,
"loss": 0.2664804935455322,
"step": 2340,
"token_acc": 0.8968724315438854
},
{
"epoch": 0.300409941070971,
"grad_norm": 2.640625,
"learning_rate": 1.6764444330713062e-05,
"loss": 0.2691181182861328,
"step": 2345,
"token_acc": 0.8944920546057448
},
{
"epoch": 0.3010504739943633,
"grad_norm": 3.65625,
"learning_rate": 1.674882725517984e-05,
"loss": 0.27036800384521487,
"step": 2350,
"token_acc": 0.8949255020513928
},
{
"epoch": 0.3016910069177556,
"grad_norm": 2.578125,
"learning_rate": 1.6733179893390342e-05,
"loss": 0.2797673463821411,
"step": 2355,
"token_acc": 0.8923209292320929
},
{
"epoch": 0.30233153984114786,
"grad_norm": 13.1875,
"learning_rate": 1.671750231556419e-05,
"loss": 0.2723313093185425,
"step": 2360,
"token_acc": 0.894600767009954
},
{
"epoch": 0.3029720727645401,
"grad_norm": 5.21875,
"learning_rate": 1.6701794592056572e-05,
"loss": 0.26928038597106935,
"step": 2365,
"token_acc": 0.8950982509177283
},
{
"epoch": 0.30361260568793236,
"grad_norm": 2.703125,
"learning_rate": 1.6686056793357993e-05,
"loss": 0.27132067680358884,
"step": 2370,
"token_acc": 0.8939844120053395
},
{
"epoch": 0.30425313861132464,
"grad_norm": 3.28125,
"learning_rate": 1.6670288990093904e-05,
"loss": 0.2636139392852783,
"step": 2375,
"token_acc": 0.8964771990490599
},
{
"epoch": 0.30489367153471686,
"grad_norm": 2.375,
"learning_rate": 1.665449125302441e-05,
"loss": 0.2698176860809326,
"step": 2380,
"token_acc": 0.8951257453979777
},
{
"epoch": 0.30553420445810914,
"grad_norm": 4.96875,
"learning_rate": 1.663866365304395e-05,
"loss": 0.27487883567810056,
"step": 2385,
"token_acc": 0.8932954398656504
},
{
"epoch": 0.3061747373815014,
"grad_norm": 2.6875,
"learning_rate": 1.6622806261180975e-05,
"loss": 0.27799344062805176,
"step": 2390,
"token_acc": 0.8914015477214101
},
{
"epoch": 0.3068152703048937,
"grad_norm": 6.71875,
"learning_rate": 1.660691914859763e-05,
"loss": 0.2766709566116333,
"step": 2395,
"token_acc": 0.8933596431022649
},
{
"epoch": 0.3074558032282859,
"grad_norm": 3.03125,
"learning_rate": 1.659100238658944e-05,
"loss": 0.2766282081604004,
"step": 2400,
"token_acc": 0.8921505237294711
},
{
"epoch": 0.3074558032282859,
"eval_loss": 0.33286821842193604,
"eval_runtime": 102.9833,
"eval_samples_per_second": 97.103,
"eval_steps_per_second": 12.138,
"eval_token_acc": 0.8770562147377513,
"step": 2400
},
{
"epoch": 0.3080963361516782,
"grad_norm": 2.765625,
"learning_rate": 1.6575056046584982e-05,
"loss": 0.2664001703262329,
"step": 2405,
"token_acc": 0.8966157299490632
},
{
"epoch": 0.3087368690750705,
"grad_norm": 2.921875,
"learning_rate": 1.6559080200145565e-05,
"loss": 0.2731971740722656,
"step": 2410,
"token_acc": 0.8921074184232078
},
{
"epoch": 0.3093774019984627,
"grad_norm": 3.796875,
"learning_rate": 1.6543074918964923e-05,
"loss": 0.27004868984222413,
"step": 2415,
"token_acc": 0.8929048954065129
},
{
"epoch": 0.310017934921855,
"grad_norm": 2.71875,
"learning_rate": 1.652704027486887e-05,
"loss": 0.27138233184814453,
"step": 2420,
"token_acc": 0.8945260347129506
},
{
"epoch": 0.31065846784524725,
"grad_norm": 3.078125,
"learning_rate": 1.6510976339814998e-05,
"loss": 0.27827138900756837,
"step": 2425,
"token_acc": 0.8920770324415148
},
{
"epoch": 0.31129900076863953,
"grad_norm": 3.578125,
"learning_rate": 1.6494883185892345e-05,
"loss": 0.268726110458374,
"step": 2430,
"token_acc": 0.8919771764502464
},
{
"epoch": 0.31193953369203176,
"grad_norm": 6.4375,
"learning_rate": 1.647876088532107e-05,
"loss": 0.27257063388824465,
"step": 2435,
"token_acc": 0.8935447825339522
},
{
"epoch": 0.31258006661542403,
"grad_norm": 3.578125,
"learning_rate": 1.6462609510452126e-05,
"loss": 0.27083382606506345,
"step": 2440,
"token_acc": 0.8954088212535466
},
{
"epoch": 0.3132205995388163,
"grad_norm": 3.078125,
"learning_rate": 1.6446429133766955e-05,
"loss": 0.2705575942993164,
"step": 2445,
"token_acc": 0.8939740326963723
},
{
"epoch": 0.31386113246220854,
"grad_norm": 2.6875,
"learning_rate": 1.6430219827877137e-05,
"loss": 0.27445831298828127,
"step": 2450,
"token_acc": 0.892545649838883
},
{
"epoch": 0.3145016653856008,
"grad_norm": 2.78125,
"learning_rate": 1.641398166552408e-05,
"loss": 0.26441996097564696,
"step": 2455,
"token_acc": 0.8965159505489755
},
{
"epoch": 0.3151421983089931,
"grad_norm": 4.3125,
"learning_rate": 1.6397714719578692e-05,
"loss": 0.2621718406677246,
"step": 2460,
"token_acc": 0.8974180950314602
},
{
"epoch": 0.31578273123238537,
"grad_norm": 4.96875,
"learning_rate": 1.6381419063041044e-05,
"loss": 0.2664108991622925,
"step": 2465,
"token_acc": 0.8976900534390623
},
{
"epoch": 0.3164232641557776,
"grad_norm": 3.171875,
"learning_rate": 1.636509476904005e-05,
"loss": 0.26438174247741697,
"step": 2470,
"token_acc": 0.8964934333145508
},
{
"epoch": 0.3170637970791699,
"grad_norm": 3.390625,
"learning_rate": 1.634874191083315e-05,
"loss": 0.2664673328399658,
"step": 2475,
"token_acc": 0.8969490355154706
},
{
"epoch": 0.31770433000256215,
"grad_norm": 7.3125,
"learning_rate": 1.6332360561805953e-05,
"loss": 0.2602536678314209,
"step": 2480,
"token_acc": 0.8977645434144658
},
{
"epoch": 0.3183448629259544,
"grad_norm": 3.0625,
"learning_rate": 1.631595079547194e-05,
"loss": 0.26571226119995117,
"step": 2485,
"token_acc": 0.8940942154485625
},
{
"epoch": 0.31898539584934665,
"grad_norm": 2.953125,
"learning_rate": 1.6299512685472104e-05,
"loss": 0.2715281009674072,
"step": 2490,
"token_acc": 0.8944671689989235
},
{
"epoch": 0.31962592877273893,
"grad_norm": 2.375,
"learning_rate": 1.6283046305574646e-05,
"loss": 0.26947875022888185,
"step": 2495,
"token_acc": 0.8938190607734806
},
{
"epoch": 0.32026646169613116,
"grad_norm": 2.65625,
"learning_rate": 1.6266551729674625e-05,
"loss": 0.26917757987976076,
"step": 2500,
"token_acc": 0.8948387096774194
},
{
"epoch": 0.32026646169613116,
"eval_loss": 0.33583664894104004,
"eval_runtime": 101.9417,
"eval_samples_per_second": 98.095,
"eval_steps_per_second": 12.262,
"eval_token_acc": 0.8780445266345903,
"step": 2500
},
{
"epoch": 0.32090699461952343,
"grad_norm": 2.8125,
"learning_rate": 1.6250029031793637e-05,
"loss": 0.26485161781311034,
"step": 2505,
"token_acc": 0.8968752690023242
},
{
"epoch": 0.3215475275429157,
"grad_norm": 3.515625,
"learning_rate": 1.623347828607948e-05,
"loss": 0.27197585105895994,
"step": 2510,
"token_acc": 0.8942249763318703
},
{
"epoch": 0.322188060466308,
"grad_norm": 3.53125,
"learning_rate": 1.621689956680581e-05,
"loss": 0.26804704666137696,
"step": 2515,
"token_acc": 0.8950142573230796
},
{
"epoch": 0.3228285933897002,
"grad_norm": 3.390625,
"learning_rate": 1.6200292948371826e-05,
"loss": 0.27621660232543943,
"step": 2520,
"token_acc": 0.8921281543364051
},
{
"epoch": 0.3234691263130925,
"grad_norm": 7.625,
"learning_rate": 1.6183658505301937e-05,
"loss": 0.270648455619812,
"step": 2525,
"token_acc": 0.8940274727640701
},
{
"epoch": 0.32410965923648477,
"grad_norm": 3.9375,
"learning_rate": 1.6166996312245403e-05,
"loss": 0.2624387502670288,
"step": 2530,
"token_acc": 0.8973961027763407
},
{
"epoch": 0.324750192159877,
"grad_norm": 18.0,
"learning_rate": 1.6150306443976026e-05,
"loss": 0.270206356048584,
"step": 2535,
"token_acc": 0.8947050707140394
},
{
"epoch": 0.32539072508326927,
"grad_norm": 7.5625,
"learning_rate": 1.6133588975391793e-05,
"loss": 0.26768012046813966,
"step": 2540,
"token_acc": 0.8949362728212195
},
{
"epoch": 0.32603125800666155,
"grad_norm": 2.53125,
"learning_rate": 1.6116843981514568e-05,
"loss": 0.265167760848999,
"step": 2545,
"token_acc": 0.895794614686433
},
{
"epoch": 0.32667179093005383,
"grad_norm": 2.34375,
"learning_rate": 1.6100071537489726e-05,
"loss": 0.2654293060302734,
"step": 2550,
"token_acc": 0.8955745341614907
},
{
"epoch": 0.32731232385344605,
"grad_norm": 4.71875,
"learning_rate": 1.6083271718585828e-05,
"loss": 0.2678376197814941,
"step": 2555,
"token_acc": 0.8955519229114687
},
{
"epoch": 0.32795285677683833,
"grad_norm": 4.8125,
"learning_rate": 1.606644460019429e-05,
"loss": 0.2697244644165039,
"step": 2560,
"token_acc": 0.8963868911760906
},
{
"epoch": 0.3285933897002306,
"grad_norm": 3.390625,
"learning_rate": 1.604959025782904e-05,
"loss": 0.2643167972564697,
"step": 2565,
"token_acc": 0.8960286936606024
},
{
"epoch": 0.32923392262362283,
"grad_norm": 4.0625,
"learning_rate": 1.6032708767126158e-05,
"loss": 0.2669541835784912,
"step": 2570,
"token_acc": 0.8956862407439298
},
{
"epoch": 0.3298744555470151,
"grad_norm": 7.125,
"learning_rate": 1.601580020384358e-05,
"loss": 0.27081780433654784,
"step": 2575,
"token_acc": 0.8950130095403296
},
{
"epoch": 0.3305149884704074,
"grad_norm": 6.9375,
"learning_rate": 1.5998864643860723e-05,
"loss": 0.25800356864929197,
"step": 2580,
"token_acc": 0.8975890576981345
},
{
"epoch": 0.33115552139379967,
"grad_norm": 3.078125,
"learning_rate": 1.5981902163178152e-05,
"loss": 0.26956448554992674,
"step": 2585,
"token_acc": 0.8931310867878997
},
{
"epoch": 0.3317960543171919,
"grad_norm": 2.34375,
"learning_rate": 1.596491283791725e-05,
"loss": 0.26112003326416017,
"step": 2590,
"token_acc": 0.8994970516822755
},
{
"epoch": 0.33243658724058417,
"grad_norm": 3.265625,
"learning_rate": 1.594789674431986e-05,
"loss": 0.27035064697265626,
"step": 2595,
"token_acc": 0.8960434445306439
},
{
"epoch": 0.33307712016397645,
"grad_norm": 3.140625,
"learning_rate": 1.593085395874796e-05,
"loss": 0.27758283615112306,
"step": 2600,
"token_acc": 0.8915802607236587
},
{
"epoch": 0.33307712016397645,
"eval_loss": 0.3308471143245697,
"eval_runtime": 103.8949,
"eval_samples_per_second": 96.251,
"eval_steps_per_second": 12.031,
"eval_token_acc": 0.8782355449003659,
"step": 2600
},
{
"epoch": 0.33371765308736867,
"grad_norm": 3.203125,
"learning_rate": 1.5913784557683304e-05,
"loss": 0.2707799196243286,
"step": 2605,
"token_acc": 0.8923612603705455
},
{
"epoch": 0.33435818601076095,
"grad_norm": 2.609375,
"learning_rate": 1.5896688617727095e-05,
"loss": 0.2663607120513916,
"step": 2610,
"token_acc": 0.8958297432362571
},
{
"epoch": 0.33499871893415323,
"grad_norm": 2.859375,
"learning_rate": 1.5879566215599623e-05,
"loss": 0.2679924488067627,
"step": 2615,
"token_acc": 0.8955616958811847
},
{
"epoch": 0.33563925185754545,
"grad_norm": 3.046875,
"learning_rate": 1.5862417428139938e-05,
"loss": 0.268009090423584,
"step": 2620,
"token_acc": 0.8962800309997416
},
{
"epoch": 0.33627978478093773,
"grad_norm": 3.234375,
"learning_rate": 1.5845242332305496e-05,
"loss": 0.257326078414917,
"step": 2625,
"token_acc": 0.8986603284356093
},
{
"epoch": 0.33692031770433,
"grad_norm": 3.078125,
"learning_rate": 1.5828041005171818e-05,
"loss": 0.2634852647781372,
"step": 2630,
"token_acc": 0.8965665605369589
},
{
"epoch": 0.3375608506277223,
"grad_norm": 3.453125,
"learning_rate": 1.581081352393213e-05,
"loss": 0.2582373857498169,
"step": 2635,
"token_acc": 0.8991817398794143
},
{
"epoch": 0.3382013835511145,
"grad_norm": 3.6875,
"learning_rate": 1.5793559965897042e-05,
"loss": 0.27222495079040526,
"step": 2640,
"token_acc": 0.891765924391507
},
{
"epoch": 0.3388419164745068,
"grad_norm": 2.703125,
"learning_rate": 1.577628040849418e-05,
"loss": 0.2661598205566406,
"step": 2645,
"token_acc": 0.8968578940562907
},
{
"epoch": 0.33948244939789907,
"grad_norm": 5.53125,
"learning_rate": 1.5758974929267844e-05,
"loss": 0.2645248889923096,
"step": 2650,
"token_acc": 0.899624563069089
},
{
"epoch": 0.3401229823212913,
"grad_norm": 3.359375,
"learning_rate": 1.574164360587867e-05,
"loss": 0.2611443281173706,
"step": 2655,
"token_acc": 0.8988632925616977
},
{
"epoch": 0.34076351524468357,
"grad_norm": 2.328125,
"learning_rate": 1.572428651610326e-05,
"loss": 0.27028732299804686,
"step": 2660,
"token_acc": 0.8950970685721665
},
{
"epoch": 0.34140404816807585,
"grad_norm": 2.609375,
"learning_rate": 1.570690373783386e-05,
"loss": 0.2680711269378662,
"step": 2665,
"token_acc": 0.8941867495897037
},
{
"epoch": 0.3420445810914681,
"grad_norm": 2.796875,
"learning_rate": 1.5689495349077984e-05,
"loss": 0.2609850406646729,
"step": 2670,
"token_acc": 0.8973326405126872
},
{
"epoch": 0.34268511401486035,
"grad_norm": 2.984375,
"learning_rate": 1.5672061427958086e-05,
"loss": 0.26308517456054686,
"step": 2675,
"token_acc": 0.896771416272062
},
{
"epoch": 0.3433256469382526,
"grad_norm": 2.875,
"learning_rate": 1.5654602052711202e-05,
"loss": 0.27320644855499265,
"step": 2680,
"token_acc": 0.8943637769567833
},
{
"epoch": 0.3439661798616449,
"grad_norm": 2.515625,
"learning_rate": 1.563711730168858e-05,
"loss": 0.26294333934783937,
"step": 2685,
"token_acc": 0.8980288752485519
},
{
"epoch": 0.34460671278503713,
"grad_norm": 2.5,
"learning_rate": 1.5619607253355365e-05,
"loss": 0.2679460048675537,
"step": 2690,
"token_acc": 0.8959211771792445
},
{
"epoch": 0.3452472457084294,
"grad_norm": 2.625,
"learning_rate": 1.5602071986290214e-05,
"loss": 0.2540433883666992,
"step": 2695,
"token_acc": 0.8991190188288133
},
{
"epoch": 0.3458877786318217,
"grad_norm": 3.734375,
"learning_rate": 1.558451157918496e-05,
"loss": 0.26918482780456543,
"step": 2700,
"token_acc": 0.8953965852604279
},
{
"epoch": 0.3458877786318217,
"eval_loss": 0.33141687512397766,
"eval_runtime": 102.8827,
"eval_samples_per_second": 97.198,
"eval_steps_per_second": 12.15,
"eval_token_acc": 0.8783047544169513,
"step": 2700
},
{
"epoch": 0.34652831155521396,
"grad_norm": 2.8125,
"learning_rate": 1.556692611084426e-05,
"loss": 0.2630035400390625,
"step": 2705,
"token_acc": 0.8983968711049986
},
{
"epoch": 0.3471688444786062,
"grad_norm": 5.09375,
"learning_rate": 1.554931566018523e-05,
"loss": 0.26360278129577636,
"step": 2710,
"token_acc": 0.8968615649183147
},
{
"epoch": 0.34780937740199847,
"grad_norm": 2.671875,
"learning_rate": 1.55316803062371e-05,
"loss": 0.25818705558776855,
"step": 2715,
"token_acc": 0.8981868297514967
},
{
"epoch": 0.34844991032539074,
"grad_norm": 2.921875,
"learning_rate": 1.5514020128140854e-05,
"loss": 0.26247010231018064,
"step": 2720,
"token_acc": 0.8978712401965009
},
{
"epoch": 0.34909044324878297,
"grad_norm": 3.3125,
"learning_rate": 1.5496335205148888e-05,
"loss": 0.26362130641937254,
"step": 2725,
"token_acc": 0.8962732651034244
},
{
"epoch": 0.34973097617217525,
"grad_norm": 2.328125,
"learning_rate": 1.547862561662463e-05,
"loss": 0.26531424522399905,
"step": 2730,
"token_acc": 0.8975365632684758
},
{
"epoch": 0.3503715090955675,
"grad_norm": 5.3125,
"learning_rate": 1.546089144204221e-05,
"loss": 0.2550010919570923,
"step": 2735,
"token_acc": 0.9006299620296859
},
{
"epoch": 0.3510120420189598,
"grad_norm": 2.890625,
"learning_rate": 1.5443132760986077e-05,
"loss": 0.25297343730926514,
"step": 2740,
"token_acc": 0.9003972023141352
},
{
"epoch": 0.351652574942352,
"grad_norm": 3.421875,
"learning_rate": 1.5425349653150674e-05,
"loss": 0.2688558578491211,
"step": 2745,
"token_acc": 0.8946644309729567
},
{
"epoch": 0.3522931078657443,
"grad_norm": 3.765625,
"learning_rate": 1.5407542198340045e-05,
"loss": 0.25696539878845215,
"step": 2750,
"token_acc": 0.8994601597927013
},
{
"epoch": 0.3529336407891366,
"grad_norm": 2.671875,
"learning_rate": 1.538971047646751e-05,
"loss": 0.2645355224609375,
"step": 2755,
"token_acc": 0.8972590932597828
},
{
"epoch": 0.3535741737125288,
"grad_norm": 3.09375,
"learning_rate": 1.537185456755528e-05,
"loss": 0.2609572410583496,
"step": 2760,
"token_acc": 0.8978304088512404
},
{
"epoch": 0.3542147066359211,
"grad_norm": 3.28125,
"learning_rate": 1.5353974551734102e-05,
"loss": 0.25736873149871825,
"step": 2765,
"token_acc": 0.9001164445594514
},
{
"epoch": 0.35485523955931336,
"grad_norm": 2.890625,
"learning_rate": 1.533607050924293e-05,
"loss": 0.2622791290283203,
"step": 2770,
"token_acc": 0.8979319258940112
},
{
"epoch": 0.3554957724827056,
"grad_norm": 3.375,
"learning_rate": 1.531814252042852e-05,
"loss": 0.2560434818267822,
"step": 2775,
"token_acc": 0.9002417335750669
},
{
"epoch": 0.35613630540609786,
"grad_norm": 3.109375,
"learning_rate": 1.5300190665745097e-05,
"loss": 0.26474769115448,
"step": 2780,
"token_acc": 0.8953059298034841
},
{
"epoch": 0.35677683832949014,
"grad_norm": 7.5625,
"learning_rate": 1.5282215025753984e-05,
"loss": 0.2650959014892578,
"step": 2785,
"token_acc": 0.8951942520328701
},
{
"epoch": 0.3574173712528824,
"grad_norm": 2.71875,
"learning_rate": 1.526421568112325e-05,
"loss": 0.26280429363250735,
"step": 2790,
"token_acc": 0.8963790945578525
},
{
"epoch": 0.35805790417627464,
"grad_norm": 8.0625,
"learning_rate": 1.5246192712627341e-05,
"loss": 0.2684659957885742,
"step": 2795,
"token_acc": 0.8949045957703406
},
{
"epoch": 0.3586984370996669,
"grad_norm": 3.390625,
"learning_rate": 1.522814620114671e-05,
"loss": 0.2673259019851685,
"step": 2800,
"token_acc": 0.8951397849462366
},
{
"epoch": 0.3586984370996669,
"eval_loss": 0.3319157361984253,
"eval_runtime": 102.1149,
"eval_samples_per_second": 97.929,
"eval_steps_per_second": 12.241,
"eval_token_acc": 0.8786978644711563,
"step": 2800
},
{
"epoch": 0.3593389700230592,
"grad_norm": 2.734375,
"learning_rate": 1.5210076227667467e-05,
"loss": 0.26007418632507323,
"step": 2805,
"token_acc": 0.898749460974558
},
{
"epoch": 0.3599795029464514,
"grad_norm": 3.328125,
"learning_rate": 1.5191982873281016e-05,
"loss": 0.2620399951934814,
"step": 2810,
"token_acc": 0.8979195441988951
},
{
"epoch": 0.3606200358698437,
"grad_norm": 3.203125,
"learning_rate": 1.5173866219183681e-05,
"loss": 0.2614466667175293,
"step": 2815,
"token_acc": 0.8992734192543897
},
{
"epoch": 0.361260568793236,
"grad_norm": 2.5,
"learning_rate": 1.5155726346676342e-05,
"loss": 0.2509075880050659,
"step": 2820,
"token_acc": 0.9027616216449097
},
{
"epoch": 0.36190110171662826,
"grad_norm": 2.78125,
"learning_rate": 1.5137563337164088e-05,
"loss": 0.26183514595031737,
"step": 2825,
"token_acc": 0.8969877438287589
},
{
"epoch": 0.3625416346400205,
"grad_norm": 2.84375,
"learning_rate": 1.5119377272155821e-05,
"loss": 0.2658205032348633,
"step": 2830,
"token_acc": 0.8951929295106704
},
{
"epoch": 0.36318216756341276,
"grad_norm": 3.359375,
"learning_rate": 1.5101168233263925e-05,
"loss": 0.25493884086608887,
"step": 2835,
"token_acc": 0.9017814778070138
},
{
"epoch": 0.36382270048680504,
"grad_norm": 4.125,
"learning_rate": 1.508293630220387e-05,
"loss": 0.2533620119094849,
"step": 2840,
"token_acc": 0.8996077417130048
},
{
"epoch": 0.36446323341019726,
"grad_norm": 3.5,
"learning_rate": 1.506468156079386e-05,
"loss": 0.2602185010910034,
"step": 2845,
"token_acc": 0.8991640093079376
},
{
"epoch": 0.36510376633358954,
"grad_norm": 2.90625,
"learning_rate": 1.5046404090954467e-05,
"loss": 0.26317653656005857,
"step": 2850,
"token_acc": 0.8983934186156696
},
{
"epoch": 0.3657442992569818,
"grad_norm": 3.9375,
"learning_rate": 1.5028103974708259e-05,
"loss": 0.2617523670196533,
"step": 2855,
"token_acc": 0.8986605796976614
},
{
"epoch": 0.3663848321803741,
"grad_norm": 3.609375,
"learning_rate": 1.5009781294179431e-05,
"loss": 0.2595290899276733,
"step": 2860,
"token_acc": 0.8996727523251808
},
{
"epoch": 0.3670253651037663,
"grad_norm": 7.65625,
"learning_rate": 1.4991436131593438e-05,
"loss": 0.2566396236419678,
"step": 2865,
"token_acc": 0.8992037873897138
},
{
"epoch": 0.3676658980271586,
"grad_norm": 3.953125,
"learning_rate": 1.4973068569276627e-05,
"loss": 0.2593822479248047,
"step": 2870,
"token_acc": 0.8983920334526017
},
{
"epoch": 0.3683064309505509,
"grad_norm": 3.71875,
"learning_rate": 1.495467868965587e-05,
"loss": 0.25176091194152833,
"step": 2875,
"token_acc": 0.8993063035891249
},
{
"epoch": 0.3689469638739431,
"grad_norm": 5.6875,
"learning_rate": 1.4936266575258184e-05,
"loss": 0.26164243221282957,
"step": 2880,
"token_acc": 0.8975323047668439
},
{
"epoch": 0.3695874967973354,
"grad_norm": 8.1875,
"learning_rate": 1.4917832308710374e-05,
"loss": 0.2630914211273193,
"step": 2885,
"token_acc": 0.897822806639362
},
{
"epoch": 0.37022802972072766,
"grad_norm": 2.890625,
"learning_rate": 1.489937597273865e-05,
"loss": 0.26312851905822754,
"step": 2890,
"token_acc": 0.8980260322386001
},
{
"epoch": 0.3708685626441199,
"grad_norm": 2.71875,
"learning_rate": 1.4880897650168269e-05,
"loss": 0.26306843757629395,
"step": 2895,
"token_acc": 0.8972188633615478
},
{
"epoch": 0.37150909556751216,
"grad_norm": 6.25,
"learning_rate": 1.4862397423923148e-05,
"loss": 0.2542487382888794,
"step": 2900,
"token_acc": 0.9007903943333477
},
{
"epoch": 0.37150909556751216,
"eval_loss": 0.3293861448764801,
"eval_runtime": 103.439,
"eval_samples_per_second": 96.675,
"eval_steps_per_second": 12.084,
"eval_token_acc": 0.8789276400662197,
"step": 2900
},
{
"epoch": 0.37214962849090444,
"grad_norm": 5.3125,
"learning_rate": 1.48438753770255e-05,
"loss": 0.2586365222930908,
"step": 2905,
"token_acc": 0.8990952175786299
},
{
"epoch": 0.3727901614142967,
"grad_norm": 3.40625,
"learning_rate": 1.4825331592595471e-05,
"loss": 0.25507054328918455,
"step": 2910,
"token_acc": 0.8996947418203706
},
{
"epoch": 0.37343069433768894,
"grad_norm": 3.796875,
"learning_rate": 1.480676615385074e-05,
"loss": 0.25874695777893064,
"step": 2915,
"token_acc": 0.8979389016117184
},
{
"epoch": 0.3740712272610812,
"grad_norm": 3.03125,
"learning_rate": 1.4788179144106187e-05,
"loss": 0.2610326766967773,
"step": 2920,
"token_acc": 0.8965872504829363
},
{
"epoch": 0.3747117601844735,
"grad_norm": 2.96875,
"learning_rate": 1.4769570646773469e-05,
"loss": 0.25159320831298826,
"step": 2925,
"token_acc": 0.9015691868758916
},
{
"epoch": 0.3753522931078657,
"grad_norm": 3.0,
"learning_rate": 1.4750940745360683e-05,
"loss": 0.2555972099304199,
"step": 2930,
"token_acc": 0.9030378872874774
},
{
"epoch": 0.375992826031258,
"grad_norm": 3.359375,
"learning_rate": 1.4732289523471983e-05,
"loss": 0.25429458618164064,
"step": 2935,
"token_acc": 0.901710690739863
},
{
"epoch": 0.3766333589546503,
"grad_norm": 3.015625,
"learning_rate": 1.47136170648072e-05,
"loss": 0.260566258430481,
"step": 2940,
"token_acc": 0.8977762454749181
},
{
"epoch": 0.37727389187804256,
"grad_norm": 4.4375,
"learning_rate": 1.469492345316146e-05,
"loss": 0.2575147390365601,
"step": 2945,
"token_acc": 0.9003219575016098
},
{
"epoch": 0.3779144248014348,
"grad_norm": 3.078125,
"learning_rate": 1.4676208772424825e-05,
"loss": 0.26031718254089353,
"step": 2950,
"token_acc": 0.8967789165446559
},
{
"epoch": 0.37855495772482706,
"grad_norm": 3.9375,
"learning_rate": 1.4657473106581903e-05,
"loss": 0.2566239356994629,
"step": 2955,
"token_acc": 0.900069096562446
},
{
"epoch": 0.37919549064821934,
"grad_norm": 24.5,
"learning_rate": 1.4638716539711477e-05,
"loss": 0.26539459228515627,
"step": 2960,
"token_acc": 0.8971825516676015
},
{
"epoch": 0.37983602357161156,
"grad_norm": 2.796875,
"learning_rate": 1.4619939155986122e-05,
"loss": 0.2547321081161499,
"step": 2965,
"token_acc": 0.9001380977041257
},
{
"epoch": 0.38047655649500384,
"grad_norm": 3.53125,
"learning_rate": 1.4601141039671837e-05,
"loss": 0.26095755100250245,
"step": 2970,
"token_acc": 0.8989492800622649
},
{
"epoch": 0.3811170894183961,
"grad_norm": 2.65625,
"learning_rate": 1.4582322275127663e-05,
"loss": 0.2595865726470947,
"step": 2975,
"token_acc": 0.8997066689673022
},
{
"epoch": 0.3817576223417884,
"grad_norm": 3.0625,
"learning_rate": 1.4563482946805291e-05,
"loss": 0.2566410541534424,
"step": 2980,
"token_acc": 0.8992691315563198
},
{
"epoch": 0.3823981552651806,
"grad_norm": 3.734375,
"learning_rate": 1.4544623139248707e-05,
"loss": 0.26386113166809083,
"step": 2985,
"token_acc": 0.8982700748773561
},
{
"epoch": 0.3830386881885729,
"grad_norm": 4.15625,
"learning_rate": 1.4525742937093797e-05,
"loss": 0.2548778533935547,
"step": 2990,
"token_acc": 0.8996550237171195
},
{
"epoch": 0.3836792211119652,
"grad_norm": 2.84375,
"learning_rate": 1.4506842425067963e-05,
"loss": 0.2560065746307373,
"step": 2995,
"token_acc": 0.8996068944662836
},
{
"epoch": 0.3843197540353574,
"grad_norm": 3.84375,
"learning_rate": 1.4487921687989763e-05,
"loss": 0.2564894676208496,
"step": 3000,
"token_acc": 0.8991150442477877
},
{
"epoch": 0.3843197540353574,
"eval_loss": 0.33271661400794983,
"eval_runtime": 102.5035,
"eval_samples_per_second": 97.558,
"eval_steps_per_second": 12.195,
"eval_token_acc": 0.879677871226005,
"step": 3000
},
{
"epoch": 0.3849602869587497,
"grad_norm": 3.09375,
"learning_rate": 1.4468980810768507e-05,
"loss": 0.2549588203430176,
"step": 3005,
"token_acc": 0.9006631071305546
},
{
"epoch": 0.38560081988214195,
"grad_norm": 2.296875,
"learning_rate": 1.4450019878403894e-05,
"loss": 0.256690239906311,
"step": 3010,
"token_acc": 0.9002636014001123
},
{
"epoch": 0.3862413528055342,
"grad_norm": 6.46875,
"learning_rate": 1.4431038975985616e-05,
"loss": 0.2593832969665527,
"step": 3015,
"token_acc": 0.8996518973741888
},
{
"epoch": 0.38688188572892646,
"grad_norm": 5.46875,
"learning_rate": 1.441203818869299e-05,
"loss": 0.26778130531311034,
"step": 3020,
"token_acc": 0.8993683667769519
},
{
"epoch": 0.38752241865231873,
"grad_norm": 3.15625,
"learning_rate": 1.4393017601794558e-05,
"loss": 0.25722360610961914,
"step": 3025,
"token_acc": 0.8998792687133494
},
{
"epoch": 0.388162951575711,
"grad_norm": 3.671875,
"learning_rate": 1.4373977300647735e-05,
"loss": 0.25923748016357423,
"step": 3030,
"token_acc": 0.8972777082704172
},
{
"epoch": 0.38880348449910324,
"grad_norm": 3.234375,
"learning_rate": 1.4354917370698388e-05,
"loss": 0.24125347137451172,
"step": 3035,
"token_acc": 0.9047619047619048
},
{
"epoch": 0.3894440174224955,
"grad_norm": 2.59375,
"learning_rate": 1.4335837897480475e-05,
"loss": 0.26301088333129885,
"step": 3040,
"token_acc": 0.89577136945558
},
{
"epoch": 0.3900845503458878,
"grad_norm": 5.1875,
"learning_rate": 1.4316738966615665e-05,
"loss": 0.25510516166687014,
"step": 3045,
"token_acc": 0.9006117525417887
},
{
"epoch": 0.39072508326928,
"grad_norm": 4.0,
"learning_rate": 1.4297620663812934e-05,
"loss": 0.26404881477355957,
"step": 3050,
"token_acc": 0.8973620897061351
},
{
"epoch": 0.3913656161926723,
"grad_norm": 2.921875,
"learning_rate": 1.4278483074868206e-05,
"loss": 0.2587254524230957,
"step": 3055,
"token_acc": 0.8988434317279476
},
{
"epoch": 0.3920061491160646,
"grad_norm": 2.625,
"learning_rate": 1.4259326285663942e-05,
"loss": 0.2552812576293945,
"step": 3060,
"token_acc": 0.9003674086881348
},
{
"epoch": 0.39264668203945685,
"grad_norm": 2.859375,
"learning_rate": 1.4240150382168766e-05,
"loss": 0.2574739933013916,
"step": 3065,
"token_acc": 0.9006264852019875
},
{
"epoch": 0.3932872149628491,
"grad_norm": 3.109375,
"learning_rate": 1.4220955450437097e-05,
"loss": 0.2653143644332886,
"step": 3070,
"token_acc": 0.8960445153776474
},
{
"epoch": 0.39392774788624135,
"grad_norm": 3.125,
"learning_rate": 1.4201741576608724e-05,
"loss": 0.2522631883621216,
"step": 3075,
"token_acc": 0.9006473888649115
},
{
"epoch": 0.39456828080963363,
"grad_norm": 2.75,
"learning_rate": 1.4182508846908456e-05,
"loss": 0.25041637420654295,
"step": 3080,
"token_acc": 0.9022227988237329
},
{
"epoch": 0.39520881373302585,
"grad_norm": 2.359375,
"learning_rate": 1.4163257347645711e-05,
"loss": 0.25646038055419923,
"step": 3085,
"token_acc": 0.8983489244298831
},
{
"epoch": 0.39584934665641813,
"grad_norm": 3.46875,
"learning_rate": 1.4143987165214146e-05,
"loss": 0.2523691654205322,
"step": 3090,
"token_acc": 0.9009485036164407
},
{
"epoch": 0.3964898795798104,
"grad_norm": 3.40625,
"learning_rate": 1.4124698386091256e-05,
"loss": 0.2536661148071289,
"step": 3095,
"token_acc": 0.8998623537508603
},
{
"epoch": 0.3971304125032027,
"grad_norm": 3.046875,
"learning_rate": 1.4105391096837988e-05,
"loss": 0.25694756507873534,
"step": 3100,
"token_acc": 0.8986861942709455
},
{
"epoch": 0.3971304125032027,
"eval_loss": 0.3274412453174591,
"eval_runtime": 105.1272,
"eval_samples_per_second": 95.123,
"eval_steps_per_second": 11.89,
"eval_token_acc": 0.880165106222766,
"step": 3100
},
{
"epoch": 0.3977709454265949,
"grad_norm": 2.90625,
"learning_rate": 1.4086065384098367e-05,
"loss": 0.2536616802215576,
"step": 3105,
"token_acc": 0.9024840983324738
},
{
"epoch": 0.3984114783499872,
"grad_norm": 2.65625,
"learning_rate": 1.4066721334599084e-05,
"loss": 0.2547293663024902,
"step": 3110,
"token_acc": 0.8989598169968492
},
{
"epoch": 0.39905201127337947,
"grad_norm": 2.4375,
"learning_rate": 1.4047359035149126e-05,
"loss": 0.24942498207092284,
"step": 3115,
"token_acc": 0.9021344624956792
},
{
"epoch": 0.3996925441967717,
"grad_norm": 2.953125,
"learning_rate": 1.4027978572639375e-05,
"loss": 0.25708999633789065,
"step": 3120,
"token_acc": 0.900335801618736
},
{
"epoch": 0.40033307712016397,
"grad_norm": 3.25,
"learning_rate": 1.4008580034042226e-05,
"loss": 0.254312539100647,
"step": 3125,
"token_acc": 0.9001466528640442
},
{
"epoch": 0.40097361004355625,
"grad_norm": 4.21875,
"learning_rate": 1.3989163506411187e-05,
"loss": 0.25107884407043457,
"step": 3130,
"token_acc": 0.902516670251667
},
{
"epoch": 0.40161414296694853,
"grad_norm": 2.953125,
"learning_rate": 1.39697290768805e-05,
"loss": 0.24998018741607667,
"step": 3135,
"token_acc": 0.9032967981358418
},
{
"epoch": 0.40225467589034075,
"grad_norm": 2.75,
"learning_rate": 1.3950276832664745e-05,
"loss": 0.2500455856323242,
"step": 3140,
"token_acc": 0.9015105740181268
},
{
"epoch": 0.40289520881373303,
"grad_norm": 4.25,
"learning_rate": 1.3930806861058438e-05,
"loss": 0.25563080310821534,
"step": 3145,
"token_acc": 0.8991560454702032
},
{
"epoch": 0.4035357417371253,
"grad_norm": 3.0,
"learning_rate": 1.3911319249435657e-05,
"loss": 0.25497581958770754,
"step": 3150,
"token_acc": 0.8996980155306299
},
{
"epoch": 0.40417627466051753,
"grad_norm": 2.796875,
"learning_rate": 1.3891814085249644e-05,
"loss": 0.25732955932617185,
"step": 3155,
"token_acc": 0.8988880268942333
},
{
"epoch": 0.4048168075839098,
"grad_norm": 8.375,
"learning_rate": 1.3872291456032405e-05,
"loss": 0.2536874294281006,
"step": 3160,
"token_acc": 0.9003486720330592
},
{
"epoch": 0.4054573405073021,
"grad_norm": 2.6875,
"learning_rate": 1.3852751449394324e-05,
"loss": 0.2530160427093506,
"step": 3165,
"token_acc": 0.9021668892430965
},
{
"epoch": 0.4060978734306943,
"grad_norm": 3.109375,
"learning_rate": 1.383319415302377e-05,
"loss": 0.2553149938583374,
"step": 3170,
"token_acc": 0.8996287342427906
},
{
"epoch": 0.4067384063540866,
"grad_norm": 4.40625,
"learning_rate": 1.3813619654686703e-05,
"loss": 0.25406613349914553,
"step": 3175,
"token_acc": 0.9016216216216216
},
{
"epoch": 0.40737893927747887,
"grad_norm": 2.859375,
"learning_rate": 1.3794028042226273e-05,
"loss": 0.2548455476760864,
"step": 3180,
"token_acc": 0.9005142832447384
},
{
"epoch": 0.40801947220087115,
"grad_norm": 3.15625,
"learning_rate": 1.3774419403562437e-05,
"loss": 0.2509315013885498,
"step": 3185,
"token_acc": 0.9003025064822817
},
{
"epoch": 0.40866000512426337,
"grad_norm": 2.609375,
"learning_rate": 1.3754793826691565e-05,
"loss": 0.2544880390167236,
"step": 3190,
"token_acc": 0.8993057052913019
},
{
"epoch": 0.40930053804765565,
"grad_norm": 2.546875,
"learning_rate": 1.3735151399686024e-05,
"loss": 0.25415782928466796,
"step": 3195,
"token_acc": 0.9004563850856798
},
{
"epoch": 0.4099410709710479,
"grad_norm": 2.734375,
"learning_rate": 1.371549221069381e-05,
"loss": 0.252706241607666,
"step": 3200,
"token_acc": 0.9001977984176126
},
{
"epoch": 0.4099410709710479,
"eval_loss": 0.3304235339164734,
"eval_runtime": 103.4877,
"eval_samples_per_second": 96.63,
"eval_steps_per_second": 12.079,
"eval_token_acc": 0.8798495108271368,
"step": 3200
},
{
"epoch": 0.41058160389444015,
"grad_norm": 3.171875,
"learning_rate": 1.369581634793814e-05,
"loss": 0.24554102420806884,
"step": 3205,
"token_acc": 0.9046261500583128
},
{
"epoch": 0.41122213681783243,
"grad_norm": 3.15625,
"learning_rate": 1.367612389971705e-05,
"loss": 0.25843195915222167,
"step": 3210,
"token_acc": 0.8992231333621061
},
{
"epoch": 0.4118626697412247,
"grad_norm": 2.703125,
"learning_rate": 1.3656414954403015e-05,
"loss": 0.2526721477508545,
"step": 3215,
"token_acc": 0.9009551673694174
},
{
"epoch": 0.412503202664617,
"grad_norm": 4.03125,
"learning_rate": 1.3636689600442535e-05,
"loss": 0.2488550662994385,
"step": 3220,
"token_acc": 0.9032662773091067
},
{
"epoch": 0.4131437355880092,
"grad_norm": 8.6875,
"learning_rate": 1.3616947926355748e-05,
"loss": 0.2410456657409668,
"step": 3225,
"token_acc": 0.9047186932849365
},
{
"epoch": 0.4137842685114015,
"grad_norm": 3.6875,
"learning_rate": 1.3597190020736032e-05,
"loss": 0.25398988723754884,
"step": 3230,
"token_acc": 0.9014248202832422
},
{
"epoch": 0.41442480143479377,
"grad_norm": 4.0625,
"learning_rate": 1.3577415972249608e-05,
"loss": 0.24551260471343994,
"step": 3235,
"token_acc": 0.9037839237174785
},
{
"epoch": 0.415065334358186,
"grad_norm": 4.125,
"learning_rate": 1.3557625869635136e-05,
"loss": 0.2562254905700684,
"step": 3240,
"token_acc": 0.8990086206896551
},
{
"epoch": 0.41570586728157827,
"grad_norm": 2.890625,
"learning_rate": 1.3537819801703323e-05,
"loss": 0.2528964996337891,
"step": 3245,
"token_acc": 0.9020725388601036
},
{
"epoch": 0.41634640020497055,
"grad_norm": 70.0,
"learning_rate": 1.3517997857336522e-05,
"loss": 0.2532426595687866,
"step": 3250,
"token_acc": 0.8992752998015702
},
{
"epoch": 0.4169869331283628,
"grad_norm": 3.03125,
"learning_rate": 1.3498160125488336e-05,
"loss": 0.248179292678833,
"step": 3255,
"token_acc": 0.9034928848641656
},
{
"epoch": 0.41762746605175505,
"grad_norm": 2.703125,
"learning_rate": 1.3478306695183212e-05,
"loss": 0.25196003913879395,
"step": 3260,
"token_acc": 0.9024979507312654
},
{
"epoch": 0.4182679989751473,
"grad_norm": 3.75,
"learning_rate": 1.3458437655516048e-05,
"loss": 0.2540182590484619,
"step": 3265,
"token_acc": 0.9011801322785631
},
{
"epoch": 0.4189085318985396,
"grad_norm": 4.6875,
"learning_rate": 1.3438553095651794e-05,
"loss": 0.24988923072814942,
"step": 3270,
"token_acc": 0.9028460543337645
},
{
"epoch": 0.4195490648219318,
"grad_norm": 3.3125,
"learning_rate": 1.3418653104825044e-05,
"loss": 0.25744991302490233,
"step": 3275,
"token_acc": 0.8989907702924178
},
{
"epoch": 0.4201895977453241,
"grad_norm": 2.515625,
"learning_rate": 1.3398737772339643e-05,
"loss": 0.25907082557678224,
"step": 3280,
"token_acc": 0.8988532405617833
},
{
"epoch": 0.4208301306687164,
"grad_norm": 2.59375,
"learning_rate": 1.3378807187568288e-05,
"loss": 0.2617329597473145,
"step": 3285,
"token_acc": 0.8974062165058949
},
{
"epoch": 0.4214706635921086,
"grad_norm": 2.34375,
"learning_rate": 1.335886143995211e-05,
"loss": 0.25168476104736326,
"step": 3290,
"token_acc": 0.9027303015879884
},
{
"epoch": 0.4221111965155009,
"grad_norm": 3.484375,
"learning_rate": 1.3338900619000299e-05,
"loss": 0.25457475185394285,
"step": 3295,
"token_acc": 0.9008958566629339
},
{
"epoch": 0.42275172943889316,
"grad_norm": 3.25,
"learning_rate": 1.3318924814289682e-05,
"loss": 0.25605058670043945,
"step": 3300,
"token_acc": 0.9003745640849012
},
{
"epoch": 0.42275172943889316,
"eval_loss": 0.3262101709842682,
"eval_runtime": 103.038,
"eval_samples_per_second": 97.052,
"eval_steps_per_second": 12.131,
"eval_token_acc": 0.8801623378421026,
"step": 3300
},
{
"epoch": 0.42339226236228544,
"grad_norm": 2.9375,
"learning_rate": 1.329893411546433e-05,
"loss": 0.2533790826797485,
"step": 3305,
"token_acc": 0.9021087584630644
},
{
"epoch": 0.42403279528567767,
"grad_norm": 3.109375,
"learning_rate": 1.327892861223515e-05,
"loss": 0.26516075134277345,
"step": 3310,
"token_acc": 0.8975274134594711
},
{
"epoch": 0.42467332820906994,
"grad_norm": 2.828125,
"learning_rate": 1.3258908394379492e-05,
"loss": 0.24489293098449708,
"step": 3315,
"token_acc": 0.9024179620034543
},
{
"epoch": 0.4253138611324622,
"grad_norm": 2.671875,
"learning_rate": 1.323887355174073e-05,
"loss": 0.2507158279418945,
"step": 3320,
"token_acc": 0.9005565382458259
},
{
"epoch": 0.42595439405585445,
"grad_norm": 2.859375,
"learning_rate": 1.3218824174227876e-05,
"loss": 0.2552894353866577,
"step": 3325,
"token_acc": 0.899343072002766
},
{
"epoch": 0.4265949269792467,
"grad_norm": 4.6875,
"learning_rate": 1.3198760351815165e-05,
"loss": 0.25093369483947753,
"step": 3330,
"token_acc": 0.901463793773479
},
{
"epoch": 0.427235459902639,
"grad_norm": 6.78125,
"learning_rate": 1.3178682174541664e-05,
"loss": 0.25160994529724123,
"step": 3335,
"token_acc": 0.9026560138199093
},
{
"epoch": 0.4278759928260313,
"grad_norm": 3.1875,
"learning_rate": 1.3158589732510847e-05,
"loss": 0.25160062313079834,
"step": 3340,
"token_acc": 0.9015646492434664
},
{
"epoch": 0.4285165257494235,
"grad_norm": 3.015625,
"learning_rate": 1.3138483115890214e-05,
"loss": 0.24968068599700927,
"step": 3345,
"token_acc": 0.9020596346087556
},
{
"epoch": 0.4291570586728158,
"grad_norm": 2.40625,
"learning_rate": 1.3118362414910869e-05,
"loss": 0.25055222511291503,
"step": 3350,
"token_acc": 0.902113891285591
},
{
"epoch": 0.42979759159620806,
"grad_norm": 3.21875,
"learning_rate": 1.3098227719867117e-05,
"loss": 0.23631854057312013,
"step": 3355,
"token_acc": 0.9082366187424216
},
{
"epoch": 0.4304381245196003,
"grad_norm": 3.125,
"learning_rate": 1.3078079121116074e-05,
"loss": 0.2557328939437866,
"step": 3360,
"token_acc": 0.9006379860332787
},
{
"epoch": 0.43107865744299256,
"grad_norm": 2.828125,
"learning_rate": 1.305791670907725e-05,
"loss": 0.2555293083190918,
"step": 3365,
"token_acc": 0.9005658502872446
},
{
"epoch": 0.43171919036638484,
"grad_norm": 2.5625,
"learning_rate": 1.3037740574232134e-05,
"loss": 0.25120766162872316,
"step": 3370,
"token_acc": 0.9023821853961678
},
{
"epoch": 0.4323597232897771,
"grad_norm": 3.09375,
"learning_rate": 1.3017550807123806e-05,
"loss": 0.2534923553466797,
"step": 3375,
"token_acc": 0.9000387780602352
},
{
"epoch": 0.43300025621316934,
"grad_norm": 3.109375,
"learning_rate": 1.2997347498356519e-05,
"loss": 0.24217534065246582,
"step": 3380,
"token_acc": 0.9059788473990935
},
{
"epoch": 0.4336407891365616,
"grad_norm": 2.296875,
"learning_rate": 1.2977130738595298e-05,
"loss": 0.2505367279052734,
"step": 3385,
"token_acc": 0.9020945800043187
},
{
"epoch": 0.4342813220599539,
"grad_norm": 2.65625,
"learning_rate": 1.2956900618565532e-05,
"loss": 0.24520423412322997,
"step": 3390,
"token_acc": 0.9031252705393472
},
{
"epoch": 0.4349218549833461,
"grad_norm": 3.640625,
"learning_rate": 1.293665722905256e-05,
"loss": 0.2532040596008301,
"step": 3395,
"token_acc": 0.9023980712102295
},
{
"epoch": 0.4355623879067384,
"grad_norm": 3.03125,
"learning_rate": 1.2916400660901276e-05,
"loss": 0.24737958908081054,
"step": 3400,
"token_acc": 0.9031688624817016
},
{
"epoch": 0.4355623879067384,
"eval_loss": 0.3326387107372284,
"eval_runtime": 103.1507,
"eval_samples_per_second": 96.945,
"eval_steps_per_second": 12.118,
"eval_token_acc": 0.8803395142045611,
"step": 3400
},
{
"epoch": 0.4362029208301307,
"grad_norm": 2.265625,
"learning_rate": 1.2896131005015717e-05,
"loss": 0.25047874450683594,
"step": 3405,
"token_acc": 0.9031589849818746
},
{
"epoch": 0.4368434537535229,
"grad_norm": 2.515625,
"learning_rate": 1.2875848352358644e-05,
"loss": 0.25389971733093264,
"step": 3410,
"token_acc": 0.9008509541000516
},
{
"epoch": 0.4374839866769152,
"grad_norm": 3.875,
"learning_rate": 1.2855552793951146e-05,
"loss": 0.2464221954345703,
"step": 3415,
"token_acc": 0.9023832138848114
},
{
"epoch": 0.43812451960030746,
"grad_norm": 3.6875,
"learning_rate": 1.2835244420872232e-05,
"loss": 0.25347232818603516,
"step": 3420,
"token_acc": 0.900116044182748
},
{
"epoch": 0.43876505252369974,
"grad_norm": 2.546875,
"learning_rate": 1.2814923324258416e-05,
"loss": 0.2549131393432617,
"step": 3425,
"token_acc": 0.9009849886016603
},
{
"epoch": 0.43940558544709196,
"grad_norm": 2.9375,
"learning_rate": 1.2794589595303316e-05,
"loss": 0.24712648391723632,
"step": 3430,
"token_acc": 0.9032174364296834
},
{
"epoch": 0.44004611837048424,
"grad_norm": 3.140625,
"learning_rate": 1.277424332525723e-05,
"loss": 0.24843959808349608,
"step": 3435,
"token_acc": 0.9036415534988322
},
{
"epoch": 0.4406866512938765,
"grad_norm": 9.1875,
"learning_rate": 1.2753884605426736e-05,
"loss": 0.24894342422485352,
"step": 3440,
"token_acc": 0.9017941861468127
},
{
"epoch": 0.44132718421726874,
"grad_norm": 2.953125,
"learning_rate": 1.273351352717429e-05,
"loss": 0.24595353603363038,
"step": 3445,
"token_acc": 0.9049160516207001
},
{
"epoch": 0.441967717140661,
"grad_norm": 2.921875,
"learning_rate": 1.2713130181917806e-05,
"loss": 0.25805752277374266,
"step": 3450,
"token_acc": 0.8997292302402544
},
{
"epoch": 0.4426082500640533,
"grad_norm": 3.546875,
"learning_rate": 1.269273466113024e-05,
"loss": 0.2535008430480957,
"step": 3455,
"token_acc": 0.9011762685670129
},
{
"epoch": 0.4432487829874456,
"grad_norm": 3.671875,
"learning_rate": 1.2672327056339198e-05,
"loss": 0.24500885009765624,
"step": 3460,
"token_acc": 0.905705264068515
},
{
"epoch": 0.4438893159108378,
"grad_norm": 4.28125,
"learning_rate": 1.2651907459126512e-05,
"loss": 0.25068912506103513,
"step": 3465,
"token_acc": 0.9028803385585352
},
{
"epoch": 0.4445298488342301,
"grad_norm": 2.828125,
"learning_rate": 1.2631475961127822e-05,
"loss": 0.2502088785171509,
"step": 3470,
"token_acc": 0.9028352292312996
},
{
"epoch": 0.44517038175762236,
"grad_norm": 4.375,
"learning_rate": 1.2611032654032185e-05,
"loss": 0.2501903295516968,
"step": 3475,
"token_acc": 0.901529554096094
},
{
"epoch": 0.4458109146810146,
"grad_norm": 4.125,
"learning_rate": 1.2590577629581648e-05,
"loss": 0.25160870552062986,
"step": 3480,
"token_acc": 0.9006165653429914
},
{
"epoch": 0.44645144760440686,
"grad_norm": 10.75,
"learning_rate": 1.2570110979570846e-05,
"loss": 0.2540600299835205,
"step": 3485,
"token_acc": 0.9013576215844646
},
{
"epoch": 0.44709198052779914,
"grad_norm": 2.625,
"learning_rate": 1.2549632795846582e-05,
"loss": 0.25437102317810056,
"step": 3490,
"token_acc": 0.9012979172955026
},
{
"epoch": 0.4477325134511914,
"grad_norm": 2.609375,
"learning_rate": 1.2529143170307418e-05,
"loss": 0.25037708282470705,
"step": 3495,
"token_acc": 0.90243692783771
},
{
"epoch": 0.44837304637458364,
"grad_norm": 2.984375,
"learning_rate": 1.250864219490326e-05,
"loss": 0.2490053653717041,
"step": 3500,
"token_acc": 0.9017818527809013
},
{
"epoch": 0.44837304637458364,
"eval_loss": 0.3326828181743622,
"eval_runtime": 103.6038,
"eval_samples_per_second": 96.522,
"eval_steps_per_second": 12.065,
"eval_token_acc": 0.8804225656244636,
"step": 3500
},
{
"epoch": 0.4490135792979759,
"grad_norm": 3.390625,
"learning_rate": 1.248812996163495e-05,
"loss": 0.24862072467803956,
"step": 3505,
"token_acc": 0.9011164274322169
},
{
"epoch": 0.4496541122213682,
"grad_norm": 2.71875,
"learning_rate": 1.2467606562553858e-05,
"loss": 0.25421929359436035,
"step": 3510,
"token_acc": 0.9018134715025907
},
{
"epoch": 0.4502946451447604,
"grad_norm": 5.625,
"learning_rate": 1.244707208976145e-05,
"loss": 0.24651005268096923,
"step": 3515,
"token_acc": 0.903206343733839
},
{
"epoch": 0.4509351780681527,
"grad_norm": 2.859375,
"learning_rate": 1.2426526635408896e-05,
"loss": 0.24950928688049318,
"step": 3520,
"token_acc": 0.9035152037955575
},
{
"epoch": 0.451575710991545,
"grad_norm": 4.875,
"learning_rate": 1.240597029169664e-05,
"loss": 0.25514960289001465,
"step": 3525,
"token_acc": 0.8999569336778639
},
{
"epoch": 0.45221624391493725,
"grad_norm": 2.484375,
"learning_rate": 1.2385403150874003e-05,
"loss": 0.24595193862915038,
"step": 3530,
"token_acc": 0.904110774556469
},
{
"epoch": 0.4528567768383295,
"grad_norm": 2.265625,
"learning_rate": 1.2364825305238748e-05,
"loss": 0.24859437942504883,
"step": 3535,
"token_acc": 0.9019379844961241
},
{
"epoch": 0.45349730976172176,
"grad_norm": 3.328125,
"learning_rate": 1.2344236847136683e-05,
"loss": 0.25172064304351804,
"step": 3540,
"token_acc": 0.9005772378736969
},
{
"epoch": 0.45413784268511403,
"grad_norm": 3.140625,
"learning_rate": 1.2323637868961247e-05,
"loss": 0.2530811309814453,
"step": 3545,
"token_acc": 0.900356943190126
},
{
"epoch": 0.45477837560850626,
"grad_norm": 2.78125,
"learning_rate": 1.2303028463153081e-05,
"loss": 0.25023765563964845,
"step": 3550,
"token_acc": 0.9036663650855198
},
{
"epoch": 0.45541890853189854,
"grad_norm": 3.375,
"learning_rate": 1.2282408722199623e-05,
"loss": 0.2615813732147217,
"step": 3555,
"token_acc": 0.8973312132021144
},
{
"epoch": 0.4560594414552908,
"grad_norm": 4.625,
"learning_rate": 1.2261778738634688e-05,
"loss": 0.24770092964172363,
"step": 3560,
"token_acc": 0.9046594673665189
},
{
"epoch": 0.45669997437868304,
"grad_norm": 3.09375,
"learning_rate": 1.2241138605038065e-05,
"loss": 0.2506240367889404,
"step": 3565,
"token_acc": 0.9026666666666666
},
{
"epoch": 0.4573405073020753,
"grad_norm": 5.75,
"learning_rate": 1.2220488414035088e-05,
"loss": 0.24530596733093263,
"step": 3570,
"token_acc": 0.9045893719806763
},
{
"epoch": 0.4579810402254676,
"grad_norm": 6.125,
"learning_rate": 1.2199828258296219e-05,
"loss": 0.24243788719177245,
"step": 3575,
"token_acc": 0.9063712388999051
},
{
"epoch": 0.4586215731488599,
"grad_norm": 3.359375,
"learning_rate": 1.2179158230536648e-05,
"loss": 0.25044434070587157,
"step": 3580,
"token_acc": 0.9025926405245676
},
{
"epoch": 0.4592621060722521,
"grad_norm": 2.734375,
"learning_rate": 1.215847842351586e-05,
"loss": 0.23762269020080568,
"step": 3585,
"token_acc": 0.907480400225235
},
{
"epoch": 0.4599026389956444,
"grad_norm": 4.25,
"learning_rate": 1.213778893003723e-05,
"loss": 0.24655213356018066,
"step": 3590,
"token_acc": 0.9028251024369204
},
{
"epoch": 0.46054317191903665,
"grad_norm": 2.984375,
"learning_rate": 1.2117089842947602e-05,
"loss": 0.2541653633117676,
"step": 3595,
"token_acc": 0.9008563191187229
},
{
"epoch": 0.4611837048424289,
"grad_norm": 4.25,
"learning_rate": 1.2096381255136869e-05,
"loss": 0.25534510612487793,
"step": 3600,
"token_acc": 0.9020348587619129
},
{
"epoch": 0.4611837048424289,
"eval_loss": 0.33049342036247253,
"eval_runtime": 103.5925,
"eval_samples_per_second": 96.532,
"eval_steps_per_second": 12.067,
"eval_token_acc": 0.881139576216288,
"step": 3600
},
{
"epoch": 0.46182423776582116,
"grad_norm": 2.90625,
"learning_rate": 1.207566325953756e-05,
"loss": 0.2503223896026611,
"step": 3605,
"token_acc": 0.9033789323781988
},
{
"epoch": 0.46246477068921343,
"grad_norm": 4.21875,
"learning_rate": 1.2054935949124429e-05,
"loss": 0.2458160400390625,
"step": 3610,
"token_acc": 0.9040593589577671
},
{
"epoch": 0.4631053036126057,
"grad_norm": 3.625,
"learning_rate": 1.2034199416914026e-05,
"loss": 0.2477043390274048,
"step": 3615,
"token_acc": 0.9040510807196169
},
{
"epoch": 0.46374583653599794,
"grad_norm": 3.21875,
"learning_rate": 1.2013453755964282e-05,
"loss": 0.24677414894104005,
"step": 3620,
"token_acc": 0.9042521044679473
},
{
"epoch": 0.4643863694593902,
"grad_norm": 2.890625,
"learning_rate": 1.1992699059374103e-05,
"loss": 0.24625577926635742,
"step": 3625,
"token_acc": 0.9028618364170845
},
{
"epoch": 0.4650269023827825,
"grad_norm": 3.0,
"learning_rate": 1.197193542028294e-05,
"loss": 0.2436453342437744,
"step": 3630,
"token_acc": 0.9060716139076285
},
{
"epoch": 0.4656674353061747,
"grad_norm": 3.25,
"learning_rate": 1.1951162931870367e-05,
"loss": 0.24116950035095214,
"step": 3635,
"token_acc": 0.9068125053828267
},
{
"epoch": 0.466307968229567,
"grad_norm": 5.09375,
"learning_rate": 1.1930381687355685e-05,
"loss": 0.25627937316894533,
"step": 3640,
"token_acc": 0.899607910724288
},
{
"epoch": 0.46694850115295927,
"grad_norm": 4.34375,
"learning_rate": 1.190959177999748e-05,
"loss": 0.23881065845489502,
"step": 3645,
"token_acc": 0.9060243048047398
},
{
"epoch": 0.46758903407635155,
"grad_norm": 2.59375,
"learning_rate": 1.1888793303093211e-05,
"loss": 0.24708976745605468,
"step": 3650,
"token_acc": 0.9032951905704207
},
{
"epoch": 0.4682295669997438,
"grad_norm": 3.53125,
"learning_rate": 1.18679863499788e-05,
"loss": 0.2470097541809082,
"step": 3655,
"token_acc": 0.9035228009509402
},
{
"epoch": 0.46887009992313605,
"grad_norm": 2.5625,
"learning_rate": 1.1847171014028207e-05,
"loss": 0.24061377048492433,
"step": 3660,
"token_acc": 0.9055094274346999
},
{
"epoch": 0.46951063284652833,
"grad_norm": 2.421875,
"learning_rate": 1.1826347388653005e-05,
"loss": 0.24855940341949462,
"step": 3665,
"token_acc": 0.9045786674737343
},
{
"epoch": 0.47015116576992055,
"grad_norm": 2.984375,
"learning_rate": 1.180551556730198e-05,
"loss": 0.2487732172012329,
"step": 3670,
"token_acc": 0.9030577088716624
},
{
"epoch": 0.47079169869331283,
"grad_norm": 2.453125,
"learning_rate": 1.1784675643460682e-05,
"loss": 0.24545960426330565,
"step": 3675,
"token_acc": 0.9027664242997891
},
{
"epoch": 0.4714322316167051,
"grad_norm": 3.0625,
"learning_rate": 1.176382771065103e-05,
"loss": 0.23899221420288086,
"step": 3680,
"token_acc": 0.906933437269932
},
{
"epoch": 0.47207276454009733,
"grad_norm": 6.9375,
"learning_rate": 1.1742971862430888e-05,
"loss": 0.25500404834747314,
"step": 3685,
"token_acc": 0.9016767964136385
},
{
"epoch": 0.4727132974634896,
"grad_norm": 4.53125,
"learning_rate": 1.1722108192393635e-05,
"loss": 0.24634737968444825,
"step": 3690,
"token_acc": 0.9017565007749269
},
{
"epoch": 0.4733538303868819,
"grad_norm": 2.796875,
"learning_rate": 1.1701236794167753e-05,
"loss": 0.2485578775405884,
"step": 3695,
"token_acc": 0.9019531419167641
},
{
"epoch": 0.47399436331027417,
"grad_norm": 3.265625,
"learning_rate": 1.168035776141641e-05,
"loss": 0.24262137413024903,
"step": 3700,
"token_acc": 0.9076976212062341
},
{
"epoch": 0.47399436331027417,
"eval_loss": 0.3315788805484772,
"eval_runtime": 103.5836,
"eval_samples_per_second": 96.54,
"eval_steps_per_second": 12.068,
"eval_token_acc": 0.8810066939444441,
"step": 3700
},
{
"epoch": 0.4746348962336664,
"grad_norm": 4.34375,
"learning_rate": 1.165947118783703e-05,
"loss": 0.24236671924591063,
"step": 3705,
"token_acc": 0.906829352418729
},
{
"epoch": 0.47527542915705867,
"grad_norm": 3.375,
"learning_rate": 1.1638577167160874e-05,
"loss": 0.24883639812469482,
"step": 3710,
"token_acc": 0.9045120220224526
},
{
"epoch": 0.47591596208045095,
"grad_norm": 3.375,
"learning_rate": 1.1617675793152631e-05,
"loss": 0.2473759651184082,
"step": 3715,
"token_acc": 0.9040358358099668
},
{
"epoch": 0.4765564950038432,
"grad_norm": 4.34375,
"learning_rate": 1.1596767159609988e-05,
"loss": 0.2524222135543823,
"step": 3720,
"token_acc": 0.9003019844693702
},
{
"epoch": 0.47719702792723545,
"grad_norm": 2.796875,
"learning_rate": 1.1575851360363201e-05,
"loss": 0.2499473810195923,
"step": 3725,
"token_acc": 0.9026130307718755
},
{
"epoch": 0.47783756085062773,
"grad_norm": 3.8125,
"learning_rate": 1.1554928489274697e-05,
"loss": 0.24831125736236573,
"step": 3730,
"token_acc": 0.9022294772112204
},
{
"epoch": 0.47847809377402,
"grad_norm": 2.796875,
"learning_rate": 1.1533998640238626e-05,
"loss": 0.2451251983642578,
"step": 3735,
"token_acc": 0.9032967032967033
},
{
"epoch": 0.47911862669741223,
"grad_norm": 2.90625,
"learning_rate": 1.1513061907180462e-05,
"loss": 0.2445608615875244,
"step": 3740,
"token_acc": 0.9054381711118809
},
{
"epoch": 0.4797591596208045,
"grad_norm": 2.65625,
"learning_rate": 1.1492118384056565e-05,
"loss": 0.2464083194732666,
"step": 3745,
"token_acc": 0.9023748976337227
},
{
"epoch": 0.4803996925441968,
"grad_norm": 3.5625,
"learning_rate": 1.1471168164853769e-05,
"loss": 0.24423737525939943,
"step": 3750,
"token_acc": 0.9049323100801931
},
{
"epoch": 0.481040225467589,
"grad_norm": 4.34375,
"learning_rate": 1.1450211343588962e-05,
"loss": 0.24666328430175782,
"step": 3755,
"token_acc": 0.905542815109688
},
{
"epoch": 0.4816807583909813,
"grad_norm": 2.28125,
"learning_rate": 1.142924801430865e-05,
"loss": 0.24257378578186034,
"step": 3760,
"token_acc": 0.9064537017051586
},
{
"epoch": 0.48232129131437357,
"grad_norm": 4.25,
"learning_rate": 1.1408278271088555e-05,
"loss": 0.24482569694519044,
"step": 3765,
"token_acc": 0.9051044583243593
},
{
"epoch": 0.48296182423776585,
"grad_norm": 4.75,
"learning_rate": 1.1387302208033173e-05,
"loss": 0.24971480369567872,
"step": 3770,
"token_acc": 0.9026384883570783
},
{
"epoch": 0.48360235716115807,
"grad_norm": 4.09375,
"learning_rate": 1.1366319919275368e-05,
"loss": 0.24797563552856444,
"step": 3775,
"token_acc": 0.9017930438539642
},
{
"epoch": 0.48424289008455035,
"grad_norm": 3.328125,
"learning_rate": 1.1345331498975938e-05,
"loss": 0.24426445960998536,
"step": 3780,
"token_acc": 0.9049935428325441
},
{
"epoch": 0.4848834230079426,
"grad_norm": 3.765625,
"learning_rate": 1.1324337041323204e-05,
"loss": 0.25280845165252686,
"step": 3785,
"token_acc": 0.9023151734228418
},
{
"epoch": 0.48552395593133485,
"grad_norm": 2.71875,
"learning_rate": 1.1303336640532567e-05,
"loss": 0.24581615924835204,
"step": 3790,
"token_acc": 0.9035905403072674
},
{
"epoch": 0.48616448885472713,
"grad_norm": 2.4375,
"learning_rate": 1.1282330390846117e-05,
"loss": 0.24577610492706298,
"step": 3795,
"token_acc": 0.9040096680909836
},
{
"epoch": 0.4868050217781194,
"grad_norm": 2.921875,
"learning_rate": 1.1261318386532177e-05,
"loss": 0.2388829231262207,
"step": 3800,
"token_acc": 0.9051091740743937
},
{
"epoch": 0.4868050217781194,
"eval_loss": 0.3326900601387024,
"eval_runtime": 103.7176,
"eval_samples_per_second": 96.416,
"eval_steps_per_second": 12.052,
"eval_token_acc": 0.8808295175819856,
"step": 3800
},
{
"epoch": 0.4874455547015117,
"grad_norm": 3.0625,
"learning_rate": 1.12403007218849e-05,
"loss": 0.2488114833831787,
"step": 3805,
"token_acc": 0.9019819043515725
},
{
"epoch": 0.4880860876249039,
"grad_norm": 3.546875,
"learning_rate": 1.121927749122384e-05,
"loss": 0.24407386779785156,
"step": 3810,
"token_acc": 0.9046487514041303
},
{
"epoch": 0.4887266205482962,
"grad_norm": 3.4375,
"learning_rate": 1.1198248788893531e-05,
"loss": 0.23504881858825682,
"step": 3815,
"token_acc": 0.908643467748899
},
{
"epoch": 0.48936715347168847,
"grad_norm": 2.890625,
"learning_rate": 1.117721470926306e-05,
"loss": 0.240411376953125,
"step": 3820,
"token_acc": 0.9068462401795735
},
{
"epoch": 0.4900076863950807,
"grad_norm": 6.03125,
"learning_rate": 1.1156175346725644e-05,
"loss": 0.23961906433105468,
"step": 3825,
"token_acc": 0.9053354053354054
},
{
"epoch": 0.49064821931847297,
"grad_norm": 3.453125,
"learning_rate": 1.113513079569821e-05,
"loss": 0.24782803058624267,
"step": 3830,
"token_acc": 0.9041679212009118
},
{
"epoch": 0.49128875224186525,
"grad_norm": 2.625,
"learning_rate": 1.1114081150620968e-05,
"loss": 0.24206724166870117,
"step": 3835,
"token_acc": 0.9066114275850164
},
{
"epoch": 0.49192928516525747,
"grad_norm": 3.03125,
"learning_rate": 1.1093026505956989e-05,
"loss": 0.24786317348480225,
"step": 3840,
"token_acc": 0.9004302925989673
},
{
"epoch": 0.49256981808864975,
"grad_norm": 3.421875,
"learning_rate": 1.107196695619178e-05,
"loss": 0.2436119556427002,
"step": 3845,
"token_acc": 0.9046715265496269
},
{
"epoch": 0.493210351012042,
"grad_norm": 3.328125,
"learning_rate": 1.105090259583286e-05,
"loss": 0.25207488536834716,
"step": 3850,
"token_acc": 0.901183050118305
},
{
"epoch": 0.4938508839354343,
"grad_norm": 2.65625,
"learning_rate": 1.1029833519409337e-05,
"loss": 0.24722940921783448,
"step": 3855,
"token_acc": 0.9046530682651257
},
{
"epoch": 0.4944914168588265,
"grad_norm": 2.6875,
"learning_rate": 1.100875982147148e-05,
"loss": 0.24187374114990234,
"step": 3860,
"token_acc": 0.906386286109072
},
{
"epoch": 0.4951319497822188,
"grad_norm": 7.78125,
"learning_rate": 1.09876815965903e-05,
"loss": 0.25092015266418455,
"step": 3865,
"token_acc": 0.9019010427841909
},
{
"epoch": 0.4957724827056111,
"grad_norm": 2.71875,
"learning_rate": 1.096659893935713e-05,
"loss": 0.2399623155593872,
"step": 3870,
"token_acc": 0.9047742492979045
},
{
"epoch": 0.4964130156290033,
"grad_norm": 2.734375,
"learning_rate": 1.0945511944383178e-05,
"loss": 0.24817066192626952,
"step": 3875,
"token_acc": 0.9023097474791002
},
{
"epoch": 0.4970535485523956,
"grad_norm": 32.75,
"learning_rate": 1.0924420706299131e-05,
"loss": 0.23887484073638915,
"step": 3880,
"token_acc": 0.9074281287657083
},
{
"epoch": 0.49769408147578786,
"grad_norm": 3.078125,
"learning_rate": 1.0903325319754717e-05,
"loss": 0.24414317607879638,
"step": 3885,
"token_acc": 0.9059984419631265
},
{
"epoch": 0.49833461439918014,
"grad_norm": 6.90625,
"learning_rate": 1.0882225879418272e-05,
"loss": 0.2399660110473633,
"step": 3890,
"token_acc": 0.9039769303606783
},
{
"epoch": 0.49897514732257237,
"grad_norm": 6.09375,
"learning_rate": 1.086112247997633e-05,
"loss": 0.24311909675598145,
"step": 3895,
"token_acc": 0.9053466029037956
},
{
"epoch": 0.49961568024596464,
"grad_norm": 2.84375,
"learning_rate": 1.0840015216133195e-05,
"loss": 0.24150404930114747,
"step": 3900,
"token_acc": 0.9062136674848211
},
{
"epoch": 0.49961568024596464,
"eval_loss": 0.3318501114845276,
"eval_runtime": 103.7178,
"eval_samples_per_second": 96.415,
"eval_steps_per_second": 12.052,
"eval_token_acc": 0.880832285962649,
"step": 3900
},
{
"epoch": 0.5002562131693569,
"grad_norm": 3.078125,
"learning_rate": 1.0818904182610505e-05,
"loss": 0.23810501098632814,
"step": 3905,
"token_acc": 0.9054380664652568
},
{
"epoch": 0.5008967460927491,
"grad_norm": 3.03125,
"learning_rate": 1.0797789474146825e-05,
"loss": 0.24517326354980468,
"step": 3910,
"token_acc": 0.9039823773324119
},
{
"epoch": 0.5015372790161414,
"grad_norm": 2.828125,
"learning_rate": 1.07766711854972e-05,
"loss": 0.24150619506835938,
"step": 3915,
"token_acc": 0.9048644922228446
},
{
"epoch": 0.5021778119395337,
"grad_norm": 3.671875,
"learning_rate": 1.0755549411432754e-05,
"loss": 0.24119091033935547,
"step": 3920,
"token_acc": 0.9053400155534433
},
{
"epoch": 0.502818344862926,
"grad_norm": 3.578125,
"learning_rate": 1.0734424246740238e-05,
"loss": 0.24077696800231935,
"step": 3925,
"token_acc": 0.9054638194864701
},
{
"epoch": 0.5034588777863183,
"grad_norm": 4.03125,
"learning_rate": 1.0713295786221634e-05,
"loss": 0.24392437934875488,
"step": 3930,
"token_acc": 0.9047639614736751
},
{
"epoch": 0.5040994107097104,
"grad_norm": 3.328125,
"learning_rate": 1.0692164124693703e-05,
"loss": 0.23980698585510254,
"step": 3935,
"token_acc": 0.905363456066224
},
{
"epoch": 0.5047399436331027,
"grad_norm": 2.765625,
"learning_rate": 1.067102935698758e-05,
"loss": 0.23611803054809571,
"step": 3940,
"token_acc": 0.9073914169760815
},
{
"epoch": 0.505380476556495,
"grad_norm": 3.40625,
"learning_rate": 1.064989157794833e-05,
"loss": 0.2380206823348999,
"step": 3945,
"token_acc": 0.9074281287657083
},
{
"epoch": 0.5060210094798873,
"grad_norm": 3.6875,
"learning_rate": 1.0628750882434537e-05,
"loss": 0.2411219596862793,
"step": 3950,
"token_acc": 0.9061099620820406
},
{
"epoch": 0.5066615424032795,
"grad_norm": 3.171875,
"learning_rate": 1.0607607365317874e-05,
"loss": 0.23887009620666505,
"step": 3955,
"token_acc": 0.906588215083319
},
{
"epoch": 0.5073020753266718,
"grad_norm": 4.625,
"learning_rate": 1.0586461121482672e-05,
"loss": 0.2420198917388916,
"step": 3960,
"token_acc": 0.9068667497957604
},
{
"epoch": 0.5079426082500641,
"grad_norm": 2.65625,
"learning_rate": 1.0565312245825505e-05,
"loss": 0.2432565689086914,
"step": 3965,
"token_acc": 0.905863065706027
},
{
"epoch": 0.5085831411734563,
"grad_norm": 2.8125,
"learning_rate": 1.0544160833254752e-05,
"loss": 0.2371816873550415,
"step": 3970,
"token_acc": 0.9089968976215098
},
{
"epoch": 0.5092236740968485,
"grad_norm": 3.6875,
"learning_rate": 1.052300697869018e-05,
"loss": 0.2434596061706543,
"step": 3975,
"token_acc": 0.9051765010128874
},
{
"epoch": 0.5098642070202408,
"grad_norm": 25.25,
"learning_rate": 1.0501850777062512e-05,
"loss": 0.24385199546813965,
"step": 3980,
"token_acc": 0.9046000258186669
},
{
"epoch": 0.5105047399436331,
"grad_norm": 3.578125,
"learning_rate": 1.0480692323313007e-05,
"loss": 0.23917775154113768,
"step": 3985,
"token_acc": 0.9061435654235827
},
{
"epoch": 0.5111452728670254,
"grad_norm": 3.546875,
"learning_rate": 1.0459531712393025e-05,
"loss": 0.2387022018432617,
"step": 3990,
"token_acc": 0.9047331145275522
},
{
"epoch": 0.5117858057904177,
"grad_norm": 2.515625,
"learning_rate": 1.0438369039263614e-05,
"loss": 0.24243345260620117,
"step": 3995,
"token_acc": 0.9045505472722571
},
{
"epoch": 0.5124263387138099,
"grad_norm": 3.078125,
"learning_rate": 1.0417204398895072e-05,
"loss": 0.23408794403076172,
"step": 4000,
"token_acc": 0.9087918271936279
},
{
"epoch": 0.5124263387138099,
"eval_loss": 0.3293675482273102,
"eval_runtime": 102.8807,
"eval_samples_per_second": 97.2,
"eval_steps_per_second": 12.15,
"eval_token_acc": 0.8810675983190392,
"step": 4000
},
{
"epoch": 0.5130668716372021,
"grad_norm": 6.1875,
"learning_rate": 1.039603788626653e-05,
"loss": 0.2410355567932129,
"step": 4005,
"token_acc": 0.9065908013276435
},
{
"epoch": 0.5137074045605944,
"grad_norm": 3.0,
"learning_rate": 1.0374869596365508e-05,
"loss": 0.2497018337249756,
"step": 4010,
"token_acc": 0.900746973469563
},
{
"epoch": 0.5143479374839867,
"grad_norm": 6.78125,
"learning_rate": 1.035369962418752e-05,
"loss": 0.24223339557647705,
"step": 4015,
"token_acc": 0.9042599153201417
},
{
"epoch": 0.5149884704073789,
"grad_norm": 4.375,
"learning_rate": 1.0332528064735614e-05,
"loss": 0.24308998584747316,
"step": 4020,
"token_acc": 0.905254091300603
},
{
"epoch": 0.5156290033307712,
"grad_norm": 2.78125,
"learning_rate": 1.031135501301997e-05,
"loss": 0.24722557067871093,
"step": 4025,
"token_acc": 0.902273805928291
},
{
"epoch": 0.5162695362541635,
"grad_norm": 3.4375,
"learning_rate": 1.0290180564057461e-05,
"loss": 0.23717832565307617,
"step": 4030,
"token_acc": 0.9058447172747709
},
{
"epoch": 0.5169100691775558,
"grad_norm": 4.03125,
"learning_rate": 1.0269004812871236e-05,
"loss": 0.23974413871765138,
"step": 4035,
"token_acc": 0.906494960806271
},
{
"epoch": 0.5175506021009479,
"grad_norm": 2.921875,
"learning_rate": 1.024782785449028e-05,
"loss": 0.25091626644134524,
"step": 4040,
"token_acc": 0.9013481980814104
},
{
"epoch": 0.5181911350243402,
"grad_norm": 3.0,
"learning_rate": 1.0226649783948997e-05,
"loss": 0.2415644645690918,
"step": 4045,
"token_acc": 0.9057377049180327
},
{
"epoch": 0.5188316679477325,
"grad_norm": 4.875,
"learning_rate": 1.0205470696286787e-05,
"loss": 0.24452197551727295,
"step": 4050,
"token_acc": 0.9055036791600327
},
{
"epoch": 0.5194722008711248,
"grad_norm": 4.25,
"learning_rate": 1.0184290686547611e-05,
"loss": 0.23984365463256835,
"step": 4055,
"token_acc": 0.9044206527370057
},
{
"epoch": 0.5201127337945171,
"grad_norm": 6.65625,
"learning_rate": 1.0163109849779567e-05,
"loss": 0.24106016159057617,
"step": 4060,
"token_acc": 0.9063618718999353
},
{
"epoch": 0.5207532667179093,
"grad_norm": 3.46875,
"learning_rate": 1.0141928281034468e-05,
"loss": 0.2418668746948242,
"step": 4065,
"token_acc": 0.9044542086671836
},
{
"epoch": 0.5213937996413016,
"grad_norm": 2.9375,
"learning_rate": 1.0120746075367406e-05,
"loss": 0.2402285099029541,
"step": 4070,
"token_acc": 0.9048873154304464
},
{
"epoch": 0.5220343325646938,
"grad_norm": 2.90625,
"learning_rate": 1.0099563327836338e-05,
"loss": 0.23992910385131835,
"step": 4075,
"token_acc": 0.906600034464932
},
{
"epoch": 0.5226748654880861,
"grad_norm": 3.515625,
"learning_rate": 1.0078380133501646e-05,
"loss": 0.24107756614685058,
"step": 4080,
"token_acc": 0.9048888506686159
},
{
"epoch": 0.5233153984114783,
"grad_norm": 3.078125,
"learning_rate": 1.0057196587425721e-05,
"loss": 0.24715356826782225,
"step": 4085,
"token_acc": 0.904106634457769
},
{
"epoch": 0.5239559313348706,
"grad_norm": 3.703125,
"learning_rate": 1.0036012784672538e-05,
"loss": 0.24057602882385254,
"step": 4090,
"token_acc": 0.9058342303552207
},
{
"epoch": 0.5245964642582629,
"grad_norm": 6.59375,
"learning_rate": 1.001482882030721e-05,
"loss": 0.2389677047729492,
"step": 4095,
"token_acc": 0.9081055404413352
},
{
"epoch": 0.5252369971816552,
"grad_norm": 2.53125,
"learning_rate": 9.99364478939559e-06,
"loss": 0.24011881351470948,
"step": 4100,
"token_acc": 0.9068454177084231
},
{
"epoch": 0.5252369971816552,
"eval_loss": 0.33330872654914856,
"eval_runtime": 102.9347,
"eval_samples_per_second": 97.149,
"eval_steps_per_second": 12.144,
"eval_token_acc": 0.8811091240289904,
"step": 4100
},
{
"epoch": 0.5258775301050475,
"grad_norm": 9.6875,
"learning_rate": 9.972460787003814e-06,
"loss": 0.24392313957214357,
"step": 4105,
"token_acc": 0.9049197262514527
},
{
"epoch": 0.5265180630284396,
"grad_norm": 3.078125,
"learning_rate": 9.95127690819791e-06,
"loss": 0.24266266822814941,
"step": 4110,
"token_acc": 0.9032661151327128
},
{
"epoch": 0.5271585959518319,
"grad_norm": 2.75,
"learning_rate": 9.93009324804333e-06,
"loss": 0.24082543849945068,
"step": 4115,
"token_acc": 0.9040199707325471
},
{
"epoch": 0.5277991288752242,
"grad_norm": 3.921875,
"learning_rate": 9.908909901604563e-06,
"loss": 0.24310965538024903,
"step": 4120,
"token_acc": 0.9044476079547905
},
{
"epoch": 0.5284396617986165,
"grad_norm": 2.453125,
"learning_rate": 9.887726963944676e-06,
"loss": 0.2375312328338623,
"step": 4125,
"token_acc": 0.9068424681144432
},
{
"epoch": 0.5290801947220087,
"grad_norm": 2.90625,
"learning_rate": 9.86654453012491e-06,
"loss": 0.23663816452026368,
"step": 4130,
"token_acc": 0.9086728274545534
},
{
"epoch": 0.529720727645401,
"grad_norm": 2.65625,
"learning_rate": 9.845362695204245e-06,
"loss": 0.24443821907043456,
"step": 4135,
"token_acc": 0.9042672413793104
},
{
"epoch": 0.5303612605687933,
"grad_norm": 4.65625,
"learning_rate": 9.824181554238965e-06,
"loss": 0.23482506275177,
"step": 4140,
"token_acc": 0.9089650996842971
},
{
"epoch": 0.5310017934921855,
"grad_norm": 17.5,
"learning_rate": 9.803001202282254e-06,
"loss": 0.24258599281311036,
"step": 4145,
"token_acc": 0.9058409510321446
},
{
"epoch": 0.5316423264155777,
"grad_norm": 2.890625,
"learning_rate": 9.781821734383741e-06,
"loss": 0.2373753547668457,
"step": 4150,
"token_acc": 0.9077843280691941
},
{
"epoch": 0.53228285933897,
"grad_norm": 3.46875,
"learning_rate": 9.760643245589096e-06,
"loss": 0.23887972831726073,
"step": 4155,
"token_acc": 0.9065009065009065
},
{
"epoch": 0.5329233922623623,
"grad_norm": 3.875,
"learning_rate": 9.73946583093959e-06,
"loss": 0.2412872791290283,
"step": 4160,
"token_acc": 0.9053228996474332
},
{
"epoch": 0.5335639251857546,
"grad_norm": 2.90625,
"learning_rate": 9.718289585471683e-06,
"loss": 0.23278658390045165,
"step": 4165,
"token_acc": 0.9080731969860064
},
{
"epoch": 0.5342044581091469,
"grad_norm": 3.015625,
"learning_rate": 9.697114604216573e-06,
"loss": 0.24164493083953859,
"step": 4170,
"token_acc": 0.9067511639937921
},
{
"epoch": 0.534844991032539,
"grad_norm": 3.109375,
"learning_rate": 9.6759409821998e-06,
"loss": 0.23885555267333985,
"step": 4175,
"token_acc": 0.9060220159723721
},
{
"epoch": 0.5354855239559313,
"grad_norm": 2.875,
"learning_rate": 9.65476881444079e-06,
"loss": 0.23392415046691895,
"step": 4180,
"token_acc": 0.9085397540273688
},
{
"epoch": 0.5361260568793236,
"grad_norm": 3.40625,
"learning_rate": 9.633598195952461e-06,
"loss": 0.23441019058227539,
"step": 4185,
"token_acc": 0.908675799086758
},
{
"epoch": 0.5367665898027159,
"grad_norm": 2.84375,
"learning_rate": 9.612429221740761e-06,
"loss": 0.23697328567504883,
"step": 4190,
"token_acc": 0.9047413793103448
},
{
"epoch": 0.5374071227261081,
"grad_norm": 3.15625,
"learning_rate": 9.591261986804264e-06,
"loss": 0.24399030208587646,
"step": 4195,
"token_acc": 0.905359667537809
},
{
"epoch": 0.5380476556495004,
"grad_norm": 3.25,
"learning_rate": 9.570096586133748e-06,
"loss": 0.23835985660552977,
"step": 4200,
"token_acc": 0.9053366669540478
},
{
"epoch": 0.5380476556495004,
"eval_loss": 0.32872986793518066,
"eval_runtime": 104.0894,
"eval_samples_per_second": 96.071,
"eval_steps_per_second": 12.009,
"eval_token_acc": 0.8815603700771271,
"step": 4200
},
{
"epoch": 0.5386881885728927,
"grad_norm": 2.625,
"learning_rate": 9.548933114711742e-06,
"loss": 0.2371370553970337,
"step": 4205,
"token_acc": 0.908296379213846
},
{
"epoch": 0.5393287214962849,
"grad_norm": 3.03125,
"learning_rate": 9.527771667512138e-06,
"loss": 0.2396193265914917,
"step": 4210,
"token_acc": 0.9050984786450028
},
{
"epoch": 0.5399692544196771,
"grad_norm": 3.171875,
"learning_rate": 9.506612339499725e-06,
"loss": 0.23920049667358398,
"step": 4215,
"token_acc": 0.9055294573977503
},
{
"epoch": 0.5406097873430694,
"grad_norm": 2.90625,
"learning_rate": 9.485455225629798e-06,
"loss": 0.23707218170166017,
"step": 4220,
"token_acc": 0.9071637426900585
},
{
"epoch": 0.5412503202664617,
"grad_norm": 3.046875,
"learning_rate": 9.464300420847698e-06,
"loss": 0.2316804885864258,
"step": 4225,
"token_acc": 0.9091498185588388
},
{
"epoch": 0.541890853189854,
"grad_norm": 3.28125,
"learning_rate": 9.443148020088426e-06,
"loss": 0.24402837753295897,
"step": 4230,
"token_acc": 0.9042589878437797
},
{
"epoch": 0.5425313861132463,
"grad_norm": 2.625,
"learning_rate": 9.421998118276169e-06,
"loss": 0.2375174045562744,
"step": 4235,
"token_acc": 0.9062986675895484
},
{
"epoch": 0.5431719190366385,
"grad_norm": 4.625,
"learning_rate": 9.400850810323925e-06,
"loss": 0.24248833656311036,
"step": 4240,
"token_acc": 0.9057444415606887
},
{
"epoch": 0.5438124519600307,
"grad_norm": 3.875,
"learning_rate": 9.379706191133033e-06,
"loss": 0.24248261451721193,
"step": 4245,
"token_acc": 0.9062149331031506
},
{
"epoch": 0.544452984883423,
"grad_norm": 3.96875,
"learning_rate": 9.358564355592775e-06,
"loss": 0.23543434143066405,
"step": 4250,
"token_acc": 0.9084744298548721
},
{
"epoch": 0.5450935178068153,
"grad_norm": 3.203125,
"learning_rate": 9.337425398579932e-06,
"loss": 0.23771212100982667,
"step": 4255,
"token_acc": 0.9078099493083598
},
{
"epoch": 0.5457340507302075,
"grad_norm": 3.515625,
"learning_rate": 9.316289414958379e-06,
"loss": 0.2383446216583252,
"step": 4260,
"token_acc": 0.9065444799861675
},
{
"epoch": 0.5463745836535998,
"grad_norm": 4.625,
"learning_rate": 9.295156499578647e-06,
"loss": 0.24645309448242186,
"step": 4265,
"token_acc": 0.9030008180135187
},
{
"epoch": 0.5470151165769921,
"grad_norm": 3.03125,
"learning_rate": 9.274026747277487e-06,
"loss": 0.23401763439178466,
"step": 4270,
"token_acc": 0.9084355033672941
},
{
"epoch": 0.5476556495003844,
"grad_norm": 3.15625,
"learning_rate": 9.252900252877464e-06,
"loss": 0.24168498516082765,
"step": 4275,
"token_acc": 0.9051816813588476
},
{
"epoch": 0.5482961824237765,
"grad_norm": 3.296875,
"learning_rate": 9.231777111186514e-06,
"loss": 0.23365185260772706,
"step": 4280,
"token_acc": 0.9065501055079453
},
{
"epoch": 0.5489367153471688,
"grad_norm": 3.140625,
"learning_rate": 9.210657416997543e-06,
"loss": 0.2374626636505127,
"step": 4285,
"token_acc": 0.9064794816414686
},
{
"epoch": 0.5495772482705611,
"grad_norm": 3.296875,
"learning_rate": 9.189541265087966e-06,
"loss": 0.23222618103027343,
"step": 4290,
"token_acc": 0.9083916688272405
},
{
"epoch": 0.5502177811939534,
"grad_norm": 3.671875,
"learning_rate": 9.168428750219323e-06,
"loss": 0.24052739143371582,
"step": 4295,
"token_acc": 0.9065121508259102
},
{
"epoch": 0.5508583141173456,
"grad_norm": 4.28125,
"learning_rate": 9.14731996713681e-06,
"loss": 0.2399202346801758,
"step": 4300,
"token_acc": 0.9062012818292049
},
{
"epoch": 0.5508583141173456,
"eval_loss": 0.32973331212997437,
"eval_runtime": 104.5664,
"eval_samples_per_second": 95.633,
"eval_steps_per_second": 11.954,
"eval_token_acc": 0.8813555099080344,
"step": 4300
},
{
"epoch": 0.5514988470407379,
"grad_norm": 4.5625,
"learning_rate": 9.126215010568896e-06,
"loss": 0.2345888137817383,
"step": 4305,
"token_acc": 0.9077134986225895
},
{
"epoch": 0.5521393799641302,
"grad_norm": 3.6875,
"learning_rate": 9.105113975226865e-06,
"loss": 0.24162061214447023,
"step": 4310,
"token_acc": 0.9073946393174179
},
{
"epoch": 0.5527799128875224,
"grad_norm": 2.5625,
"learning_rate": 9.08401695580441e-06,
"loss": 0.2340158700942993,
"step": 4315,
"token_acc": 0.9100293711126468
},
{
"epoch": 0.5534204458109147,
"grad_norm": 3.546875,
"learning_rate": 9.062924046977194e-06,
"loss": 0.23286752700805663,
"step": 4320,
"token_acc": 0.9091850828729282
},
{
"epoch": 0.5540609787343069,
"grad_norm": 3.515625,
"learning_rate": 9.041835343402445e-06,
"loss": 0.23487985134124756,
"step": 4325,
"token_acc": 0.9075176937683411
},
{
"epoch": 0.5547015116576992,
"grad_norm": 2.734375,
"learning_rate": 9.020750939718518e-06,
"loss": 0.2381136178970337,
"step": 4330,
"token_acc": 0.9072947672662957
},
{
"epoch": 0.5553420445810915,
"grad_norm": 3.234375,
"learning_rate": 8.999670930544459e-06,
"loss": 0.2352077007293701,
"step": 4335,
"token_acc": 0.9067487855655795
},
{
"epoch": 0.5559825775044838,
"grad_norm": 3.625,
"learning_rate": 8.978595410479609e-06,
"loss": 0.2357017993927002,
"step": 4340,
"token_acc": 0.9078482104355325
},
{
"epoch": 0.556623110427876,
"grad_norm": 2.65625,
"learning_rate": 8.957524474103146e-06,
"loss": 0.2459559679031372,
"step": 4345,
"token_acc": 0.9045916609235011
},
{
"epoch": 0.5572636433512682,
"grad_norm": 3.21875,
"learning_rate": 8.936458215973698e-06,
"loss": 0.23383736610412598,
"step": 4350,
"token_acc": 0.9094742276912486
},
{
"epoch": 0.5579041762746605,
"grad_norm": 2.734375,
"learning_rate": 8.915396730628882e-06,
"loss": 0.24825828075408934,
"step": 4355,
"token_acc": 0.9029686759446767
},
{
"epoch": 0.5585447091980528,
"grad_norm": 3.25,
"learning_rate": 8.894340112584909e-06,
"loss": 0.23654026985168458,
"step": 4360,
"token_acc": 0.9066741350338231
},
{
"epoch": 0.559185242121445,
"grad_norm": 3.5,
"learning_rate": 8.873288456336138e-06,
"loss": 0.23700532913208008,
"step": 4365,
"token_acc": 0.9077705469120373
},
{
"epoch": 0.5598257750448373,
"grad_norm": 3.765625,
"learning_rate": 8.852241856354669e-06,
"loss": 0.23611578941345215,
"step": 4370,
"token_acc": 0.9087608592298051
},
{
"epoch": 0.5604663079682296,
"grad_norm": 4.5625,
"learning_rate": 8.831200407089897e-06,
"loss": 0.24070956707000732,
"step": 4375,
"token_acc": 0.9068157385508493
},
{
"epoch": 0.5611068408916219,
"grad_norm": 2.453125,
"learning_rate": 8.810164202968123e-06,
"loss": 0.2372671604156494,
"step": 4380,
"token_acc": 0.9063080980587142
},
{
"epoch": 0.561747373815014,
"grad_norm": 2.875,
"learning_rate": 8.789133338392099e-06,
"loss": 0.2328266382217407,
"step": 4385,
"token_acc": 0.9102890972732379
},
{
"epoch": 0.5623879067384063,
"grad_norm": 2.984375,
"learning_rate": 8.76810790774061e-06,
"loss": 0.2348611831665039,
"step": 4390,
"token_acc": 0.9081421424874936
},
{
"epoch": 0.5630284396617986,
"grad_norm": 3.59375,
"learning_rate": 8.747088005368068e-06,
"loss": 0.2405010223388672,
"step": 4395,
"token_acc": 0.9055416702576919
},
{
"epoch": 0.5636689725851909,
"grad_norm": 2.921875,
"learning_rate": 8.726073725604061e-06,
"loss": 0.2389441728591919,
"step": 4400,
"token_acc": 0.9059644888812274
},
{
"epoch": 0.5636689725851909,
"eval_loss": 0.328523188829422,
"eval_runtime": 102.5856,
"eval_samples_per_second": 97.48,
"eval_steps_per_second": 12.185,
"eval_token_acc": 0.8815963590257515,
"step": 4400
},
{
"epoch": 0.5643095055085832,
"grad_norm": 2.90625,
"learning_rate": 8.705065162752961e-06,
"loss": 0.24200544357299805,
"step": 4405,
"token_acc": 0.9059365448361961
},
{
"epoch": 0.5649500384319754,
"grad_norm": 5.03125,
"learning_rate": 8.68406241109347e-06,
"loss": 0.24370207786560058,
"step": 4410,
"token_acc": 0.904322319040635
},
{
"epoch": 0.5655905713553676,
"grad_norm": 2.65625,
"learning_rate": 8.663065564878223e-06,
"loss": 0.2380732536315918,
"step": 4415,
"token_acc": 0.9064141196728368
},
{
"epoch": 0.5662311042787599,
"grad_norm": 3.421875,
"learning_rate": 8.642074718333345e-06,
"loss": 0.2384279727935791,
"step": 4420,
"token_acc": 0.9063185513355413
},
{
"epoch": 0.5668716372021522,
"grad_norm": 4.1875,
"learning_rate": 8.621089965658046e-06,
"loss": 0.23173861503601073,
"step": 4425,
"token_acc": 0.9093538222471619
},
{
"epoch": 0.5675121701255444,
"grad_norm": 2.859375,
"learning_rate": 8.600111401024177e-06,
"loss": 0.2245471954345703,
"step": 4430,
"token_acc": 0.9126788570440496
},
{
"epoch": 0.5681527030489367,
"grad_norm": 2.75,
"learning_rate": 8.57913911857583e-06,
"loss": 0.23515353202819825,
"step": 4435,
"token_acc": 0.9106183959812922
},
{
"epoch": 0.568793235972329,
"grad_norm": 2.75,
"learning_rate": 8.558173212428895e-06,
"loss": 0.23450264930725098,
"step": 4440,
"token_acc": 0.9078670050324745
},
{
"epoch": 0.5694337688957213,
"grad_norm": 12.875,
"learning_rate": 8.537213776670656e-06,
"loss": 0.23401873111724852,
"step": 4445,
"token_acc": 0.9095069510404974
},
{
"epoch": 0.5700743018191134,
"grad_norm": 5.25,
"learning_rate": 8.516260905359364e-06,
"loss": 0.23944463729858398,
"step": 4450,
"token_acc": 0.9062594106259411
},
{
"epoch": 0.5707148347425057,
"grad_norm": 3.34375,
"learning_rate": 8.495314692523795e-06,
"loss": 0.23881077766418457,
"step": 4455,
"token_acc": 0.90836533068726
},
{
"epoch": 0.571355367665898,
"grad_norm": 2.609375,
"learning_rate": 8.47437523216286e-06,
"loss": 0.23443114757537842,
"step": 4460,
"token_acc": 0.9087260486794407
},
{
"epoch": 0.5719959005892903,
"grad_norm": 6.125,
"learning_rate": 8.453442618245155e-06,
"loss": 0.24273183345794677,
"step": 4465,
"token_acc": 0.9040428010527678
},
{
"epoch": 0.5726364335126826,
"grad_norm": 2.8125,
"learning_rate": 8.432516944708565e-06,
"loss": 0.23376893997192383,
"step": 4470,
"token_acc": 0.9095967220185465
},
{
"epoch": 0.5732769664360748,
"grad_norm": 2.953125,
"learning_rate": 8.411598305459812e-06,
"loss": 0.23575949668884277,
"step": 4475,
"token_acc": 0.9096040329182644
},
{
"epoch": 0.5739174993594671,
"grad_norm": 3.625,
"learning_rate": 8.390686794374072e-06,
"loss": 0.24351611137390136,
"step": 4480,
"token_acc": 0.905288150226635
},
{
"epoch": 0.5745580322828593,
"grad_norm": 4.84375,
"learning_rate": 8.369782505294511e-06,
"loss": 0.2270632266998291,
"step": 4485,
"token_acc": 0.9119619294830197
},
{
"epoch": 0.5751985652062516,
"grad_norm": 3.15625,
"learning_rate": 8.348885532031904e-06,
"loss": 0.23725566864013672,
"step": 4490,
"token_acc": 0.9062796515138446
},
{
"epoch": 0.5758390981296438,
"grad_norm": 2.9375,
"learning_rate": 8.327995968364178e-06,
"loss": 0.23767762184143065,
"step": 4495,
"token_acc": 0.907425097698654
},
{
"epoch": 0.5764796310530361,
"grad_norm": 3.03125,
"learning_rate": 8.307113908036024e-06,
"loss": 0.24003219604492188,
"step": 4500,
"token_acc": 0.9065685894954187
},
{
"epoch": 0.5764796310530361,
"eval_loss": 0.32945460081100464,
"eval_runtime": 103.1647,
"eval_samples_per_second": 96.932,
"eval_steps_per_second": 12.117,
"eval_token_acc": 0.881673873684327,
"step": 4500
},
{
"epoch": 0.5771201639764284,
"grad_norm": 2.921875,
"learning_rate": 8.286239444758448e-06,
"loss": 0.225927734375,
"step": 4505,
"token_acc": 0.9105997573236263
},
{
"epoch": 0.5777606968998207,
"grad_norm": 2.53125,
"learning_rate": 8.265372672208375e-06,
"loss": 0.24204869270324708,
"step": 4510,
"token_acc": 0.907190275023709
},
{
"epoch": 0.578401229823213,
"grad_norm": 3.109375,
"learning_rate": 8.244513684028208e-06,
"loss": 0.23642313480377197,
"step": 4515,
"token_acc": 0.9095570492933471
},
{
"epoch": 0.5790417627466051,
"grad_norm": 5.25,
"learning_rate": 8.223662573825418e-06,
"loss": 0.23264212608337403,
"step": 4520,
"token_acc": 0.9079717630853994
},
{
"epoch": 0.5796822956699974,
"grad_norm": 2.640625,
"learning_rate": 8.202819435172129e-06,
"loss": 0.22397842407226562,
"step": 4525,
"token_acc": 0.9115804932832275
},
{
"epoch": 0.5803228285933897,
"grad_norm": 3.8125,
"learning_rate": 8.181984361604677e-06,
"loss": 0.24578235149383545,
"step": 4530,
"token_acc": 0.9068775316728432
},
{
"epoch": 0.580963361516782,
"grad_norm": 3.234375,
"learning_rate": 8.161157446623227e-06,
"loss": 0.23125510215759276,
"step": 4535,
"token_acc": 0.9093144656801415
},
{
"epoch": 0.5816038944401742,
"grad_norm": 3.125,
"learning_rate": 8.140338783691308e-06,
"loss": 0.2348803997039795,
"step": 4540,
"token_acc": 0.9077382239716251
},
{
"epoch": 0.5822444273635665,
"grad_norm": 3.125,
"learning_rate": 8.119528466235434e-06,
"loss": 0.22919659614562987,
"step": 4545,
"token_acc": 0.9098924731182796
},
{
"epoch": 0.5828849602869588,
"grad_norm": 18.5,
"learning_rate": 8.098726587644659e-06,
"loss": 0.23590612411499023,
"step": 4550,
"token_acc": 0.9070518339934561
},
{
"epoch": 0.583525493210351,
"grad_norm": 3.359375,
"learning_rate": 8.07793324127017e-06,
"loss": 0.23590869903564454,
"step": 4555,
"token_acc": 0.9086673281849951
},
{
"epoch": 0.5841660261337432,
"grad_norm": 12.1875,
"learning_rate": 8.05714852042486e-06,
"loss": 0.23389995098114014,
"step": 4560,
"token_acc": 0.9086344946981173
},
{
"epoch": 0.5848065590571355,
"grad_norm": 3.53125,
"learning_rate": 8.036372518382922e-06,
"loss": 0.2384809970855713,
"step": 4565,
"token_acc": 0.9059391015978293
},
{
"epoch": 0.5854470919805278,
"grad_norm": 3.84375,
"learning_rate": 8.015605328379407e-06,
"loss": 0.23714299201965333,
"step": 4570,
"token_acc": 0.9076115033580162
},
{
"epoch": 0.5860876249039201,
"grad_norm": 2.609375,
"learning_rate": 7.994847043609844e-06,
"loss": 0.23302805423736572,
"step": 4575,
"token_acc": 0.9086178721940311
},
{
"epoch": 0.5867281578273124,
"grad_norm": 4.59375,
"learning_rate": 7.974097757229781e-06,
"loss": 0.23717694282531737,
"step": 4580,
"token_acc": 0.9076014314663907
},
{
"epoch": 0.5873686907507046,
"grad_norm": 3.0,
"learning_rate": 7.953357562354384e-06,
"loss": 0.23976330757141112,
"step": 4585,
"token_acc": 0.9052459298819882
},
{
"epoch": 0.5880092236740968,
"grad_norm": 5.625,
"learning_rate": 7.932626552058032e-06,
"loss": 0.23990461826324463,
"step": 4590,
"token_acc": 0.9076478454039598
},
{
"epoch": 0.5886497565974891,
"grad_norm": 2.78125,
"learning_rate": 7.911904819373873e-06,
"loss": 0.23198351860046387,
"step": 4595,
"token_acc": 0.9084361252479944
},
{
"epoch": 0.5892902895208814,
"grad_norm": 3.515625,
"learning_rate": 7.891192457293433e-06,
"loss": 0.2373666524887085,
"step": 4600,
"token_acc": 0.9076724137931035
},
{
"epoch": 0.5892902895208814,
"eval_loss": 0.3317316174507141,
"eval_runtime": 102.7383,
"eval_samples_per_second": 97.335,
"eval_steps_per_second": 12.167,
"eval_token_acc": 0.8819451749893418,
"step": 4600
},
{
"epoch": 0.5899308224442736,
"grad_norm": 2.734375,
"learning_rate": 7.870489558766178e-06,
"loss": 0.23856124877929688,
"step": 4605,
"token_acc": 0.9061164587559891
},
{
"epoch": 0.5905713553676659,
"grad_norm": 5.84375,
"learning_rate": 7.84979621669911e-06,
"loss": 0.23322293758392335,
"step": 4610,
"token_acc": 0.9075289241927128
},
{
"epoch": 0.5912118882910582,
"grad_norm": 2.546875,
"learning_rate": 7.829112523956335e-06,
"loss": 0.23959455490112305,
"step": 4615,
"token_acc": 0.9058361730578441
},
{
"epoch": 0.5918524212144505,
"grad_norm": 3.0,
"learning_rate": 7.808438573358674e-06,
"loss": 0.2323786735534668,
"step": 4620,
"token_acc": 0.9092474599621146
},
{
"epoch": 0.5924929541378426,
"grad_norm": 2.890625,
"learning_rate": 7.787774457683209e-06,
"loss": 0.23595137596130372,
"step": 4625,
"token_acc": 0.9091104889080336
},
{
"epoch": 0.5931334870612349,
"grad_norm": 4.25,
"learning_rate": 7.767120269662905e-06,
"loss": 0.2342782974243164,
"step": 4630,
"token_acc": 0.9082426127527217
},
{
"epoch": 0.5937740199846272,
"grad_norm": 3.59375,
"learning_rate": 7.746476101986164e-06,
"loss": 0.2340677261352539,
"step": 4635,
"token_acc": 0.9087346024636058
},
{
"epoch": 0.5944145529080195,
"grad_norm": 3.0625,
"learning_rate": 7.725842047296419e-06,
"loss": 0.23336553573608398,
"step": 4640,
"token_acc": 0.9081429560401523
},
{
"epoch": 0.5950550858314118,
"grad_norm": 3.203125,
"learning_rate": 7.70521819819173e-06,
"loss": 0.24391114711761475,
"step": 4645,
"token_acc": 0.9032941379906623
},
{
"epoch": 0.595695618754804,
"grad_norm": 4.625,
"learning_rate": 7.684604647224345e-06,
"loss": 0.23319551944732667,
"step": 4650,
"token_acc": 0.9080335989661856
},
{
"epoch": 0.5963361516781963,
"grad_norm": 2.9375,
"learning_rate": 7.66400148690031e-06,
"loss": 0.22878189086914064,
"step": 4655,
"token_acc": 0.9093414875748309
},
{
"epoch": 0.5969766846015885,
"grad_norm": 2.859375,
"learning_rate": 7.643408809679034e-06,
"loss": 0.2268078327178955,
"step": 4660,
"token_acc": 0.9114099182844049
},
{
"epoch": 0.5976172175249808,
"grad_norm": 3.25,
"learning_rate": 7.622826707972883e-06,
"loss": 0.23129682540893554,
"step": 4665,
"token_acc": 0.9086430423509075
},
{
"epoch": 0.598257750448373,
"grad_norm": 2.703125,
"learning_rate": 7.602255274146767e-06,
"loss": 0.2353008508682251,
"step": 4670,
"token_acc": 0.9068710222106767
},
{
"epoch": 0.5988982833717653,
"grad_norm": 2.875,
"learning_rate": 7.58169460051772e-06,
"loss": 0.2389591693878174,
"step": 4675,
"token_acc": 0.9058991190188288
},
{
"epoch": 0.5995388162951576,
"grad_norm": 3.078125,
"learning_rate": 7.561144779354483e-06,
"loss": 0.23087067604064943,
"step": 4680,
"token_acc": 0.9091809064692463
},
{
"epoch": 0.6001793492185499,
"grad_norm": 5.78125,
"learning_rate": 7.540605902877108e-06,
"loss": 0.2390049457550049,
"step": 4685,
"token_acc": 0.9069245380763253
},
{
"epoch": 0.600819882141942,
"grad_norm": 2.9375,
"learning_rate": 7.520078063256517e-06,
"loss": 0.23379735946655272,
"step": 4690,
"token_acc": 0.9081875135018362
},
{
"epoch": 0.6014604150653343,
"grad_norm": 3.5,
"learning_rate": 7.4995613526141156e-06,
"loss": 0.2288158893585205,
"step": 4695,
"token_acc": 0.9082493403123243
},
{
"epoch": 0.6021009479887266,
"grad_norm": 3.515625,
"learning_rate": 7.47905586302136e-06,
"loss": 0.23635220527648926,
"step": 4700,
"token_acc": 0.908278174159718
},
{
"epoch": 0.6021009479887266,
"eval_loss": 0.33331820368766785,
"eval_runtime": 103.2322,
"eval_samples_per_second": 96.869,
"eval_steps_per_second": 12.109,
"eval_token_acc": 0.8817569251042295,
"step": 4700
},
{
"epoch": 0.6027414809121189,
"grad_norm": 3.03125,
"learning_rate": 7.458561686499345e-06,
"loss": 0.22352910041809082,
"step": 4705,
"token_acc": 0.9113029146426093
},
{
"epoch": 0.6033820138355112,
"grad_norm": 18.75,
"learning_rate": 7.438078915018409e-06,
"loss": 0.22866015434265136,
"step": 4710,
"token_acc": 0.9103150625809237
},
{
"epoch": 0.6040225467589034,
"grad_norm": 2.953125,
"learning_rate": 7.417607640497697e-06,
"loss": 0.23653111457824708,
"step": 4715,
"token_acc": 0.9067353067353068
},
{
"epoch": 0.6046630796822957,
"grad_norm": 4.40625,
"learning_rate": 7.397147954804771e-06,
"loss": 0.23970022201538085,
"step": 4720,
"token_acc": 0.9069306076680899
},
{
"epoch": 0.6053036126056879,
"grad_norm": 5.3125,
"learning_rate": 7.376699949755176e-06,
"loss": 0.2359128475189209,
"step": 4725,
"token_acc": 0.9068213176957571
},
{
"epoch": 0.6059441455290802,
"grad_norm": 3.3125,
"learning_rate": 7.356263717112047e-06,
"loss": 0.23450722694396972,
"step": 4730,
"token_acc": 0.9093179469514295
},
{
"epoch": 0.6065846784524724,
"grad_norm": 3.03125,
"learning_rate": 7.335839348585676e-06,
"loss": 0.23415303230285645,
"step": 4735,
"token_acc": 0.9075492812257898
},
{
"epoch": 0.6072252113758647,
"grad_norm": 2.921875,
"learning_rate": 7.315426935833135e-06,
"loss": 0.22811522483825683,
"step": 4740,
"token_acc": 0.9106996417627001
},
{
"epoch": 0.607865744299257,
"grad_norm": 3.453125,
"learning_rate": 7.29502657045782e-06,
"loss": 0.23572731018066406,
"step": 4745,
"token_acc": 0.9084832017941088
},
{
"epoch": 0.6085062772226493,
"grad_norm": 2.921875,
"learning_rate": 7.274638344009079e-06,
"loss": 0.22873611450195314,
"step": 4750,
"token_acc": 0.9101705895055063
},
{
"epoch": 0.6091468101460416,
"grad_norm": 2.953125,
"learning_rate": 7.254262347981777e-06,
"loss": 0.23314647674560546,
"step": 4755,
"token_acc": 0.9090830933241628
},
{
"epoch": 0.6097873430694337,
"grad_norm": 3.078125,
"learning_rate": 7.233898673815891e-06,
"loss": 0.2401879072189331,
"step": 4760,
"token_acc": 0.9065476960213802
},
{
"epoch": 0.610427875992826,
"grad_norm": 4.625,
"learning_rate": 7.213547412896116e-06,
"loss": 0.23366336822509765,
"step": 4765,
"token_acc": 0.9075221619760737
},
{
"epoch": 0.6110684089162183,
"grad_norm": 2.921875,
"learning_rate": 7.193208656551419e-06,
"loss": 0.22110800743103026,
"step": 4770,
"token_acc": 0.9112323547241707
},
{
"epoch": 0.6117089418396106,
"grad_norm": 3.328125,
"learning_rate": 7.172882496054675e-06,
"loss": 0.22980756759643556,
"step": 4775,
"token_acc": 0.9108833830587625
},
{
"epoch": 0.6123494747630028,
"grad_norm": 8.6875,
"learning_rate": 7.152569022622213e-06,
"loss": 0.238081693649292,
"step": 4780,
"token_acc": 0.9061473283762753
},
{
"epoch": 0.6129900076863951,
"grad_norm": 3.03125,
"learning_rate": 7.1322683274134405e-06,
"loss": 0.23080739974975586,
"step": 4785,
"token_acc": 0.9094276239286792
},
{
"epoch": 0.6136305406097874,
"grad_norm": 3.65625,
"learning_rate": 7.111980501530413e-06,
"loss": 0.23122069835662842,
"step": 4790,
"token_acc": 0.9078907501190322
},
{
"epoch": 0.6142710735331796,
"grad_norm": 3.03125,
"learning_rate": 7.091705636017443e-06,
"loss": 0.23598337173461914,
"step": 4795,
"token_acc": 0.9072490063936409
},
{
"epoch": 0.6149116064565718,
"grad_norm": 5.1875,
"learning_rate": 7.071443821860664e-06,
"loss": 0.23058700561523438,
"step": 4800,
"token_acc": 0.9092871637666767
},
{
"epoch": 0.6149116064565718,
"eval_loss": 0.33330273628234863,
"eval_runtime": 103.0939,
"eval_samples_per_second": 96.999,
"eval_steps_per_second": 12.125,
"eval_token_acc": 0.882147266777771,
"step": 4800
},
{
"epoch": 0.6155521393799641,
"grad_norm": 3.4375,
"learning_rate": 7.051195149987662e-06,
"loss": 0.23541276454925536,
"step": 4805,
"token_acc": 0.9070878340577527
},
{
"epoch": 0.6161926723033564,
"grad_norm": 3.578125,
"learning_rate": 7.030959711267026e-06,
"loss": 0.24111108779907225,
"step": 4810,
"token_acc": 0.9057868281995103
},
{
"epoch": 0.6168332052267487,
"grad_norm": 2.75,
"learning_rate": 7.010737596507975e-06,
"loss": 0.2280668020248413,
"step": 4815,
"token_acc": 0.9101400414937759
},
{
"epoch": 0.617473738150141,
"grad_norm": 3.171875,
"learning_rate": 6.990528896459922e-06,
"loss": 0.23039546012878417,
"step": 4820,
"token_acc": 0.9103489771359807
},
{
"epoch": 0.6181142710735332,
"grad_norm": 3.328125,
"learning_rate": 6.9703337018120845e-06,
"loss": 0.233514666557312,
"step": 4825,
"token_acc": 0.9083129058616093
},
{
"epoch": 0.6187548039969254,
"grad_norm": 4.4375,
"learning_rate": 6.9501521031930816e-06,
"loss": 0.23697190284729003,
"step": 4830,
"token_acc": 0.9069807427785419
},
{
"epoch": 0.6193953369203177,
"grad_norm": 2.953125,
"learning_rate": 6.9299841911705e-06,
"loss": 0.23227353096008302,
"step": 4835,
"token_acc": 0.9092987147416545
},
{
"epoch": 0.62003586984371,
"grad_norm": 3.5625,
"learning_rate": 6.909830056250527e-06,
"loss": 0.23467817306518554,
"step": 4840,
"token_acc": 0.909126180109497
},
{
"epoch": 0.6206764027671022,
"grad_norm": 3.125,
"learning_rate": 6.889689788877505e-06,
"loss": 0.22795021533966064,
"step": 4845,
"token_acc": 0.9109432333261386
},
{
"epoch": 0.6213169356904945,
"grad_norm": 2.609375,
"learning_rate": 6.869563479433555e-06,
"loss": 0.23201088905334472,
"step": 4850,
"token_acc": 0.9089618990281242
},
{
"epoch": 0.6219574686138868,
"grad_norm": 2.875,
"learning_rate": 6.849451218238152e-06,
"loss": 0.23549177646636962,
"step": 4855,
"token_acc": 0.9081416921948483
},
{
"epoch": 0.6225980015372791,
"grad_norm": 2.75,
"learning_rate": 6.82935309554774e-06,
"loss": 0.22994422912597656,
"step": 4860,
"token_acc": 0.9110630942091617
},
{
"epoch": 0.6232385344606712,
"grad_norm": 3.03125,
"learning_rate": 6.8092692015552984e-06,
"loss": 0.22758188247680664,
"step": 4865,
"token_acc": 0.9083592938733126
},
{
"epoch": 0.6238790673840635,
"grad_norm": 3.375,
"learning_rate": 6.789199626389971e-06,
"loss": 0.22297306060791017,
"step": 4870,
"token_acc": 0.913397067093481
},
{
"epoch": 0.6245196003074558,
"grad_norm": 3.5,
"learning_rate": 6.7691444601166255e-06,
"loss": 0.2313997268676758,
"step": 4875,
"token_acc": 0.9092045160734293
},
{
"epoch": 0.6251601332308481,
"grad_norm": 4.125,
"learning_rate": 6.749103792735481e-06,
"loss": 0.236191987991333,
"step": 4880,
"token_acc": 0.9096788100883811
},
{
"epoch": 0.6258006661542403,
"grad_norm": 2.984375,
"learning_rate": 6.729077714181692e-06,
"loss": 0.2335993766784668,
"step": 4885,
"token_acc": 0.9090713486530683
},
{
"epoch": 0.6264411990776326,
"grad_norm": 2.671875,
"learning_rate": 6.709066314324929e-06,
"loss": 0.23459205627441407,
"step": 4890,
"token_acc": 0.9073938032064301
},
{
"epoch": 0.6270817320010249,
"grad_norm": 3.359375,
"learning_rate": 6.689069682969009e-06,
"loss": 0.2288151502609253,
"step": 4895,
"token_acc": 0.9099611901681759
},
{
"epoch": 0.6277222649244171,
"grad_norm": 3.203125,
"learning_rate": 6.669087909851459e-06,
"loss": 0.23342595100402833,
"step": 4900,
"token_acc": 0.9083329752030599
},
{
"epoch": 0.6277222649244171,
"eval_loss": 0.33611026406288147,
"eval_runtime": 102.7976,
"eval_samples_per_second": 97.278,
"eval_steps_per_second": 12.16,
"eval_token_acc": 0.8819313330860247,
"step": 4900
},
{
"epoch": 0.6283627978478094,
"grad_norm": 9.75,
"learning_rate": 6.649121084643133e-06,
"loss": 0.2269826889038086,
"step": 4905,
"token_acc": 0.9081998359169221
},
{
"epoch": 0.6290033307712016,
"grad_norm": 3.265625,
"learning_rate": 6.629169296947804e-06,
"loss": 0.2403498649597168,
"step": 4910,
"token_acc": 0.9052165312002752
},
{
"epoch": 0.6296438636945939,
"grad_norm": 3.234375,
"learning_rate": 6.6092326363017635e-06,
"loss": 0.23246257305145263,
"step": 4915,
"token_acc": 0.9084171289875174
},
{
"epoch": 0.6302843966179862,
"grad_norm": 3.109375,
"learning_rate": 6.589311192173414e-06,
"loss": 0.228167724609375,
"step": 4920,
"token_acc": 0.910772955213557
},
{
"epoch": 0.6309249295413785,
"grad_norm": 9.5,
"learning_rate": 6.5694050539628805e-06,
"loss": 0.2342754125595093,
"step": 4925,
"token_acc": 0.9082616179001721
},
{
"epoch": 0.6315654624647707,
"grad_norm": 3.15625,
"learning_rate": 6.549514311001587e-06,
"loss": 0.23288652896881104,
"step": 4930,
"token_acc": 0.9084992673045427
},
{
"epoch": 0.6322059953881629,
"grad_norm": 7.25,
"learning_rate": 6.529639052551886e-06,
"loss": 0.23185653686523439,
"step": 4935,
"token_acc": 0.911041091160221
},
{
"epoch": 0.6328465283115552,
"grad_norm": 3.1875,
"learning_rate": 6.509779367806625e-06,
"loss": 0.23133904933929444,
"step": 4940,
"token_acc": 0.9111350884764782
},
{
"epoch": 0.6334870612349475,
"grad_norm": 4.3125,
"learning_rate": 6.489935345888774e-06,
"loss": 0.22587313652038574,
"step": 4945,
"token_acc": 0.9098948272161408
},
{
"epoch": 0.6341275941583397,
"grad_norm": 3.109375,
"learning_rate": 6.470107075851011e-06,
"loss": 0.2315220832824707,
"step": 4950,
"token_acc": 0.9086874084288546
},
{
"epoch": 0.634768127081732,
"grad_norm": 2.640625,
"learning_rate": 6.450294646675319e-06,
"loss": 0.22459986209869384,
"step": 4955,
"token_acc": 0.9098201578470695
},
{
"epoch": 0.6354086600051243,
"grad_norm": 2.796875,
"learning_rate": 6.430498147272607e-06,
"loss": 0.2365894317626953,
"step": 4960,
"token_acc": 0.9069646344109351
},
{
"epoch": 0.6360491929285165,
"grad_norm": 7.15625,
"learning_rate": 6.41071766648228e-06,
"loss": 0.2363147735595703,
"step": 4965,
"token_acc": 0.9058361730578441
},
{
"epoch": 0.6366897258519087,
"grad_norm": 2.875,
"learning_rate": 6.390953293071871e-06,
"loss": 0.22636122703552247,
"step": 4970,
"token_acc": 0.9104109944249967
},
{
"epoch": 0.637330258775301,
"grad_norm": 3.265625,
"learning_rate": 6.371205115736618e-06,
"loss": 0.22853505611419678,
"step": 4975,
"token_acc": 0.9105326667815894
},
{
"epoch": 0.6379707916986933,
"grad_norm": 4.125,
"learning_rate": 6.351473223099089e-06,
"loss": 0.23797154426574707,
"step": 4980,
"token_acc": 0.9067096774193548
},
{
"epoch": 0.6386113246220856,
"grad_norm": 2.78125,
"learning_rate": 6.33175770370876e-06,
"loss": 0.23546228408813477,
"step": 4985,
"token_acc": 0.9074697754749568
},
{
"epoch": 0.6392518575454779,
"grad_norm": 5.90625,
"learning_rate": 6.3120586460416454e-06,
"loss": 0.22152302265167237,
"step": 4990,
"token_acc": 0.9129477772982305
},
{
"epoch": 0.6398923904688701,
"grad_norm": 3.0625,
"learning_rate": 6.292376138499865e-06,
"loss": 0.23244686126708985,
"step": 4995,
"token_acc": 0.9084896688856229
},
{
"epoch": 0.6405329233922623,
"grad_norm": 4.4375,
"learning_rate": 6.272710269411286e-06,
"loss": 0.2383200168609619,
"step": 5000,
"token_acc": 0.9069365908404196
},
{
"epoch": 0.6405329233922623,
"eval_loss": 0.3339642584323883,
"eval_runtime": 102.7379,
"eval_samples_per_second": 97.335,
"eval_steps_per_second": 12.167,
"eval_token_acc": 0.8819368698473515,
"step": 5000
},
{
"epoch": 0.6411734563156546,
"grad_norm": 3.203125,
"learning_rate": 6.2530611270290935e-06,
"loss": 0.22576665878295898,
"step": 5005,
"token_acc": 0.9115372986048119
},
{
"epoch": 0.6418139892390469,
"grad_norm": 3.078125,
"learning_rate": 6.23342879953142e-06,
"loss": 0.23447873592376708,
"step": 5010,
"token_acc": 0.9083354860931715
},
{
"epoch": 0.6424545221624391,
"grad_norm": 3.375,
"learning_rate": 6.2138133750209425e-06,
"loss": 0.223459792137146,
"step": 5015,
"token_acc": 0.9112359550561798
},
{
"epoch": 0.6430950550858314,
"grad_norm": 2.828125,
"learning_rate": 6.19421494152447e-06,
"loss": 0.22827987670898436,
"step": 5020,
"token_acc": 0.9120494424755813
},
{
"epoch": 0.6437355880092237,
"grad_norm": 3.40625,
"learning_rate": 6.174633586992569e-06,
"loss": 0.22968311309814454,
"step": 5025,
"token_acc": 0.9102702236779727
},
{
"epoch": 0.644376120932616,
"grad_norm": 3.859375,
"learning_rate": 6.155069399299163e-06,
"loss": 0.23479781150817872,
"step": 5030,
"token_acc": 0.9084179721122396
},
{
"epoch": 0.6450166538560081,
"grad_norm": 2.890625,
"learning_rate": 6.1355224662411375e-06,
"loss": 0.2318052291870117,
"step": 5035,
"token_acc": 0.909961603175288
},
{
"epoch": 0.6456571867794004,
"grad_norm": 3.703125,
"learning_rate": 6.115992875537937e-06,
"loss": 0.23980298042297363,
"step": 5040,
"token_acc": 0.9071697134707637
},
{
"epoch": 0.6462977197027927,
"grad_norm": 3.40625,
"learning_rate": 6.096480714831197e-06,
"loss": 0.22896120548248292,
"step": 5045,
"token_acc": 0.9098480925254617
},
{
"epoch": 0.646938252626185,
"grad_norm": 3.015625,
"learning_rate": 6.076986071684313e-06,
"loss": 0.22948775291442872,
"step": 5050,
"token_acc": 0.9118307426597582
},
{
"epoch": 0.6475787855495773,
"grad_norm": 2.9375,
"learning_rate": 6.057509033582087e-06,
"loss": 0.23411431312561035,
"step": 5055,
"token_acc": 0.9089658138034831
},
{
"epoch": 0.6482193184729695,
"grad_norm": 3.234375,
"learning_rate": 6.038049687930303e-06,
"loss": 0.22734377384185792,
"step": 5060,
"token_acc": 0.9120784583620096
},
{
"epoch": 0.6488598513963618,
"grad_norm": 3.078125,
"learning_rate": 6.018608122055352e-06,
"loss": 0.21841344833374024,
"step": 5065,
"token_acc": 0.9142105036033314
},
{
"epoch": 0.649500384319754,
"grad_norm": 3.21875,
"learning_rate": 5.9991844232038385e-06,
"loss": 0.23631734848022462,
"step": 5070,
"token_acc": 0.9079987900263602
},
{
"epoch": 0.6501409172431463,
"grad_norm": 5.9375,
"learning_rate": 5.9797786785421806e-06,
"loss": 0.22841448783874513,
"step": 5075,
"token_acc": 0.9127186352839559
},
{
"epoch": 0.6507814501665385,
"grad_norm": 3.265625,
"learning_rate": 5.960390975156234e-06,
"loss": 0.2350531816482544,
"step": 5080,
"token_acc": 0.9096751930293749
},
{
"epoch": 0.6514219830899308,
"grad_norm": 3.15625,
"learning_rate": 5.94102140005088e-06,
"loss": 0.23367114067077638,
"step": 5085,
"token_acc": 0.9095145631067961
},
{
"epoch": 0.6520625160133231,
"grad_norm": 3.234375,
"learning_rate": 5.921670040149655e-06,
"loss": 0.2327101230621338,
"step": 5090,
"token_acc": 0.9080370942812983
},
{
"epoch": 0.6527030489367154,
"grad_norm": 4.0,
"learning_rate": 5.902336982294346e-06,
"loss": 0.22089247703552245,
"step": 5095,
"token_acc": 0.9123602296766394
},
{
"epoch": 0.6533435818601077,
"grad_norm": 5.75,
"learning_rate": 5.88302231324462e-06,
"loss": 0.23006696701049806,
"step": 5100,
"token_acc": 0.9100451710045171
},
{
"epoch": 0.6533435818601077,
"eval_loss": 0.3327247202396393,
"eval_runtime": 103.2522,
"eval_samples_per_second": 96.85,
"eval_steps_per_second": 12.106,
"eval_token_acc": 0.8824545570314101,
"step": 5100
},
{
"epoch": 0.6539841147834998,
"grad_norm": 3.515625,
"learning_rate": 5.863726119677602e-06,
"loss": 0.23648326396942138,
"step": 5105,
"token_acc": 0.9088638125592213
},
{
"epoch": 0.6546246477068921,
"grad_norm": 2.65625,
"learning_rate": 5.844448488187526e-06,
"loss": 0.22581000328063966,
"step": 5110,
"token_acc": 0.9106820331985895
},
{
"epoch": 0.6552651806302844,
"grad_norm": 2.9375,
"learning_rate": 5.825189505285308e-06,
"loss": 0.2255998134613037,
"step": 5115,
"token_acc": 0.9123876210235131
},
{
"epoch": 0.6559057135536767,
"grad_norm": 5.0,
"learning_rate": 5.805949257398195e-06,
"loss": 0.23895587921142578,
"step": 5120,
"token_acc": 0.9072859041982932
},
{
"epoch": 0.6565462464770689,
"grad_norm": 2.9375,
"learning_rate": 5.786727830869337e-06,
"loss": 0.2289639711380005,
"step": 5125,
"token_acc": 0.9104838361603868
},
{
"epoch": 0.6571867794004612,
"grad_norm": 16.75,
"learning_rate": 5.767525311957441e-06,
"loss": 0.22975871562957764,
"step": 5130,
"token_acc": 0.9101954341058457
},
{
"epoch": 0.6578273123238535,
"grad_norm": 12.4375,
"learning_rate": 5.748341786836353e-06,
"loss": 0.23110666275024414,
"step": 5135,
"token_acc": 0.910392569978931
},
{
"epoch": 0.6584678452472457,
"grad_norm": 2.65625,
"learning_rate": 5.729177341594674e-06,
"loss": 0.23442704677581788,
"step": 5140,
"token_acc": 0.9096712966957122
},
{
"epoch": 0.6591083781706379,
"grad_norm": 2.828125,
"learning_rate": 5.710032062235404e-06,
"loss": 0.23014814853668214,
"step": 5145,
"token_acc": 0.9096832657288341
},
{
"epoch": 0.6597489110940302,
"grad_norm": 3.03125,
"learning_rate": 5.690906034675505e-06,
"loss": 0.2316150188446045,
"step": 5150,
"token_acc": 0.9095728632386535
},
{
"epoch": 0.6603894440174225,
"grad_norm": 2.796875,
"learning_rate": 5.671799344745577e-06,
"loss": 0.22539763450622557,
"step": 5155,
"token_acc": 0.9121212121212121
},
{
"epoch": 0.6610299769408148,
"grad_norm": 2.484375,
"learning_rate": 5.652712078189408e-06,
"loss": 0.23087406158447266,
"step": 5160,
"token_acc": 0.9089225734217552
},
{
"epoch": 0.6616705098642071,
"grad_norm": 3.453125,
"learning_rate": 5.633644320663638e-06,
"loss": 0.2334925651550293,
"step": 5165,
"token_acc": 0.9076453650780008
},
{
"epoch": 0.6623110427875993,
"grad_norm": 2.96875,
"learning_rate": 5.614596157737357e-06,
"loss": 0.22363200187683105,
"step": 5170,
"token_acc": 0.9122427805637212
},
{
"epoch": 0.6629515757109915,
"grad_norm": 3.640625,
"learning_rate": 5.5955676748917195e-06,
"loss": 0.2343050003051758,
"step": 5175,
"token_acc": 0.9070922598479613
},
{
"epoch": 0.6635921086343838,
"grad_norm": 3.484375,
"learning_rate": 5.57655895751956e-06,
"loss": 0.23191659450531005,
"step": 5180,
"token_acc": 0.9086604683195593
},
{
"epoch": 0.6642326415577761,
"grad_norm": 2.9375,
"learning_rate": 5.557570090925019e-06,
"loss": 0.22515347003936767,
"step": 5185,
"token_acc": 0.9112387698686939
},
{
"epoch": 0.6648731744811683,
"grad_norm": 3.828125,
"learning_rate": 5.538601160323147e-06,
"loss": 0.23082191944122316,
"step": 5190,
"token_acc": 0.9089143103820418
},
{
"epoch": 0.6655137074045606,
"grad_norm": 3.8125,
"learning_rate": 5.519652250839537e-06,
"loss": 0.22431740760803223,
"step": 5195,
"token_acc": 0.912159537272845
},
{
"epoch": 0.6661542403279529,
"grad_norm": 3.234375,
"learning_rate": 5.500723447509925e-06,
"loss": 0.23847784996032714,
"step": 5200,
"token_acc": 0.9073309241094476
},
{
"epoch": 0.6661542403279529,
"eval_loss": 0.3346344828605652,
"eval_runtime": 103.736,
"eval_samples_per_second": 96.399,
"eval_steps_per_second": 12.05,
"eval_token_acc": 0.8817237045362686,
"step": 5200
},
{
"epoch": 0.6667947732513452,
"grad_norm": 3.265625,
"learning_rate": 5.48181483527983e-06,
"loss": 0.24246997833251954,
"step": 5205,
"token_acc": 0.9048132493746226
},
{
"epoch": 0.6674353061747373,
"grad_norm": 2.96875,
"learning_rate": 5.462926499004148e-06,
"loss": 0.23247838020324707,
"step": 5210,
"token_acc": 0.9081055404413352
},
{
"epoch": 0.6680758390981296,
"grad_norm": 4.0,
"learning_rate": 5.4440585234467935e-06,
"loss": 0.2290191411972046,
"step": 5215,
"token_acc": 0.9098120365580272
},
{
"epoch": 0.6687163720215219,
"grad_norm": 2.625,
"learning_rate": 5.425210993280306e-06,
"loss": 0.22439243793487548,
"step": 5220,
"token_acc": 0.9100359509680773
},
{
"epoch": 0.6693569049449142,
"grad_norm": 5.3125,
"learning_rate": 5.406383993085471e-06,
"loss": 0.22781476974487305,
"step": 5225,
"token_acc": 0.910941475826972
},
{
"epoch": 0.6699974378683065,
"grad_norm": 2.828125,
"learning_rate": 5.387577607350951e-06,
"loss": 0.2305924892425537,
"step": 5230,
"token_acc": 0.9094285837688421
},
{
"epoch": 0.6706379707916987,
"grad_norm": 4.125,
"learning_rate": 5.368791920472884e-06,
"loss": 0.2318443775177002,
"step": 5235,
"token_acc": 0.9093386392144989
},
{
"epoch": 0.6712785037150909,
"grad_norm": 3.84375,
"learning_rate": 5.35002701675454e-06,
"loss": 0.2296751022338867,
"step": 5240,
"token_acc": 0.9120812882114872
},
{
"epoch": 0.6719190366384832,
"grad_norm": 2.734375,
"learning_rate": 5.331282980405896e-06,
"loss": 0.2311159610748291,
"step": 5245,
"token_acc": 0.9103889922547704
},
{
"epoch": 0.6725595695618755,
"grad_norm": 4.03125,
"learning_rate": 5.3125598955433145e-06,
"loss": 0.23089895248413086,
"step": 5250,
"token_acc": 0.909507544640927
},
{
"epoch": 0.6732001024852677,
"grad_norm": 2.84375,
"learning_rate": 5.293857846189108e-06,
"loss": 0.23441662788391113,
"step": 5255,
"token_acc": 0.9084364357460016
},
{
"epoch": 0.67384063540866,
"grad_norm": 2.96875,
"learning_rate": 5.275176916271197e-06,
"loss": 0.2311511754989624,
"step": 5260,
"token_acc": 0.9103763417683322
},
{
"epoch": 0.6744811683320523,
"grad_norm": 2.734375,
"learning_rate": 5.256517189622742e-06,
"loss": 0.23376543521881105,
"step": 5265,
"token_acc": 0.9086750107898144
},
{
"epoch": 0.6751217012554446,
"grad_norm": 2.609375,
"learning_rate": 5.237878749981724e-06,
"loss": 0.22374820709228516,
"step": 5270,
"token_acc": 0.912248865845755
},
{
"epoch": 0.6757622341788367,
"grad_norm": 3.859375,
"learning_rate": 5.219261680990624e-06,
"loss": 0.22372374534606934,
"step": 5275,
"token_acc": 0.9098982583204
},
{
"epoch": 0.676402767102229,
"grad_norm": 3.84375,
"learning_rate": 5.200666066195993e-06,
"loss": 0.22683911323547362,
"step": 5280,
"token_acc": 0.9123762590239053
},
{
"epoch": 0.6770433000256213,
"grad_norm": 3.046875,
"learning_rate": 5.182091989048121e-06,
"loss": 0.22960472106933594,
"step": 5285,
"token_acc": 0.9087181700474752
},
{
"epoch": 0.6776838329490136,
"grad_norm": 3.21875,
"learning_rate": 5.163539532900639e-06,
"loss": 0.23558075428009034,
"step": 5290,
"token_acc": 0.9076750989502668
},
{
"epoch": 0.6783243658724059,
"grad_norm": 9.5,
"learning_rate": 5.14500878101015e-06,
"loss": 0.23191981315612792,
"step": 5295,
"token_acc": 0.9099460625674218
},
{
"epoch": 0.6789648987957981,
"grad_norm": 2.765625,
"learning_rate": 5.126499816535861e-06,
"loss": 0.22278881072998047,
"step": 5300,
"token_acc": 0.9129082426127527
},
{
"epoch": 0.6789648987957981,
"eval_loss": 0.3326459527015686,
"eval_runtime": 102.5695,
"eval_samples_per_second": 97.495,
"eval_steps_per_second": 12.187,
"eval_token_acc": 0.8820392999318979,
"step": 5300
},
{
"epoch": 0.6796054317191904,
"grad_norm": 5.0,
"learning_rate": 5.108012722539199e-06,
"loss": 0.22774300575256348,
"step": 5305,
"token_acc": 0.910229284511421
},
{
"epoch": 0.6802459646425826,
"grad_norm": 4.5,
"learning_rate": 5.0895475819834474e-06,
"loss": 0.23403663635253907,
"step": 5310,
"token_acc": 0.9082355973707952
},
{
"epoch": 0.6808864975659749,
"grad_norm": 4.25,
"learning_rate": 5.071104477733372e-06,
"loss": 0.23252689838409424,
"step": 5315,
"token_acc": 0.9085381630012936
},
{
"epoch": 0.6815270304893671,
"grad_norm": 2.96875,
"learning_rate": 5.052683492554844e-06,
"loss": 0.23012104034423828,
"step": 5320,
"token_acc": 0.9094350987394054
},
{
"epoch": 0.6821675634127594,
"grad_norm": 5.03125,
"learning_rate": 5.034284709114476e-06,
"loss": 0.2321260929107666,
"step": 5325,
"token_acc": 0.9089814695386732
},
{
"epoch": 0.6828080963361517,
"grad_norm": 7.90625,
"learning_rate": 5.0159082099792465e-06,
"loss": 0.22481832504272461,
"step": 5330,
"token_acc": 0.9132952973720608
},
{
"epoch": 0.683448629259544,
"grad_norm": 2.890625,
"learning_rate": 4.997554077616128e-06,
"loss": 0.2297644853591919,
"step": 5335,
"token_acc": 0.9089265731255918
},
{
"epoch": 0.6840891621829363,
"grad_norm": 3.34375,
"learning_rate": 4.979222394391721e-06,
"loss": 0.22588052749633789,
"step": 5340,
"token_acc": 0.911449325492909
},
{
"epoch": 0.6847296951063284,
"grad_norm": 3.515625,
"learning_rate": 4.960913242571882e-06,
"loss": 0.22864861488342286,
"step": 5345,
"token_acc": 0.9100142014890046
},
{
"epoch": 0.6853702280297207,
"grad_norm": 2.953125,
"learning_rate": 4.9426267043213594e-06,
"loss": 0.23536896705627441,
"step": 5350,
"token_acc": 0.9080583865952668
},
{
"epoch": 0.686010760953113,
"grad_norm": 2.96875,
"learning_rate": 4.924362861703405e-06,
"loss": 0.22937750816345215,
"step": 5355,
"token_acc": 0.9104786545924968
},
{
"epoch": 0.6866512938765053,
"grad_norm": 2.953125,
"learning_rate": 4.906121796679445e-06,
"loss": 0.2339865207672119,
"step": 5360,
"token_acc": 0.9082639996551278
},
{
"epoch": 0.6872918267998975,
"grad_norm": 3.046875,
"learning_rate": 4.887903591108663e-06,
"loss": 0.23555207252502441,
"step": 5365,
"token_acc": 0.9080790717662226
},
{
"epoch": 0.6879323597232898,
"grad_norm": 3.546875,
"learning_rate": 4.869708326747681e-06,
"loss": 0.2278905391693115,
"step": 5370,
"token_acc": 0.9106388481765669
},
{
"epoch": 0.6885728926466821,
"grad_norm": 3.671875,
"learning_rate": 4.8515360852501496e-06,
"loss": 0.22571067810058593,
"step": 5375,
"token_acc": 0.9102165846923808
},
{
"epoch": 0.6892134255700743,
"grad_norm": 6.8125,
"learning_rate": 4.833386948166409e-06,
"loss": 0.23547790050506592,
"step": 5380,
"token_acc": 0.9070218543902755
},
{
"epoch": 0.6898539584934665,
"grad_norm": 6.1875,
"learning_rate": 4.815260996943126e-06,
"loss": 0.23141322135925294,
"step": 5385,
"token_acc": 0.9082988267770876
},
{
"epoch": 0.6904944914168588,
"grad_norm": 4.125,
"learning_rate": 4.797158312922895e-06,
"loss": 0.2272815227508545,
"step": 5390,
"token_acc": 0.9105712070302404
},
{
"epoch": 0.6911350243402511,
"grad_norm": 3.671875,
"learning_rate": 4.779078977343922e-06,
"loss": 0.22905888557434081,
"step": 5395,
"token_acc": 0.9104548394050442
},
{
"epoch": 0.6917755572636434,
"grad_norm": 3.46875,
"learning_rate": 4.761023071339608e-06,
"loss": 0.22437114715576173,
"step": 5400,
"token_acc": 0.9122693567856527
},
{
"epoch": 0.6917755572636434,
"eval_loss": 0.3339126706123352,
"eval_runtime": 103.3301,
"eval_samples_per_second": 96.777,
"eval_steps_per_second": 12.097,
"eval_token_acc": 0.8821528035390979,
"step": 5400
},
{
"epoch": 0.6924160901870356,
"grad_norm": 7.34375,
"learning_rate": 4.742990675938228e-06,
"loss": 0.22792973518371581,
"step": 5405,
"token_acc": 0.9097371822490306
},
{
"epoch": 0.6930566231104279,
"grad_norm": 3.109375,
"learning_rate": 4.724981872062545e-06,
"loss": 0.22467894554138185,
"step": 5410,
"token_acc": 0.9115411195577056
},
{
"epoch": 0.6936971560338201,
"grad_norm": 4.875,
"learning_rate": 4.706996740529453e-06,
"loss": 0.22711763381958008,
"step": 5415,
"token_acc": 0.9103552206673843
},
{
"epoch": 0.6943376889572124,
"grad_norm": 3.546875,
"learning_rate": 4.689035362049609e-06,
"loss": 0.22871413230895996,
"step": 5420,
"token_acc": 0.9115798536375377
},
{
"epoch": 0.6949782218806047,
"grad_norm": 3.203125,
"learning_rate": 4.6710978172270794e-06,
"loss": 0.22836050987243653,
"step": 5425,
"token_acc": 0.9107814729922588
},
{
"epoch": 0.6956187548039969,
"grad_norm": 3.265625,
"learning_rate": 4.653184186558975e-06,
"loss": 0.22787034511566162,
"step": 5430,
"token_acc": 0.9093102408340873
},
{
"epoch": 0.6962592877273892,
"grad_norm": 5.4375,
"learning_rate": 4.635294550435086e-06,
"loss": 0.21838183403015138,
"step": 5435,
"token_acc": 0.9151326592342927
},
{
"epoch": 0.6968998206507815,
"grad_norm": 3.875,
"learning_rate": 4.617428989137517e-06,
"loss": 0.2287057876586914,
"step": 5440,
"token_acc": 0.9102171191781413
},
{
"epoch": 0.6975403535741738,
"grad_norm": 4.0625,
"learning_rate": 4.599587582840349e-06,
"loss": 0.23020198345184326,
"step": 5445,
"token_acc": 0.9096006210644354
},
{
"epoch": 0.6981808864975659,
"grad_norm": 2.8125,
"learning_rate": 4.581770411609254e-06,
"loss": 0.22472758293151857,
"step": 5450,
"token_acc": 0.9107643229727982
},
{
"epoch": 0.6988214194209582,
"grad_norm": 3.546875,
"learning_rate": 4.563977555401148e-06,
"loss": 0.22312564849853517,
"step": 5455,
"token_acc": 0.9104580812445981
},
{
"epoch": 0.6994619523443505,
"grad_norm": 4.21875,
"learning_rate": 4.546209094063829e-06,
"loss": 0.23030247688293456,
"step": 5460,
"token_acc": 0.9101837837837837
},
{
"epoch": 0.7001024852677428,
"grad_norm": 3.359375,
"learning_rate": 4.528465107335621e-06,
"loss": 0.22946977615356445,
"step": 5465,
"token_acc": 0.9094075156935248
},
{
"epoch": 0.700743018191135,
"grad_norm": 4.90625,
"learning_rate": 4.5107456748450206e-06,
"loss": 0.23560161590576173,
"step": 5470,
"token_acc": 0.9065336143490043
},
{
"epoch": 0.7013835511145273,
"grad_norm": 3.3125,
"learning_rate": 4.4930508761103145e-06,
"loss": 0.23189268112182618,
"step": 5475,
"token_acc": 0.909141750914175
},
{
"epoch": 0.7020240840379196,
"grad_norm": 5.46875,
"learning_rate": 4.475380790539272e-06,
"loss": 0.2286592483520508,
"step": 5480,
"token_acc": 0.9110030970406057
},
{
"epoch": 0.7026646169613118,
"grad_norm": 3.328125,
"learning_rate": 4.457735497428728e-06,
"loss": 0.22808377742767333,
"step": 5485,
"token_acc": 0.9114810810810811
},
{
"epoch": 0.703305149884704,
"grad_norm": 3.484375,
"learning_rate": 4.4401150759642875e-06,
"loss": 0.22788479328155517,
"step": 5490,
"token_acc": 0.9103065034297126
},
{
"epoch": 0.7039456828080963,
"grad_norm": 3.71875,
"learning_rate": 4.422519605219914e-06,
"loss": 0.23326406478881836,
"step": 5495,
"token_acc": 0.9091496232508073
},
{
"epoch": 0.7045862157314886,
"grad_norm": 3.828125,
"learning_rate": 4.404949164157617e-06,
"loss": 0.23126420974731446,
"step": 5500,
"token_acc": 0.9086606720302887
},
{
"epoch": 0.7045862157314886,
"eval_loss": 0.3346463441848755,
"eval_runtime": 103.293,
"eval_samples_per_second": 96.812,
"eval_steps_per_second": 12.101,
"eval_token_acc": 0.8824379467474296,
"step": 5500
},
{
"epoch": 0.7052267486548809,
"grad_norm": 3.03125,
"learning_rate": 4.387403831627079e-06,
"loss": 0.22369828224182128,
"step": 5505,
"token_acc": 0.9125932956555503
},
{
"epoch": 0.7058672815782732,
"grad_norm": 3.46875,
"learning_rate": 4.3698836863653005e-06,
"loss": 0.23041157722473143,
"step": 5510,
"token_acc": 0.9091222179172586
},
{
"epoch": 0.7065078145016653,
"grad_norm": 14.625,
"learning_rate": 4.352388806996263e-06,
"loss": 0.2362978458404541,
"step": 5515,
"token_acc": 0.9075854931217388
},
{
"epoch": 0.7071483474250576,
"grad_norm": 2.578125,
"learning_rate": 4.334919272030547e-06,
"loss": 0.23041419982910155,
"step": 5520,
"token_acc": 0.909314147854558
},
{
"epoch": 0.7077888803484499,
"grad_norm": 2.96875,
"learning_rate": 4.317475159865005e-06,
"loss": 0.22999229431152343,
"step": 5525,
"token_acc": 0.9102149842746974
},
{
"epoch": 0.7084294132718422,
"grad_norm": 3.0625,
"learning_rate": 4.300056548782404e-06,
"loss": 0.22720894813537598,
"step": 5530,
"token_acc": 0.9110218883564305
},
{
"epoch": 0.7090699461952344,
"grad_norm": 2.96875,
"learning_rate": 4.282663516951068e-06,
"loss": 0.23367710113525392,
"step": 5535,
"token_acc": 0.9050570260383043
},
{
"epoch": 0.7097104791186267,
"grad_norm": 3.0,
"learning_rate": 4.265296142424529e-06,
"loss": 0.22929010391235352,
"step": 5540,
"token_acc": 0.9078120967048094
},
{
"epoch": 0.710351012042019,
"grad_norm": 4.03125,
"learning_rate": 4.247954503141183e-06,
"loss": 0.2340301513671875,
"step": 5545,
"token_acc": 0.9078794652452392
},
{
"epoch": 0.7109915449654112,
"grad_norm": 5.09375,
"learning_rate": 4.230638676923932e-06,
"loss": 0.2315293788909912,
"step": 5550,
"token_acc": 0.9097634408602151
},
{
"epoch": 0.7116320778888034,
"grad_norm": 3.5625,
"learning_rate": 4.213348741479847e-06,
"loss": 0.22180113792419434,
"step": 5555,
"token_acc": 0.9152183311716385
},
{
"epoch": 0.7122726108121957,
"grad_norm": 4.53125,
"learning_rate": 4.196084774399788e-06,
"loss": 0.2180586576461792,
"step": 5560,
"token_acc": 0.913914992671782
},
{
"epoch": 0.712913143735588,
"grad_norm": 11.0,
"learning_rate": 4.1788468531581065e-06,
"loss": 0.23073256015777588,
"step": 5565,
"token_acc": 0.9099750408813151
},
{
"epoch": 0.7135536766589803,
"grad_norm": 5.15625,
"learning_rate": 4.161635055112254e-06,
"loss": 0.2295978307723999,
"step": 5570,
"token_acc": 0.9100305784056161
},
{
"epoch": 0.7141942095823726,
"grad_norm": 2.84375,
"learning_rate": 4.1444494575024555e-06,
"loss": 0.23021929264068602,
"step": 5575,
"token_acc": 0.9086412318809411
},
{
"epoch": 0.7148347425057648,
"grad_norm": 3.078125,
"learning_rate": 4.1272901374513555e-06,
"loss": 0.23160245418548583,
"step": 5580,
"token_acc": 0.9093453919035315
},
{
"epoch": 0.715475275429157,
"grad_norm": 3.34375,
"learning_rate": 4.110157171963674e-06,
"loss": 0.22630250453948975,
"step": 5585,
"token_acc": 0.9112262521588946
},
{
"epoch": 0.7161158083525493,
"grad_norm": 5.09375,
"learning_rate": 4.093050637925871e-06,
"loss": 0.22265501022338868,
"step": 5590,
"token_acc": 0.9139391854113802
},
{
"epoch": 0.7167563412759416,
"grad_norm": 3.546875,
"learning_rate": 4.07597061210577e-06,
"loss": 0.22470180988311766,
"step": 5595,
"token_acc": 0.911774876804703
},
{
"epoch": 0.7173968741993338,
"grad_norm": 2.953125,
"learning_rate": 4.0589171711522626e-06,
"loss": 0.238523530960083,
"step": 5600,
"token_acc": 0.9068379005240099
},
{
"epoch": 0.7173968741993338,
"eval_loss": 0.33452367782592773,
"eval_runtime": 102.4933,
"eval_samples_per_second": 97.567,
"eval_steps_per_second": 12.196,
"eval_token_acc": 0.882180487345732,
"step": 5600
},
{
"epoch": 0.7180374071227261,
"grad_norm": 2.859375,
"learning_rate": 4.0418903915949125e-06,
"loss": 0.22467451095581054,
"step": 5605,
"token_acc": 0.9131561892417369
},
{
"epoch": 0.7186779400461184,
"grad_norm": 4.03125,
"learning_rate": 4.0248903498436624e-06,
"loss": 0.22909164428710938,
"step": 5610,
"token_acc": 0.9093295464325674
},
{
"epoch": 0.7193184729695107,
"grad_norm": 3.171875,
"learning_rate": 4.007917122188438e-06,
"loss": 0.22771682739257812,
"step": 5615,
"token_acc": 0.9116303129580137
},
{
"epoch": 0.7199590058929028,
"grad_norm": 2.921875,
"learning_rate": 3.990970784798854e-06,
"loss": 0.23022587299346925,
"step": 5620,
"token_acc": 0.9101649768001375
},
{
"epoch": 0.7205995388162951,
"grad_norm": 3.28125,
"learning_rate": 3.974051413723842e-06,
"loss": 0.23161954879760743,
"step": 5625,
"token_acc": 0.9094234079173839
},
{
"epoch": 0.7212400717396874,
"grad_norm": 4.90625,
"learning_rate": 3.957159084891318e-06,
"loss": 0.23545317649841307,
"step": 5630,
"token_acc": 0.908342315154128
},
{
"epoch": 0.7218806046630797,
"grad_norm": 3.890625,
"learning_rate": 3.940293874107854e-06,
"loss": 0.2253598690032959,
"step": 5635,
"token_acc": 0.9121528376746593
},
{
"epoch": 0.722521137586472,
"grad_norm": 3.546875,
"learning_rate": 3.923455857058311e-06,
"loss": 0.2275296449661255,
"step": 5640,
"token_acc": 0.9096149709614971
},
{
"epoch": 0.7231616705098642,
"grad_norm": 2.828125,
"learning_rate": 3.906645109305521e-06,
"loss": 0.23534011840820312,
"step": 5645,
"token_acc": 0.908305245873304
},
{
"epoch": 0.7238022034332565,
"grad_norm": 3.0625,
"learning_rate": 3.88986170628994e-06,
"loss": 0.23898892402648925,
"step": 5650,
"token_acc": 0.9062513444908145
},
{
"epoch": 0.7244427363566487,
"grad_norm": 3.109375,
"learning_rate": 3.873105723329317e-06,
"loss": 0.23146333694458007,
"step": 5655,
"token_acc": 0.908890330953926
},
{
"epoch": 0.725083269280041,
"grad_norm": 5.375,
"learning_rate": 3.856377235618341e-06,
"loss": 0.24037771224975585,
"step": 5660,
"token_acc": 0.9058757646247954
},
{
"epoch": 0.7257238022034332,
"grad_norm": 5.6875,
"learning_rate": 3.839676318228319e-06,
"loss": 0.2313528299331665,
"step": 5665,
"token_acc": 0.9094474614257392
},
{
"epoch": 0.7263643351268255,
"grad_norm": 3.90625,
"learning_rate": 3.823003046106828e-06,
"loss": 0.23002188205718993,
"step": 5670,
"token_acc": 0.9098721713594748
},
{
"epoch": 0.7270048680502178,
"grad_norm": 3.421875,
"learning_rate": 3.8063574940773907e-06,
"loss": 0.2305138111114502,
"step": 5675,
"token_acc": 0.9112459129237652
},
{
"epoch": 0.7276454009736101,
"grad_norm": 7.6875,
"learning_rate": 3.789739736839114e-06,
"loss": 0.2200489044189453,
"step": 5680,
"token_acc": 0.9114310270734852
},
{
"epoch": 0.7282859338970024,
"grad_norm": 3.109375,
"learning_rate": 3.773149848966401e-06,
"loss": 0.22987012863159179,
"step": 5685,
"token_acc": 0.9078311172509066
},
{
"epoch": 0.7289264668203945,
"grad_norm": 3.15625,
"learning_rate": 3.7565879049085562e-06,
"loss": 0.22706859111785888,
"step": 5690,
"token_acc": 0.9124580573001807
},
{
"epoch": 0.7295669997437868,
"grad_norm": 3.859375,
"learning_rate": 3.7400539789895074e-06,
"loss": 0.23126349449157715,
"step": 5695,
"token_acc": 0.9095687389599759
},
{
"epoch": 0.7302075326671791,
"grad_norm": 3.125,
"learning_rate": 3.7235481454074373e-06,
"loss": 0.2237870693206787,
"step": 5700,
"token_acc": 0.9137209201950882
},
{
"epoch": 0.7302075326671791,
"eval_loss": 0.33403199911117554,
"eval_runtime": 103.444,
"eval_samples_per_second": 96.671,
"eval_steps_per_second": 12.084,
"eval_token_acc": 0.8826123547292247,
"step": 5700
},
{
"epoch": 0.7308480655905714,
"grad_norm": 3.34375,
"learning_rate": 3.70707047823445e-06,
"loss": 0.22239408493041993,
"step": 5705,
"token_acc": 0.9109814094249892
},
{
"epoch": 0.7314885985139636,
"grad_norm": 3.09375,
"learning_rate": 3.6906210514162744e-06,
"loss": 0.23225040435791017,
"step": 5710,
"token_acc": 0.9099184606756116
},
{
"epoch": 0.7321291314373559,
"grad_norm": 3.671875,
"learning_rate": 3.6741999387718773e-06,
"loss": 0.2249077320098877,
"step": 5715,
"token_acc": 0.9111187815506753
},
{
"epoch": 0.7327696643607482,
"grad_norm": 3.953125,
"learning_rate": 3.657807213993192e-06,
"loss": 0.2272716522216797,
"step": 5720,
"token_acc": 0.9099200345796412
},
{
"epoch": 0.7334101972841404,
"grad_norm": 2.875,
"learning_rate": 3.641442950644728e-06,
"loss": 0.22431583404541017,
"step": 5725,
"token_acc": 0.910762060930353
},
{
"epoch": 0.7340507302075326,
"grad_norm": 2.875,
"learning_rate": 3.6251072221632978e-06,
"loss": 0.2208378553390503,
"step": 5730,
"token_acc": 0.9123809523809524
},
{
"epoch": 0.7346912631309249,
"grad_norm": 4.09375,
"learning_rate": 3.608800101857637e-06,
"loss": 0.22057173252105713,
"step": 5735,
"token_acc": 0.9117215168005528
},
{
"epoch": 0.7353317960543172,
"grad_norm": 9.25,
"learning_rate": 3.5925216629081116e-06,
"loss": 0.2260368824005127,
"step": 5740,
"token_acc": 0.910849706997587
},
{
"epoch": 0.7359723289777095,
"grad_norm": 3.015625,
"learning_rate": 3.5762719783663724e-06,
"loss": 0.22467224597930907,
"step": 5745,
"token_acc": 0.9124238891048063
},
{
"epoch": 0.7366128619011018,
"grad_norm": 3.015625,
"learning_rate": 3.5600511211550283e-06,
"loss": 0.2277822256088257,
"step": 5750,
"token_acc": 0.9120300427331981
},
{
"epoch": 0.7372533948244939,
"grad_norm": 2.6875,
"learning_rate": 3.5438591640673346e-06,
"loss": 0.21924290657043458,
"step": 5755,
"token_acc": 0.9147353856796956
},
{
"epoch": 0.7378939277478862,
"grad_norm": 2.8125,
"learning_rate": 3.527696179766833e-06,
"loss": 0.229719877243042,
"step": 5760,
"token_acc": 0.9090753057283845
},
{
"epoch": 0.7385344606712785,
"grad_norm": 4.75,
"learning_rate": 3.5115622407870607e-06,
"loss": 0.22485427856445311,
"step": 5765,
"token_acc": 0.9123470045093408
},
{
"epoch": 0.7391749935946708,
"grad_norm": 3.6875,
"learning_rate": 3.495457419531206e-06,
"loss": 0.2279944896697998,
"step": 5770,
"token_acc": 0.9102829537612146
},
{
"epoch": 0.739815526518063,
"grad_norm": 3.53125,
"learning_rate": 3.4793817882717863e-06,
"loss": 0.22675998210906984,
"step": 5775,
"token_acc": 0.9114967836636014
},
{
"epoch": 0.7404560594414553,
"grad_norm": 2.90625,
"learning_rate": 3.463335419150328e-06,
"loss": 0.23147711753845215,
"step": 5780,
"token_acc": 0.908021712907117
},
{
"epoch": 0.7410965923648476,
"grad_norm": 4.8125,
"learning_rate": 3.4473183841770364e-06,
"loss": 0.22812228202819823,
"step": 5785,
"token_acc": 0.9115860226636219
},
{
"epoch": 0.7417371252882398,
"grad_norm": 3.421875,
"learning_rate": 3.4313307552304785e-06,
"loss": 0.22540197372436524,
"step": 5790,
"token_acc": 0.9123684664481628
},
{
"epoch": 0.742377658211632,
"grad_norm": 4.25,
"learning_rate": 3.4153726040572612e-06,
"loss": 0.23054356575012208,
"step": 5795,
"token_acc": 0.90987696808053
},
{
"epoch": 0.7430181911350243,
"grad_norm": 3.125,
"learning_rate": 3.3994440022716902e-06,
"loss": 0.2308722972869873,
"step": 5800,
"token_acc": 0.9087033288833384
},
{
"epoch": 0.7430181911350243,
"eval_loss": 0.33602866530418396,
"eval_runtime": 106.6156,
"eval_samples_per_second": 93.795,
"eval_steps_per_second": 11.724,
"eval_token_acc": 0.8821970976297124,
"step": 5800
},
{
"epoch": 0.7436587240584166,
"grad_norm": 2.765625,
"learning_rate": 3.3835450213554887e-06,
"loss": 0.23508167266845703,
"step": 5805,
"token_acc": 0.9071099435855475
},
{
"epoch": 0.7442992569818089,
"grad_norm": 2.6875,
"learning_rate": 3.3676757326574293e-06,
"loss": 0.2318406581878662,
"step": 5810,
"token_acc": 0.9082233589820745
},
{
"epoch": 0.7449397899052012,
"grad_norm": 3.421875,
"learning_rate": 3.351836207393054e-06,
"loss": 0.2296595573425293,
"step": 5815,
"token_acc": 0.9098555100280353
},
{
"epoch": 0.7455803228285934,
"grad_norm": 3.453125,
"learning_rate": 3.3360265166443316e-06,
"loss": 0.2280057430267334,
"step": 5820,
"token_acc": 0.9113049486138699
},
{
"epoch": 0.7462208557519856,
"grad_norm": 4.875,
"learning_rate": 3.3202467313593345e-06,
"loss": 0.22925915718078613,
"step": 5825,
"token_acc": 0.9111722005068511
},
{
"epoch": 0.7468613886753779,
"grad_norm": 3.0,
"learning_rate": 3.304496922351952e-06,
"loss": 0.22095022201538086,
"step": 5830,
"token_acc": 0.9120665861652579
},
{
"epoch": 0.7475019215987702,
"grad_norm": 3.0625,
"learning_rate": 3.2887771603015237e-06,
"loss": 0.22771050930023193,
"step": 5835,
"token_acc": 0.9096476473886229
},
{
"epoch": 0.7481424545221624,
"grad_norm": 10.0625,
"learning_rate": 3.273087515752579e-06,
"loss": 0.23041772842407227,
"step": 5840,
"token_acc": 0.9116174693595719
},
{
"epoch": 0.7487829874455547,
"grad_norm": 3.75,
"learning_rate": 3.2574280591144623e-06,
"loss": 0.22076497077941895,
"step": 5845,
"token_acc": 0.9142450633784599
},
{
"epoch": 0.749423520368947,
"grad_norm": 3.953125,
"learning_rate": 3.2417988606610738e-06,
"loss": 0.2274242639541626,
"step": 5850,
"token_acc": 0.9110910575394268
},
{
"epoch": 0.7500640532923393,
"grad_norm": 3.71875,
"learning_rate": 3.2261999905304996e-06,
"loss": 0.23234589099884034,
"step": 5855,
"token_acc": 0.9092159559834938
},
{
"epoch": 0.7507045862157314,
"grad_norm": 2.90625,
"learning_rate": 3.2106315187247417e-06,
"loss": 0.2272249221801758,
"step": 5860,
"token_acc": 0.9115452624315349
},
{
"epoch": 0.7513451191391237,
"grad_norm": 3.421875,
"learning_rate": 3.1950935151093778e-06,
"loss": 0.23643298149108888,
"step": 5865,
"token_acc": 0.9063857235003225
},
{
"epoch": 0.751985652062516,
"grad_norm": 3.34375,
"learning_rate": 3.179586049413257e-06,
"loss": 0.23007550239562988,
"step": 5870,
"token_acc": 0.9103555536354603
},
{
"epoch": 0.7526261849859083,
"grad_norm": 3.46875,
"learning_rate": 3.164109191228187e-06,
"loss": 0.22181496620178223,
"step": 5875,
"token_acc": 0.912943921195887
},
{
"epoch": 0.7532667179093006,
"grad_norm": 2.703125,
"learning_rate": 3.148663010008618e-06,
"loss": 0.22368183135986328,
"step": 5880,
"token_acc": 0.9129305868097628
},
{
"epoch": 0.7539072508326928,
"grad_norm": 3.046875,
"learning_rate": 3.1332475750713352e-06,
"loss": 0.23119454383850097,
"step": 5885,
"token_acc": 0.9087276008766275
},
{
"epoch": 0.7545477837560851,
"grad_norm": 3.859375,
"learning_rate": 3.1178629555951446e-06,
"loss": 0.2248836040496826,
"step": 5890,
"token_acc": 0.9119996545619413
},
{
"epoch": 0.7551883166794773,
"grad_norm": 3.671875,
"learning_rate": 3.1025092206205642e-06,
"loss": 0.22220723628997802,
"step": 5895,
"token_acc": 0.9144963780614005
},
{
"epoch": 0.7558288496028696,
"grad_norm": 5.96875,
"learning_rate": 3.087186439049512e-06,
"loss": 0.23192427158355713,
"step": 5900,
"token_acc": 0.9082466248172671
},
{
"epoch": 0.7558288496028696,
"eval_loss": 0.3338736891746521,
"eval_runtime": 103.6094,
"eval_samples_per_second": 96.516,
"eval_steps_per_second": 12.065,
"eval_token_acc": 0.8824739356960539,
"step": 5900
},
{
"epoch": 0.7564693825262618,
"grad_norm": 5.3125,
"learning_rate": 3.0718946796450012e-06,
"loss": 0.23041715621948242,
"step": 5905,
"token_acc": 0.9090869902577808
},
{
"epoch": 0.7571099154496541,
"grad_norm": 2.6875,
"learning_rate": 3.056634011030828e-06,
"loss": 0.23436269760131836,
"step": 5910,
"token_acc": 0.9085108217642494
},
{
"epoch": 0.7577504483730464,
"grad_norm": 2.96875,
"learning_rate": 3.0414045016912673e-06,
"loss": 0.22445986270904542,
"step": 5915,
"token_acc": 0.9102779573367809
},
{
"epoch": 0.7583909812964387,
"grad_norm": 2.90625,
"learning_rate": 3.0262062199707486e-06,
"loss": 0.22754263877868652,
"step": 5920,
"token_acc": 0.910606582801999
},
{
"epoch": 0.759031514219831,
"grad_norm": 3.515625,
"learning_rate": 3.0110392340735892e-06,
"loss": 0.2298940658569336,
"step": 5925,
"token_acc": 0.9089810539035864
},
{
"epoch": 0.7596720471432231,
"grad_norm": 3.875,
"learning_rate": 2.995903612063634e-06,
"loss": 0.22265000343322755,
"step": 5930,
"token_acc": 0.9121761658031088
},
{
"epoch": 0.7603125800666154,
"grad_norm": 3.09375,
"learning_rate": 2.9807994218640035e-06,
"loss": 0.22582578659057617,
"step": 5935,
"token_acc": 0.9107952827335954
},
{
"epoch": 0.7609531129900077,
"grad_norm": 2.859375,
"learning_rate": 2.965726731256743e-06,
"loss": 0.23047933578491211,
"step": 5940,
"token_acc": 0.9101230304338441
},
{
"epoch": 0.7615936459134,
"grad_norm": 3.15625,
"learning_rate": 2.9506856078825473e-06,
"loss": 0.22990131378173828,
"step": 5945,
"token_acc": 0.9090360926867086
},
{
"epoch": 0.7622341788367922,
"grad_norm": 2.6875,
"learning_rate": 2.9356761192404616e-06,
"loss": 0.23607187271118163,
"step": 5950,
"token_acc": 0.905852417302799
},
{
"epoch": 0.7628747117601845,
"grad_norm": 3.515625,
"learning_rate": 2.9206983326875393e-06,
"loss": 0.22556428909301757,
"step": 5955,
"token_acc": 0.9103385965667082
},
{
"epoch": 0.7635152446835768,
"grad_norm": 2.671875,
"learning_rate": 2.905752315438596e-06,
"loss": 0.22193589210510253,
"step": 5960,
"token_acc": 0.9134333505776858
},
{
"epoch": 0.764155777606969,
"grad_norm": 8.5625,
"learning_rate": 2.8908381345658497e-06,
"loss": 0.22921185493469237,
"step": 5965,
"token_acc": 0.9122292224044187
},
{
"epoch": 0.7647963105303612,
"grad_norm": 3.03125,
"learning_rate": 2.875955856998677e-06,
"loss": 0.2280503749847412,
"step": 5970,
"token_acc": 0.9099413692015865
},
{
"epoch": 0.7654368434537535,
"grad_norm": 3.3125,
"learning_rate": 2.8611055495232585e-06,
"loss": 0.2285156488418579,
"step": 5975,
"token_acc": 0.9090477833362084
},
{
"epoch": 0.7660773763771458,
"grad_norm": 3.328125,
"learning_rate": 2.8462872787823213e-06,
"loss": 0.22320642471313476,
"step": 5980,
"token_acc": 0.9124087591240876
},
{
"epoch": 0.7667179093005381,
"grad_norm": 2.65625,
"learning_rate": 2.831501111274816e-06,
"loss": 0.23166375160217284,
"step": 5985,
"token_acc": 0.9094241966788872
},
{
"epoch": 0.7673584422239303,
"grad_norm": 2.6875,
"learning_rate": 2.81674711335563e-06,
"loss": 0.22401225566864014,
"step": 5990,
"token_acc": 0.9130397385171168
},
{
"epoch": 0.7679989751473226,
"grad_norm": 3.671875,
"learning_rate": 2.8020253512352814e-06,
"loss": 0.23468830585479736,
"step": 5995,
"token_acc": 0.9090440165061898
},
{
"epoch": 0.7686395080707148,
"grad_norm": 4.40625,
"learning_rate": 2.7873358909796287e-06,
"loss": 0.2302248954772949,
"step": 6000,
"token_acc": 0.9099036841877942
},
{
"epoch": 0.7686395080707148,
"eval_loss": 0.334545373916626,
"eval_runtime": 103.0056,
"eval_samples_per_second": 97.082,
"eval_steps_per_second": 12.135,
"eval_token_acc": 0.8822773806689514,
"step": 6000
},
{
"epoch": 0.7692800409941071,
"grad_norm": 2.984375,
"learning_rate": 2.7726787985095717e-06,
"loss": 0.23136000633239745,
"step": 6005,
"token_acc": 0.9077314256162731
},
{
"epoch": 0.7699205739174994,
"grad_norm": 3.890625,
"learning_rate": 2.7580541396007523e-06,
"loss": 0.22109587192535402,
"step": 6010,
"token_acc": 0.9138497449641221
},
{
"epoch": 0.7705611068408916,
"grad_norm": 3.125,
"learning_rate": 2.743461979883265e-06,
"loss": 0.2210922956466675,
"step": 6015,
"token_acc": 0.913861557051614
},
{
"epoch": 0.7712016397642839,
"grad_norm": 3.328125,
"learning_rate": 2.728902384841361e-06,
"loss": 0.22745194435119628,
"step": 6020,
"token_acc": 0.9132437785240679
},
{
"epoch": 0.7718421726876762,
"grad_norm": 3.328125,
"learning_rate": 2.71437541981315e-06,
"loss": 0.2273806095123291,
"step": 6025,
"token_acc": 0.9103077254142458
},
{
"epoch": 0.7724827056110684,
"grad_norm": 3.546875,
"learning_rate": 2.699881149990313e-06,
"loss": 0.2318946361541748,
"step": 6030,
"token_acc": 0.9076658793214516
},
{
"epoch": 0.7731232385344606,
"grad_norm": 2.8125,
"learning_rate": 2.6854196404178077e-06,
"loss": 0.22452447414398194,
"step": 6035,
"token_acc": 0.9115502437761573
},
{
"epoch": 0.7737637714578529,
"grad_norm": 10.4375,
"learning_rate": 2.6709909559935652e-06,
"loss": 0.23456428050994874,
"step": 6040,
"token_acc": 0.9073548387096774
},
{
"epoch": 0.7744043043812452,
"grad_norm": 4.3125,
"learning_rate": 2.6565951614682316e-06,
"loss": 0.22777628898620605,
"step": 6045,
"token_acc": 0.9110881364693719
},
{
"epoch": 0.7750448373046375,
"grad_norm": 13.1875,
"learning_rate": 2.6422323214448275e-06,
"loss": 0.2248152017593384,
"step": 6050,
"token_acc": 0.9121726395589249
},
{
"epoch": 0.7756853702280297,
"grad_norm": 2.953125,
"learning_rate": 2.6279025003785132e-06,
"loss": 0.2368108034133911,
"step": 6055,
"token_acc": 0.9071379369726192
},
{
"epoch": 0.776325903151422,
"grad_norm": 4.0625,
"learning_rate": 2.6136057625762503e-06,
"loss": 0.22743830680847169,
"step": 6060,
"token_acc": 0.9096523429064997
},
{
"epoch": 0.7769664360748142,
"grad_norm": 3.09375,
"learning_rate": 2.5993421721965416e-06,
"loss": 0.22994532585144042,
"step": 6065,
"token_acc": 0.9099663183349167
},
{
"epoch": 0.7776069689982065,
"grad_norm": 2.90625,
"learning_rate": 2.58511179324915e-06,
"loss": 0.22852482795715331,
"step": 6070,
"token_acc": 0.9125949585635359
},
{
"epoch": 0.7782475019215987,
"grad_norm": 2.703125,
"learning_rate": 2.5709146895947713e-06,
"loss": 0.23030381202697753,
"step": 6075,
"token_acc": 0.9101974108640488
},
{
"epoch": 0.778888034844991,
"grad_norm": 2.96875,
"learning_rate": 2.556750924944802e-06,
"loss": 0.22189459800720215,
"step": 6080,
"token_acc": 0.9149477863122465
},
{
"epoch": 0.7795285677683833,
"grad_norm": 3.5625,
"learning_rate": 2.5426205628610046e-06,
"loss": 0.22595663070678712,
"step": 6085,
"token_acc": 0.911052608864529
},
{
"epoch": 0.7801691006917756,
"grad_norm": 3.734375,
"learning_rate": 2.5285236667552503e-06,
"loss": 0.22210302352905273,
"step": 6090,
"token_acc": 0.9138005344366865
},
{
"epoch": 0.7808096336151679,
"grad_norm": 3.1875,
"learning_rate": 2.5144602998892308e-06,
"loss": 0.22484986782073973,
"step": 6095,
"token_acc": 0.9116413781178403
},
{
"epoch": 0.78145016653856,
"grad_norm": 3.578125,
"learning_rate": 2.500430525374167e-06,
"loss": 0.2381572961807251,
"step": 6100,
"token_acc": 0.9064785339413233
},
{
"epoch": 0.78145016653856,
"eval_loss": 0.3342524766921997,
"eval_runtime": 102.5935,
"eval_samples_per_second": 97.472,
"eval_steps_per_second": 12.184,
"eval_token_acc": 0.8827729208077028,
"step": 6100
},
{
"epoch": 0.7820906994619523,
"grad_norm": 4.0625,
"learning_rate": 2.486434406170529e-06,
"loss": 0.23040971755981446,
"step": 6105,
"token_acc": 0.9120883863450002
},
{
"epoch": 0.7827312323853446,
"grad_norm": 4.15625,
"learning_rate": 2.472472005087758e-06,
"loss": 0.23743114471435547,
"step": 6110,
"token_acc": 0.9058546000428909
},
{
"epoch": 0.7833717653087369,
"grad_norm": 2.515625,
"learning_rate": 2.4585433847839757e-06,
"loss": 0.2203622817993164,
"step": 6115,
"token_acc": 0.913583977208961
},
{
"epoch": 0.7840122982321291,
"grad_norm": 3.28125,
"learning_rate": 2.444648607765713e-06,
"loss": 0.2203676223754883,
"step": 6120,
"token_acc": 0.9142634112494037
},
{
"epoch": 0.7846528311555214,
"grad_norm": 3.703125,
"learning_rate": 2.430787736387621e-06,
"loss": 0.2319796562194824,
"step": 6125,
"token_acc": 0.9104683790200734
},
{
"epoch": 0.7852933640789137,
"grad_norm": 3.578125,
"learning_rate": 2.4169608328521966e-06,
"loss": 0.22085697650909425,
"step": 6130,
"token_acc": 0.9123194047928022
},
{
"epoch": 0.7859338970023059,
"grad_norm": 3.203125,
"learning_rate": 2.4031679592095014e-06,
"loss": 0.22805500030517578,
"step": 6135,
"token_acc": 0.911108238538435
},
{
"epoch": 0.7865744299256981,
"grad_norm": 3.046875,
"learning_rate": 2.3894091773568818e-06,
"loss": 0.22629399299621583,
"step": 6140,
"token_acc": 0.9122247597707588
},
{
"epoch": 0.7872149628490904,
"grad_norm": 3.6875,
"learning_rate": 2.3756845490386947e-06,
"loss": 0.22798571586608887,
"step": 6145,
"token_acc": 0.9104625171939478
},
{
"epoch": 0.7878554957724827,
"grad_norm": 4.15625,
"learning_rate": 2.3619941358460263e-06,
"loss": 0.23149216175079346,
"step": 6150,
"token_acc": 0.9089227327482361
},
{
"epoch": 0.788496028695875,
"grad_norm": 3.296875,
"learning_rate": 2.3483379992164245e-06,
"loss": 0.23463683128356932,
"step": 6155,
"token_acc": 0.9076479697178252
},
{
"epoch": 0.7891365616192673,
"grad_norm": 2.953125,
"learning_rate": 2.334716200433601e-06,
"loss": 0.2272404193878174,
"step": 6160,
"token_acc": 0.9092513668259503
},
{
"epoch": 0.7897770945426595,
"grad_norm": 3.546875,
"learning_rate": 2.3211288006271936e-06,
"loss": 0.22353928089141845,
"step": 6165,
"token_acc": 0.9137633666781649
},
{
"epoch": 0.7904176274660517,
"grad_norm": 2.921875,
"learning_rate": 2.3075758607724486e-06,
"loss": 0.22103281021118165,
"step": 6170,
"token_acc": 0.9112282824790389
},
{
"epoch": 0.791058160389444,
"grad_norm": 3.046875,
"learning_rate": 2.2940574416899895e-06,
"loss": 0.22877752780914307,
"step": 6175,
"token_acc": 0.9084078248477782
},
{
"epoch": 0.7916986933128363,
"grad_norm": 2.875,
"learning_rate": 2.280573604045504e-06,
"loss": 0.229004168510437,
"step": 6180,
"token_acc": 0.9116313220748931
},
{
"epoch": 0.7923392262362285,
"grad_norm": 2.984375,
"learning_rate": 2.2671244083495026e-06,
"loss": 0.22659940719604493,
"step": 6185,
"token_acc": 0.9117138908085695
},
{
"epoch": 0.7929797591596208,
"grad_norm": 3.203125,
"learning_rate": 2.253709914957032e-06,
"loss": 0.2304908275604248,
"step": 6190,
"token_acc": 0.9093925032313658
},
{
"epoch": 0.7936202920830131,
"grad_norm": 3.75,
"learning_rate": 2.2403301840674062e-06,
"loss": 0.23479413986206055,
"step": 6195,
"token_acc": 0.9068654915312675
},
{
"epoch": 0.7942608250064054,
"grad_norm": 3.9375,
"learning_rate": 2.2269852757239473e-06,
"loss": 0.22974464893341065,
"step": 6200,
"token_acc": 0.9106674125392659
},
{
"epoch": 0.7942608250064054,
"eval_loss": 0.33391064405441284,
"eval_runtime": 103.739,
"eval_samples_per_second": 96.396,
"eval_steps_per_second": 12.049,
"eval_token_acc": 0.8825237665479955,
"step": 6200
},
{
"epoch": 0.7949013579297975,
"grad_norm": 5.34375,
"learning_rate": 2.2136752498136924e-06,
"loss": 0.232399320602417,
"step": 6205,
"token_acc": 0.9098353590207741
},
{
"epoch": 0.7955418908531898,
"grad_norm": 7.78125,
"learning_rate": 2.200400166067147e-06,
"loss": 0.22328581809997558,
"step": 6210,
"token_acc": 0.9126268076840061
},
{
"epoch": 0.7961824237765821,
"grad_norm": 2.78125,
"learning_rate": 2.1871600840580087e-06,
"loss": 0.22782430648803711,
"step": 6215,
"token_acc": 0.9107918620155706
},
{
"epoch": 0.7968229566999744,
"grad_norm": 7.3125,
"learning_rate": 2.1739550632028995e-06,
"loss": 0.22463743686676024,
"step": 6220,
"token_acc": 0.9123086872170727
},
{
"epoch": 0.7974634896233667,
"grad_norm": 3.171875,
"learning_rate": 2.160785162761099e-06,
"loss": 0.22946014404296874,
"step": 6225,
"token_acc": 0.9104503339797457
},
{
"epoch": 0.7981040225467589,
"grad_norm": 3.234375,
"learning_rate": 2.1476504418342803e-06,
"loss": 0.22696642875671386,
"step": 6230,
"token_acc": 0.9110344827586206
},
{
"epoch": 0.7987445554701512,
"grad_norm": 3.359375,
"learning_rate": 2.1345509593662426e-06,
"loss": 0.2333219289779663,
"step": 6235,
"token_acc": 0.9078953042128411
},
{
"epoch": 0.7993850883935434,
"grad_norm": 4.6875,
"learning_rate": 2.1214867741426505e-06,
"loss": 0.2281118631362915,
"step": 6240,
"token_acc": 0.9095139607032058
},
{
"epoch": 0.8000256213169357,
"grad_norm": 11.375,
"learning_rate": 2.108457944790764e-06,
"loss": 0.22590672969818115,
"step": 6245,
"token_acc": 0.9117824773413897
},
{
"epoch": 0.8006661542403279,
"grad_norm": 2.90625,
"learning_rate": 2.095464529779182e-06,
"loss": 0.22068183422088622,
"step": 6250,
"token_acc": 0.9125511302475781
},
{
"epoch": 0.8013066871637202,
"grad_norm": 3.3125,
"learning_rate": 2.0825065874175744e-06,
"loss": 0.2325758457183838,
"step": 6255,
"token_acc": 0.9100626770842277
},
{
"epoch": 0.8019472200871125,
"grad_norm": 4.125,
"learning_rate": 2.069584175856424e-06,
"loss": 0.22739195823669434,
"step": 6260,
"token_acc": 0.9109864018994173
},
{
"epoch": 0.8025877530105048,
"grad_norm": 4.09375,
"learning_rate": 2.056697353086765e-06,
"loss": 0.22868261337280274,
"step": 6265,
"token_acc": 0.9094489893087477
},
{
"epoch": 0.8032282859338971,
"grad_norm": 3.546875,
"learning_rate": 2.0438461769399207e-06,
"loss": 0.23165996074676515,
"step": 6270,
"token_acc": 0.908895110919664
},
{
"epoch": 0.8038688188572892,
"grad_norm": 3.015625,
"learning_rate": 2.031030705087251e-06,
"loss": 0.2177964687347412,
"step": 6275,
"token_acc": 0.9145767686795874
},
{
"epoch": 0.8045093517806815,
"grad_norm": 3.328125,
"learning_rate": 2.0182509950398732e-06,
"loss": 0.2247143268585205,
"step": 6280,
"token_acc": 0.9119068162208801
},
{
"epoch": 0.8051498847040738,
"grad_norm": 10.4375,
"learning_rate": 2.005507104148441e-06,
"loss": 0.22496967315673827,
"step": 6285,
"token_acc": 0.9104271735850683
},
{
"epoch": 0.8057904176274661,
"grad_norm": 2.984375,
"learning_rate": 1.9927990896028416e-06,
"loss": 0.22278683185577391,
"step": 6290,
"token_acc": 0.9130923555863023
},
{
"epoch": 0.8064309505508583,
"grad_norm": 3.890625,
"learning_rate": 1.9801270084319847e-06,
"loss": 0.22296977043151855,
"step": 6295,
"token_acc": 0.9139506811519228
},
{
"epoch": 0.8070714834742506,
"grad_norm": 5.84375,
"learning_rate": 1.967490917503504e-06,
"loss": 0.2246922492980957,
"step": 6300,
"token_acc": 0.9113056226284926
},
{
"epoch": 0.8070714834742506,
"eval_loss": 0.33457258343696594,
"eval_runtime": 102.1762,
"eval_samples_per_second": 97.87,
"eval_steps_per_second": 12.234,
"eval_token_acc": 0.8825071562640149,
"step": 6300
},
{
"epoch": 0.8077120163976428,
"grad_norm": 2.59375,
"learning_rate": 1.954890873523535e-06,
"loss": 0.22967491149902344,
"step": 6305,
"token_acc": 0.9100146387668991
},
{
"epoch": 0.8083525493210351,
"grad_norm": 3.421875,
"learning_rate": 1.9423269330364446e-06,
"loss": 0.23272688388824464,
"step": 6310,
"token_acc": 0.9100631361937894
},
{
"epoch": 0.8089930822444273,
"grad_norm": 3.734375,
"learning_rate": 1.929799152424576e-06,
"loss": 0.22082395553588868,
"step": 6315,
"token_acc": 0.9147945323616921
},
{
"epoch": 0.8096336151678196,
"grad_norm": 2.96875,
"learning_rate": 1.917307587908013e-06,
"loss": 0.22631459236145018,
"step": 6320,
"token_acc": 0.910735097336729
},
{
"epoch": 0.8102741480912119,
"grad_norm": 3.578125,
"learning_rate": 1.9048522955442973e-06,
"loss": 0.22592225074768066,
"step": 6325,
"token_acc": 0.9107575233483224
},
{
"epoch": 0.8109146810146042,
"grad_norm": 2.875,
"learning_rate": 1.8924333312282072e-06,
"loss": 0.22494149208068848,
"step": 6330,
"token_acc": 0.9138549272043893
},
{
"epoch": 0.8115552139379965,
"grad_norm": 4.09375,
"learning_rate": 1.880050750691489e-06,
"loss": 0.23039345741271972,
"step": 6335,
"token_acc": 0.9105968858131488
},
{
"epoch": 0.8121957468613886,
"grad_norm": 3.109375,
"learning_rate": 1.867704609502613e-06,
"loss": 0.22507119178771973,
"step": 6340,
"token_acc": 0.9115109155233411
},
{
"epoch": 0.8128362797847809,
"grad_norm": 3.4375,
"learning_rate": 1.8553949630665246e-06,
"loss": 0.23071153163909913,
"step": 6345,
"token_acc": 0.9095776837378012
},
{
"epoch": 0.8134768127081732,
"grad_norm": 5.34375,
"learning_rate": 1.843121866624391e-06,
"loss": 0.22440800666809083,
"step": 6350,
"token_acc": 0.9121522693997072
},
{
"epoch": 0.8141173456315655,
"grad_norm": 3.796875,
"learning_rate": 1.8308853752533595e-06,
"loss": 0.22544093132019044,
"step": 6355,
"token_acc": 0.9106696543997242
},
{
"epoch": 0.8147578785549577,
"grad_norm": 2.34375,
"learning_rate": 1.8186855438663042e-06,
"loss": 0.2227323532104492,
"step": 6360,
"token_acc": 0.9120110525861325
},
{
"epoch": 0.81539841147835,
"grad_norm": 2.78125,
"learning_rate": 1.8065224272115866e-06,
"loss": 0.22800102233886718,
"step": 6365,
"token_acc": 0.9097010109701011
},
{
"epoch": 0.8160389444017423,
"grad_norm": 3.734375,
"learning_rate": 1.7943960798728056e-06,
"loss": 0.22401859760284423,
"step": 6370,
"token_acc": 0.9109385113268609
},
{
"epoch": 0.8166794773251345,
"grad_norm": 3.328125,
"learning_rate": 1.7823065562685437e-06,
"loss": 0.23256373405456543,
"step": 6375,
"token_acc": 0.9090204520990313
},
{
"epoch": 0.8173200102485267,
"grad_norm": 2.9375,
"learning_rate": 1.7702539106521467e-06,
"loss": 0.22349081039428711,
"step": 6380,
"token_acc": 0.9125831820931639
},
{
"epoch": 0.817960543171919,
"grad_norm": 3.265625,
"learning_rate": 1.7582381971114548e-06,
"loss": 0.23039307594299316,
"step": 6385,
"token_acc": 0.9086009915930158
},
{
"epoch": 0.8186010760953113,
"grad_norm": 3.703125,
"learning_rate": 1.7462594695685763e-06,
"loss": 0.22513654232025146,
"step": 6390,
"token_acc": 0.9117127975549911
},
{
"epoch": 0.8192416090187036,
"grad_norm": 3.8125,
"learning_rate": 1.7343177817796397e-06,
"loss": 0.2271491050720215,
"step": 6395,
"token_acc": 0.9126460569999569
},
{
"epoch": 0.8198821419420959,
"grad_norm": 5.0,
"learning_rate": 1.7224131873345417e-06,
"loss": 0.2326582908630371,
"step": 6400,
"token_acc": 0.9083699681061977
},
{
"epoch": 0.8198821419420959,
"eval_loss": 0.334193617105484,
"eval_runtime": 106.2327,
"eval_samples_per_second": 94.133,
"eval_steps_per_second": 11.767,
"eval_token_acc": 0.8823493585662003,
"step": 6400
},
{
"epoch": 0.8205226748654881,
"grad_norm": 3.75,
"learning_rate": 1.7105457396567383e-06,
"loss": 0.2375797748565674,
"step": 6405,
"token_acc": 0.9064124038998411
},
{
"epoch": 0.8211632077888803,
"grad_norm": 3.9375,
"learning_rate": 1.6987154920029625e-06,
"loss": 0.22246260643005372,
"step": 6410,
"token_acc": 0.9119291304721768
},
{
"epoch": 0.8218037407122726,
"grad_norm": 3.203125,
"learning_rate": 1.6869224974630283e-06,
"loss": 0.23738515377044678,
"step": 6415,
"token_acc": 0.9083624143724958
},
{
"epoch": 0.8224442736356649,
"grad_norm": 3.765625,
"learning_rate": 1.675166808959552e-06,
"loss": 0.23724117279052734,
"step": 6420,
"token_acc": 0.9078502673796791
},
{
"epoch": 0.8230848065590571,
"grad_norm": 3.203125,
"learning_rate": 1.6634484792477468e-06,
"loss": 0.23424534797668456,
"step": 6425,
"token_acc": 0.910155913515376
},
{
"epoch": 0.8237253394824494,
"grad_norm": 3.578125,
"learning_rate": 1.6517675609151683e-06,
"loss": 0.23035151958465577,
"step": 6430,
"token_acc": 0.9098056155507559
},
{
"epoch": 0.8243658724058417,
"grad_norm": 2.796875,
"learning_rate": 1.6401241063814854e-06,
"loss": 0.22955503463745117,
"step": 6435,
"token_acc": 0.9094276239286792
},
{
"epoch": 0.825006405329234,
"grad_norm": 3.125,
"learning_rate": 1.6285181678982432e-06,
"loss": 0.2227609395980835,
"step": 6440,
"token_acc": 0.9112541026083952
},
{
"epoch": 0.8256469382526261,
"grad_norm": 2.6875,
"learning_rate": 1.6169497975486282e-06,
"loss": 0.22880702018737792,
"step": 6445,
"token_acc": 0.9112530754953166
},
{
"epoch": 0.8262874711760184,
"grad_norm": 2.96875,
"learning_rate": 1.605419047247232e-06,
"loss": 0.2208636999130249,
"step": 6450,
"token_acc": 0.9133209711501142
},
{
"epoch": 0.8269280040994107,
"grad_norm": 2.875,
"learning_rate": 1.5939259687398279e-06,
"loss": 0.22008955478668213,
"step": 6455,
"token_acc": 0.9133350640359986
},
{
"epoch": 0.827568537022803,
"grad_norm": 2.90625,
"learning_rate": 1.5824706136031255e-06,
"loss": 0.22201809883117676,
"step": 6460,
"token_acc": 0.9131427094996124
},
{
"epoch": 0.8282090699461953,
"grad_norm": 7.78125,
"learning_rate": 1.5710530332445484e-06,
"loss": 0.22498104572296143,
"step": 6465,
"token_acc": 0.9109814094249892
},
{
"epoch": 0.8288496028695875,
"grad_norm": 5.0625,
"learning_rate": 1.559673278902002e-06,
"loss": 0.23075518608093262,
"step": 6470,
"token_acc": 0.909899408539481
},
{
"epoch": 0.8294901357929798,
"grad_norm": 2.734375,
"learning_rate": 1.5483314016436402e-06,
"loss": 0.23160152435302733,
"step": 6475,
"token_acc": 0.9085847468600284
},
{
"epoch": 0.830130668716372,
"grad_norm": 2.78125,
"learning_rate": 1.537027452367641e-06,
"loss": 0.2284604549407959,
"step": 6480,
"token_acc": 0.9090869865377977
},
{
"epoch": 0.8307712016397643,
"grad_norm": 2.84375,
"learning_rate": 1.5257614818019716e-06,
"loss": 0.22905595302581788,
"step": 6485,
"token_acc": 0.9103867022650934
},
{
"epoch": 0.8314117345631565,
"grad_norm": 4.53125,
"learning_rate": 1.5145335405041728e-06,
"loss": 0.23354558944702147,
"step": 6490,
"token_acc": 0.9073544698544699
},
{
"epoch": 0.8320522674865488,
"grad_norm": 4.25,
"learning_rate": 1.50334367886111e-06,
"loss": 0.2220928192138672,
"step": 6495,
"token_acc": 0.9137692440754195
},
{
"epoch": 0.8326928004099411,
"grad_norm": 2.671875,
"learning_rate": 1.4921919470887758e-06,
"loss": 0.22195751667022706,
"step": 6500,
"token_acc": 0.9108550636749545
},
{
"epoch": 0.8326928004099411,
"eval_loss": 0.3345060646533966,
"eval_runtime": 102.629,
"eval_samples_per_second": 97.438,
"eval_steps_per_second": 12.18,
"eval_token_acc": 0.8823133696175759,
"step": 6500
},
{
"epoch": 0.8333333333333334,
"grad_norm": 3.21875,
"learning_rate": 1.4810783952320417e-06,
"loss": 0.2198798656463623,
"step": 6505,
"token_acc": 0.91326310335895
},
{
"epoch": 0.8339738662567256,
"grad_norm": 2.96875,
"learning_rate": 1.4700030731644444e-06,
"loss": 0.22199637889862062,
"step": 6510,
"token_acc": 0.9108222490931076
},
{
"epoch": 0.8346143991801178,
"grad_norm": 3.59375,
"learning_rate": 1.4589660305879615e-06,
"loss": 0.22134122848510743,
"step": 6515,
"token_acc": 0.9125576981148354
},
{
"epoch": 0.8352549321035101,
"grad_norm": 3.203125,
"learning_rate": 1.4479673170327745e-06,
"loss": 0.22954387664794923,
"step": 6520,
"token_acc": 0.9111398405516052
},
{
"epoch": 0.8358954650269024,
"grad_norm": 4.3125,
"learning_rate": 1.4370069818570787e-06,
"loss": 0.22780919075012207,
"step": 6525,
"token_acc": 0.9098325276243094
},
{
"epoch": 0.8365359979502947,
"grad_norm": 3.453125,
"learning_rate": 1.4260850742468202e-06,
"loss": 0.22985472679138183,
"step": 6530,
"token_acc": 0.909024211298606
},
{
"epoch": 0.8371765308736869,
"grad_norm": 3.3125,
"learning_rate": 1.4152016432155158e-06,
"loss": 0.22617745399475098,
"step": 6535,
"token_acc": 0.9123404622283546
},
{
"epoch": 0.8378170637970792,
"grad_norm": 2.9375,
"learning_rate": 1.4043567376039956e-06,
"loss": 0.22737021446228028,
"step": 6540,
"token_acc": 0.910641053313188
},
{
"epoch": 0.8384575967204715,
"grad_norm": 3.515625,
"learning_rate": 1.393550406080213e-06,
"loss": 0.22855916023254394,
"step": 6545,
"token_acc": 0.9108817204301075
},
{
"epoch": 0.8390981296438637,
"grad_norm": 4.625,
"learning_rate": 1.3827826971390135e-06,
"loss": 0.21400003433227538,
"step": 6550,
"token_acc": 0.9176013805004314
},
{
"epoch": 0.8397386625672559,
"grad_norm": 5.09375,
"learning_rate": 1.372053659101915e-06,
"loss": 0.22439954280853272,
"step": 6555,
"token_acc": 0.9112418357195381
},
{
"epoch": 0.8403791954906482,
"grad_norm": 3.515625,
"learning_rate": 1.361363340116899e-06,
"loss": 0.22323524951934814,
"step": 6560,
"token_acc": 0.9125355634106388
},
{
"epoch": 0.8410197284140405,
"grad_norm": 3.0625,
"learning_rate": 1.3507117881581866e-06,
"loss": 0.2269625186920166,
"step": 6565,
"token_acc": 0.9102128574500108
},
{
"epoch": 0.8416602613374328,
"grad_norm": 3.125,
"learning_rate": 1.3400990510260282e-06,
"loss": 0.21720943450927735,
"step": 6570,
"token_acc": 0.9142647249470637
},
{
"epoch": 0.842300794260825,
"grad_norm": 3.015625,
"learning_rate": 1.3295251763464877e-06,
"loss": 0.22070887088775634,
"step": 6575,
"token_acc": 0.91288746703558
},
{
"epoch": 0.8429413271842172,
"grad_norm": 3.109375,
"learning_rate": 1.3189902115712294e-06,
"loss": 0.23354511260986327,
"step": 6580,
"token_acc": 0.9081190159288995
},
{
"epoch": 0.8435818601076095,
"grad_norm": 2.953125,
"learning_rate": 1.3084942039773018e-06,
"loss": 0.22521576881408692,
"step": 6585,
"token_acc": 0.9107534747622531
},
{
"epoch": 0.8442223930310018,
"grad_norm": 3.0625,
"learning_rate": 1.2980372006669296e-06,
"loss": 0.2297739267349243,
"step": 6590,
"token_acc": 0.9092908902691511
},
{
"epoch": 0.844862925954394,
"grad_norm": 4.15625,
"learning_rate": 1.287619248567301e-06,
"loss": 0.22501018047332763,
"step": 6595,
"token_acc": 0.9111034393475165
},
{
"epoch": 0.8455034588777863,
"grad_norm": 4.0625,
"learning_rate": 1.2772403944303556e-06,
"loss": 0.23542351722717286,
"step": 6600,
"token_acc": 0.9083812301621343
},
{
"epoch": 0.8455034588777863,
"eval_loss": 0.33506593108177185,
"eval_runtime": 103.0867,
"eval_samples_per_second": 97.006,
"eval_steps_per_second": 12.126,
"eval_token_acc": 0.882360432088854,
"step": 6600
},
{
"epoch": 0.8461439918011786,
"grad_norm": 2.53125,
"learning_rate": 1.266900684832576e-06,
"loss": 0.22258315086364747,
"step": 6605,
"token_acc": 0.9125866597769453
},
{
"epoch": 0.8467845247245709,
"grad_norm": 3.3125,
"learning_rate": 1.2566001661747807e-06,
"loss": 0.22833826541900634,
"step": 6610,
"token_acc": 0.9116735537190083
},
{
"epoch": 0.847425057647963,
"grad_norm": 3.265625,
"learning_rate": 1.2463388846819058e-06,
"loss": 0.23099522590637206,
"step": 6615,
"token_acc": 0.91005291005291
},
{
"epoch": 0.8480655905713553,
"grad_norm": 2.90625,
"learning_rate": 1.2361168864028183e-06,
"loss": 0.2343848466873169,
"step": 6620,
"token_acc": 0.9084549356223176
},
{
"epoch": 0.8487061234947476,
"grad_norm": 2.890625,
"learning_rate": 1.225934217210083e-06,
"loss": 0.22270684242248534,
"step": 6625,
"token_acc": 0.9118066047917116
},
{
"epoch": 0.8493466564181399,
"grad_norm": 4.0,
"learning_rate": 1.2157909227997822e-06,
"loss": 0.22519948482513427,
"step": 6630,
"token_acc": 0.9111332783970161
},
{
"epoch": 0.8499871893415322,
"grad_norm": 2.71875,
"learning_rate": 1.205687048691293e-06,
"loss": 0.2298964500427246,
"step": 6635,
"token_acc": 0.9109686303197212
},
{
"epoch": 0.8506277222649244,
"grad_norm": 6.0625,
"learning_rate": 1.1956226402270821e-06,
"loss": 0.22732067108154297,
"step": 6640,
"token_acc": 0.9125354411891056
},
{
"epoch": 0.8512682551883167,
"grad_norm": 3.953125,
"learning_rate": 1.1855977425725252e-06,
"loss": 0.23059117794036865,
"step": 6645,
"token_acc": 0.9109086197961651
},
{
"epoch": 0.8519087881117089,
"grad_norm": 3.40625,
"learning_rate": 1.1756124007156699e-06,
"loss": 0.23375325202941893,
"step": 6650,
"token_acc": 0.9093841389987958
},
{
"epoch": 0.8525493210351012,
"grad_norm": 3.609375,
"learning_rate": 1.1656666594670673e-06,
"loss": 0.22103147506713866,
"step": 6655,
"token_acc": 0.9112014180104622
},
{
"epoch": 0.8531898539584934,
"grad_norm": 37.75,
"learning_rate": 1.1557605634595437e-06,
"loss": 0.2286379814147949,
"step": 6660,
"token_acc": 0.9100968783638321
},
{
"epoch": 0.8538303868818857,
"grad_norm": 3.1875,
"learning_rate": 1.1458941571480198e-06,
"loss": 0.22343990802764893,
"step": 6665,
"token_acc": 0.911326860841424
},
{
"epoch": 0.854470919805278,
"grad_norm": 2.921875,
"learning_rate": 1.136067484809299e-06,
"loss": 0.22610747814178467,
"step": 6670,
"token_acc": 0.9108187134502924
},
{
"epoch": 0.8551114527286703,
"grad_norm": 3.1875,
"learning_rate": 1.126280590541876e-06,
"loss": 0.2264204740524292,
"step": 6675,
"token_acc": 0.9111684958037444
},
{
"epoch": 0.8557519856520626,
"grad_norm": 4.15625,
"learning_rate": 1.1165335182657365e-06,
"loss": 0.23050973415374756,
"step": 6680,
"token_acc": 0.9092639868460906
},
{
"epoch": 0.8563925185754547,
"grad_norm": 2.671875,
"learning_rate": 1.1068263117221568e-06,
"loss": 0.2229710578918457,
"step": 6685,
"token_acc": 0.9126929378287488
},
{
"epoch": 0.857033051498847,
"grad_norm": 4.09375,
"learning_rate": 1.0971590144735122e-06,
"loss": 0.22901148796081544,
"step": 6690,
"token_acc": 0.9086844368013758
},
{
"epoch": 0.8576735844222393,
"grad_norm": 3.09375,
"learning_rate": 1.0875316699030802e-06,
"loss": 0.22709619998931885,
"step": 6695,
"token_acc": 0.9104509880226574
},
{
"epoch": 0.8583141173456316,
"grad_norm": 3.171875,
"learning_rate": 1.0779443212148444e-06,
"loss": 0.2268310546875,
"step": 6700,
"token_acc": 0.9107427341227126
},
{
"epoch": 0.8583141173456316,
"eval_loss": 0.3350731432437897,
"eval_runtime": 104.1893,
"eval_samples_per_second": 95.979,
"eval_steps_per_second": 11.997,
"eval_token_acc": 0.8825652922579467,
"step": 6700
},
{
"epoch": 0.8589546502690238,
"grad_norm": 3.1875,
"learning_rate": 1.0683970114333032e-06,
"loss": 0.22931032180786132,
"step": 6705,
"token_acc": 0.9111389236545682
},
{
"epoch": 0.8595951831924161,
"grad_norm": 5.25,
"learning_rate": 1.0588897834032718e-06,
"loss": 0.2266333818435669,
"step": 6710,
"token_acc": 0.9122988654501532
},
{
"epoch": 0.8602357161158084,
"grad_norm": 4.28125,
"learning_rate": 1.0494226797896978e-06,
"loss": 0.22155840396881105,
"step": 6715,
"token_acc": 0.9117913343392973
},
{
"epoch": 0.8608762490392006,
"grad_norm": 2.859375,
"learning_rate": 1.0399957430774598e-06,
"loss": 0.23419654369354248,
"step": 6720,
"token_acc": 0.9080201906898485
},
{
"epoch": 0.8615167819625928,
"grad_norm": 3.875,
"learning_rate": 1.030609015571188e-06,
"loss": 0.23095030784606935,
"step": 6725,
"token_acc": 0.9089889579020014
},
{
"epoch": 0.8621573148859851,
"grad_norm": 3.265625,
"learning_rate": 1.021262539395066e-06,
"loss": 0.2203512191772461,
"step": 6730,
"token_acc": 0.9137580554474287
},
{
"epoch": 0.8627978478093774,
"grad_norm": 3.0,
"learning_rate": 1.0119563564926372e-06,
"loss": 0.22832462787628174,
"step": 6735,
"token_acc": 0.9118601531738133
},
{
"epoch": 0.8634383807327697,
"grad_norm": 4.90625,
"learning_rate": 1.0026905086266392e-06,
"loss": 0.22600264549255372,
"step": 6740,
"token_acc": 0.9125167076273013
},
{
"epoch": 0.864078913656162,
"grad_norm": 3.046875,
"learning_rate": 9.934650373787823e-07,
"loss": 0.22522459030151368,
"step": 6745,
"token_acc": 0.9116609294320138
},
{
"epoch": 0.8647194465795542,
"grad_norm": 3.421875,
"learning_rate": 9.842799841495986e-07,
"loss": 0.22795772552490234,
"step": 6750,
"token_acc": 0.9116564948275115
},
{
"epoch": 0.8653599795029464,
"grad_norm": 4.75,
"learning_rate": 9.751353901582294e-07,
"loss": 0.22496397495269777,
"step": 6755,
"token_acc": 0.9126722718210973
},
{
"epoch": 0.8660005124263387,
"grad_norm": 2.765625,
"learning_rate": 9.660312964422469e-07,
"loss": 0.2258981943130493,
"step": 6760,
"token_acc": 0.9107835531419706
},
{
"epoch": 0.866641045349731,
"grad_norm": 3.734375,
"learning_rate": 9.569677438574842e-07,
"loss": 0.22349743843078612,
"step": 6765,
"token_acc": 0.912551306977749
},
{
"epoch": 0.8672815782731232,
"grad_norm": 5.46875,
"learning_rate": 9.479447730778268e-07,
"loss": 0.22322914600372315,
"step": 6770,
"token_acc": 0.911628910463862
},
{
"epoch": 0.8679221111965155,
"grad_norm": 2.984375,
"learning_rate": 9.389624245950601e-07,
"loss": 0.217413330078125,
"step": 6775,
"token_acc": 0.9126826316244488
},
{
"epoch": 0.8685626441199078,
"grad_norm": 3.15625,
"learning_rate": 9.300207387186555e-07,
"loss": 0.237738037109375,
"step": 6780,
"token_acc": 0.9056814760655456
},
{
"epoch": 0.8692031770433001,
"grad_norm": 3.96875,
"learning_rate": 9.211197555756157e-07,
"loss": 0.22690942287445068,
"step": 6785,
"token_acc": 0.9100542775911088
},
{
"epoch": 0.8698437099666922,
"grad_norm": 4.40625,
"learning_rate": 9.122595151102809e-07,
"loss": 0.23275787830352784,
"step": 6790,
"token_acc": 0.9072804862278546
},
{
"epoch": 0.8704842428900845,
"grad_norm": 4.65625,
"learning_rate": 9.034400570841551e-07,
"loss": 0.22703733444213867,
"step": 6795,
"token_acc": 0.9107196692364012
},
{
"epoch": 0.8711247758134768,
"grad_norm": 3.0,
"learning_rate": 8.946614210757221e-07,
"loss": 0.22760224342346191,
"step": 6800,
"token_acc": 0.9104903571737438
},
{
"epoch": 0.8711247758134768,
"eval_loss": 0.33460694551467896,
"eval_runtime": 104.7122,
"eval_samples_per_second": 95.5,
"eval_steps_per_second": 11.937,
"eval_token_acc": 0.8824462518894198,
"step": 6800
},
{
"epoch": 0.8717653087368691,
"grad_norm": 3.1875,
"learning_rate": 8.859236464802756e-07,
"loss": 0.22689156532287597,
"step": 6805,
"token_acc": 0.9119266844505637
},
{
"epoch": 0.8724058416602614,
"grad_norm": 2.84375,
"learning_rate": 8.772267725097361e-07,
"loss": 0.23056597709655763,
"step": 6810,
"token_acc": 0.9115866839602248
},
{
"epoch": 0.8730463745836536,
"grad_norm": 3.375,
"learning_rate": 8.685708381924784e-07,
"loss": 0.23043975830078126,
"step": 6815,
"token_acc": 0.9112482202183199
},
{
"epoch": 0.8736869075070458,
"grad_norm": 2.828125,
"learning_rate": 8.599558823731524e-07,
"loss": 0.22515459060668946,
"step": 6820,
"token_acc": 0.9119896305897602
},
{
"epoch": 0.8743274404304381,
"grad_norm": 4.8125,
"learning_rate": 8.513819437125148e-07,
"loss": 0.2265780448913574,
"step": 6825,
"token_acc": 0.9123948304276173
},
{
"epoch": 0.8749679733538304,
"grad_norm": 3.21875,
"learning_rate": 8.428490606872519e-07,
"loss": 0.22168455123901368,
"step": 6830,
"token_acc": 0.9113792656026989
},
{
"epoch": 0.8756085062772226,
"grad_norm": 2.90625,
"learning_rate": 8.343572715898041e-07,
"loss": 0.2171454668045044,
"step": 6835,
"token_acc": 0.914162535029101
},
{
"epoch": 0.8762490392006149,
"grad_norm": 3.109375,
"learning_rate": 8.259066145282024e-07,
"loss": 0.21893837451934814,
"step": 6840,
"token_acc": 0.9133895738697815
},
{
"epoch": 0.8768895721240072,
"grad_norm": 3.953125,
"learning_rate": 8.17497127425888e-07,
"loss": 0.22579605579376222,
"step": 6845,
"token_acc": 0.9109958954417801
},
{
"epoch": 0.8775301050473995,
"grad_norm": 2.90625,
"learning_rate": 8.091288480215509e-07,
"loss": 0.2259922981262207,
"step": 6850,
"token_acc": 0.9134802754081324
},
{
"epoch": 0.8781706379707916,
"grad_norm": 2.921875,
"learning_rate": 8.008018138689477e-07,
"loss": 0.23148341178894044,
"step": 6855,
"token_acc": 0.9080638206123329
},
{
"epoch": 0.8788111708941839,
"grad_norm": 2.75,
"learning_rate": 7.925160623367534e-07,
"loss": 0.22035045623779298,
"step": 6860,
"token_acc": 0.9124632924512005
},
{
"epoch": 0.8794517038175762,
"grad_norm": 3.125,
"learning_rate": 7.842716306083709e-07,
"loss": 0.22205777168273927,
"step": 6865,
"token_acc": 0.9132595729968018
},
{
"epoch": 0.8800922367409685,
"grad_norm": 2.640625,
"learning_rate": 7.760685556817837e-07,
"loss": 0.22817633152008057,
"step": 6870,
"token_acc": 0.908126751455055
},
{
"epoch": 0.8807327696643608,
"grad_norm": 5.34375,
"learning_rate": 7.679068743693741e-07,
"loss": 0.2194456100463867,
"step": 6875,
"token_acc": 0.914544352044352
},
{
"epoch": 0.881373302587753,
"grad_norm": 3.546875,
"learning_rate": 7.59786623297768e-07,
"loss": 0.22601814270019532,
"step": 6880,
"token_acc": 0.911353032659409
},
{
"epoch": 0.8820138355111453,
"grad_norm": 3.0,
"learning_rate": 7.517078389076715e-07,
"loss": 0.23260602951049805,
"step": 6885,
"token_acc": 0.9088834345261163
},
{
"epoch": 0.8826543684345375,
"grad_norm": 2.765625,
"learning_rate": 7.43670557453694e-07,
"loss": 0.22155818939208985,
"step": 6890,
"token_acc": 0.9120893334483056
},
{
"epoch": 0.8832949013579298,
"grad_norm": 56.0,
"learning_rate": 7.35674815004207e-07,
"loss": 0.23186612129211426,
"step": 6895,
"token_acc": 0.9093760742523204
},
{
"epoch": 0.883935434281322,
"grad_norm": 3.671875,
"learning_rate": 7.277206474411591e-07,
"loss": 0.22928218841552733,
"step": 6900,
"token_acc": 0.9102180604326527
},
{
"epoch": 0.883935434281322,
"eval_loss": 0.3345526456832886,
"eval_runtime": 103.6333,
"eval_samples_per_second": 96.494,
"eval_steps_per_second": 12.062,
"eval_token_acc": 0.8824102629407954,
"step": 6900
},
{
"epoch": 0.8845759672047143,
"grad_norm": 3.9375,
"learning_rate": 7.198080904599314e-07,
"loss": 0.22185420989990234,
"step": 6905,
"token_acc": 0.9122216468151217
},
{
"epoch": 0.8852165001281066,
"grad_norm": 3.5,
"learning_rate": 7.119371795691732e-07,
"loss": 0.22938218116760253,
"step": 6910,
"token_acc": 0.9106750053914169
},
{
"epoch": 0.8858570330514989,
"grad_norm": 3.078125,
"learning_rate": 7.041079500906389e-07,
"loss": 0.22525992393493652,
"step": 6915,
"token_acc": 0.9117697816895332
},
{
"epoch": 0.8864975659748912,
"grad_norm": 3.015625,
"learning_rate": 6.963204371590327e-07,
"loss": 0.22642955780029297,
"step": 6920,
"token_acc": 0.9109674639086404
},
{
"epoch": 0.8871380988982833,
"grad_norm": 10.5,
"learning_rate": 6.885746757218504e-07,
"loss": 0.2312746524810791,
"step": 6925,
"token_acc": 0.9084555651423641
},
{
"epoch": 0.8877786318216756,
"grad_norm": 4.15625,
"learning_rate": 6.808707005392234e-07,
"loss": 0.22308661937713622,
"step": 6930,
"token_acc": 0.9129004329004329
},
{
"epoch": 0.8884191647450679,
"grad_norm": 4.21875,
"learning_rate": 6.73208546183759e-07,
"loss": 0.23537328243255615,
"step": 6935,
"token_acc": 0.9080825451418745
},
{
"epoch": 0.8890596976684602,
"grad_norm": 2.828125,
"learning_rate": 6.655882470403918e-07,
"loss": 0.22550048828125,
"step": 6940,
"token_acc": 0.9109734436598362
},
{
"epoch": 0.8897002305918524,
"grad_norm": 2.734375,
"learning_rate": 6.580098373062227e-07,
"loss": 0.21899161338806153,
"step": 6945,
"token_acc": 0.9117697816895332
},
{
"epoch": 0.8903407635152447,
"grad_norm": 2.453125,
"learning_rate": 6.504733509903693e-07,
"loss": 0.22932813167572022,
"step": 6950,
"token_acc": 0.9097964815453604
},
{
"epoch": 0.890981296438637,
"grad_norm": 4.28125,
"learning_rate": 6.429788219138111e-07,
"loss": 0.22290611267089844,
"step": 6955,
"token_acc": 0.9123820195664354
},
{
"epoch": 0.8916218293620292,
"grad_norm": 4.53125,
"learning_rate": 6.355262837092424e-07,
"loss": 0.2280646324157715,
"step": 6960,
"token_acc": 0.909892094063024
},
{
"epoch": 0.8922623622854214,
"grad_norm": 3.609375,
"learning_rate": 6.281157698209139e-07,
"loss": 0.23290627002716063,
"step": 6965,
"token_acc": 0.9101938603687233
},
{
"epoch": 0.8929028952088137,
"grad_norm": 4.15625,
"learning_rate": 6.207473135044905e-07,
"loss": 0.22637267112731935,
"step": 6970,
"token_acc": 0.9106436069523318
},
{
"epoch": 0.893543428132206,
"grad_norm": 2.84375,
"learning_rate": 6.134209478268904e-07,
"loss": 0.22555007934570312,
"step": 6975,
"token_acc": 0.9121092067866678
},
{
"epoch": 0.8941839610555983,
"grad_norm": 3.59375,
"learning_rate": 6.061367056661582e-07,
"loss": 0.2194199800491333,
"step": 6980,
"token_acc": 0.914880720439884
},
{
"epoch": 0.8948244939789906,
"grad_norm": 3.34375,
"learning_rate": 5.988946197112866e-07,
"loss": 0.22179160118103028,
"step": 6985,
"token_acc": 0.9131431041936878
},
{
"epoch": 0.8954650269023828,
"grad_norm": 2.546875,
"learning_rate": 5.916947224621039e-07,
"loss": 0.2265388011932373,
"step": 6990,
"token_acc": 0.9122412824612194
},
{
"epoch": 0.896105559825775,
"grad_norm": 3.109375,
"learning_rate": 5.845370462290978e-07,
"loss": 0.22730591297149658,
"step": 6995,
"token_acc": 0.909079168281028
},
{
"epoch": 0.8967460927491673,
"grad_norm": 3.15625,
"learning_rate": 5.774216231332875e-07,
"loss": 0.22771029472351073,
"step": 7000,
"token_acc": 0.9113219754151392
},
{
"epoch": 0.8967460927491673,
"eval_loss": 0.33467555046081543,
"eval_runtime": 103.9504,
"eval_samples_per_second": 96.2,
"eval_steps_per_second": 12.025,
"eval_token_acc": 0.8823936526568149,
"step": 7000
},
{
"epoch": 0.8973866256725596,
"grad_norm": 3.375,
"learning_rate": 5.703484851060825e-07,
"loss": 0.23281164169311525,
"step": 7005,
"token_acc": 0.9091221882929766
},
{
"epoch": 0.8980271585959518,
"grad_norm": 3.078125,
"learning_rate": 5.633176638891191e-07,
"loss": 0.2271268367767334,
"step": 7010,
"token_acc": 0.9112791702679343
},
{
"epoch": 0.8986676915193441,
"grad_norm": 4.0625,
"learning_rate": 5.563291910341462e-07,
"loss": 0.22890748977661132,
"step": 7015,
"token_acc": 0.9119872731963196
},
{
"epoch": 0.8993082244427364,
"grad_norm": 3.109375,
"learning_rate": 5.493830979028569e-07,
"loss": 0.22680349349975587,
"step": 7020,
"token_acc": 0.913206895061995
},
{
"epoch": 0.8999487573661287,
"grad_norm": 23.25,
"learning_rate": 5.424794156667645e-07,
"loss": 0.22985119819641114,
"step": 7025,
"token_acc": 0.9109837054918527
},
{
"epoch": 0.9005892902895208,
"grad_norm": 4.3125,
"learning_rate": 5.356181753070588e-07,
"loss": 0.22275919914245607,
"step": 7030,
"token_acc": 0.9113749190589251
},
{
"epoch": 0.9012298232129131,
"grad_norm": 3.203125,
"learning_rate": 5.287994076144643e-07,
"loss": 0.22965426445007325,
"step": 7035,
"token_acc": 0.9091612903225806
},
{
"epoch": 0.9018703561363054,
"grad_norm": 5.8125,
"learning_rate": 5.220231431891032e-07,
"loss": 0.2193136692047119,
"step": 7040,
"token_acc": 0.9128340853870184
},
{
"epoch": 0.9025108890596977,
"grad_norm": 19.625,
"learning_rate": 5.152894124403618e-07,
"loss": 0.2251948356628418,
"step": 7045,
"token_acc": 0.9117292456079917
},
{
"epoch": 0.90315142198309,
"grad_norm": 3.84375,
"learning_rate": 5.085982455867477e-07,
"loss": 0.22256324291229249,
"step": 7050,
"token_acc": 0.9116405307599518
},
{
"epoch": 0.9037919549064822,
"grad_norm": 4.1875,
"learning_rate": 5.019496726557571e-07,
"loss": 0.23459949493408203,
"step": 7055,
"token_acc": 0.9083365578915689
},
{
"epoch": 0.9044324878298745,
"grad_norm": 5.3125,
"learning_rate": 4.953437234837444e-07,
"loss": 0.22082552909851075,
"step": 7060,
"token_acc": 0.9143460643158893
},
{
"epoch": 0.9050730207532667,
"grad_norm": 2.859375,
"learning_rate": 4.887804277157803e-07,
"loss": 0.228281831741333,
"step": 7065,
"token_acc": 0.9109963417258446
},
{
"epoch": 0.905713553676659,
"grad_norm": 2.671875,
"learning_rate": 4.822598148055235e-07,
"loss": 0.2322796106338501,
"step": 7070,
"token_acc": 0.9108313211452225
},
{
"epoch": 0.9063540866000512,
"grad_norm": 3.078125,
"learning_rate": 4.757819140150888e-07,
"loss": 0.23224186897277832,
"step": 7075,
"token_acc": 0.9088714544357273
},
{
"epoch": 0.9069946195234435,
"grad_norm": 2.484375,
"learning_rate": 4.693467544149133e-07,
"loss": 0.21920361518859863,
"step": 7080,
"token_acc": 0.9131602894657018
},
{
"epoch": 0.9076351524468358,
"grad_norm": 3.484375,
"learning_rate": 4.629543648836288e-07,
"loss": 0.21608197689056396,
"step": 7085,
"token_acc": 0.9152600757836721
},
{
"epoch": 0.9082756853702281,
"grad_norm": 3.203125,
"learning_rate": 4.566047741079316e-07,
"loss": 0.2328326940536499,
"step": 7090,
"token_acc": 0.9090478037846459
},
{
"epoch": 0.9089162182936202,
"grad_norm": 3.515625,
"learning_rate": 4.5029801058244726e-07,
"loss": 0.23201301097869872,
"step": 7095,
"token_acc": 0.9082667817828621
},
{
"epoch": 0.9095567512170125,
"grad_norm": 3.234375,
"learning_rate": 4.4403410260961733e-07,
"loss": 0.22749040126800538,
"step": 7100,
"token_acc": 0.9113321799307958
},
{
"epoch": 0.9095567512170125,
"eval_loss": 0.3351185917854309,
"eval_runtime": 103.2381,
"eval_samples_per_second": 96.863,
"eval_steps_per_second": 12.108,
"eval_token_acc": 0.8822164762943564,
"step": 7100
},
{
"epoch": 0.9101972841404048,
"grad_norm": 4.125,
"learning_rate": 4.3781307829955375e-07,
"loss": 0.22915854454040527,
"step": 7105,
"token_acc": 0.9114854122803249
},
{
"epoch": 0.9108378170637971,
"grad_norm": 2.9375,
"learning_rate": 4.3163496556993143e-07,
"loss": 0.22949614524841308,
"step": 7110,
"token_acc": 0.9098265398355787
},
{
"epoch": 0.9114783499871894,
"grad_norm": 4.65625,
"learning_rate": 4.2549979214584703e-07,
"loss": 0.234299373626709,
"step": 7115,
"token_acc": 0.9077444835579978
},
{
"epoch": 0.9121188829105816,
"grad_norm": 2.78125,
"learning_rate": 4.194075855597046e-07,
"loss": 0.21983301639556885,
"step": 7120,
"token_acc": 0.9148798481384012
},
{
"epoch": 0.9127594158339739,
"grad_norm": 2.59375,
"learning_rate": 4.133583731510893e-07,
"loss": 0.23418021202087402,
"step": 7125,
"token_acc": 0.9072267311345191
},
{
"epoch": 0.9133999487573661,
"grad_norm": 2.75,
"learning_rate": 4.073521820666393e-07,
"loss": 0.22026586532592773,
"step": 7130,
"token_acc": 0.9122671141517147
},
{
"epoch": 0.9140404816807584,
"grad_norm": 4.09375,
"learning_rate": 4.0138903925993957e-07,
"loss": 0.22925994396209717,
"step": 7135,
"token_acc": 0.910432351043235
},
{
"epoch": 0.9146810146041506,
"grad_norm": 3.140625,
"learning_rate": 3.954689714913762e-07,
"loss": 0.22760000228881835,
"step": 7140,
"token_acc": 0.911580763424628
},
{
"epoch": 0.9153215475275429,
"grad_norm": 3.9375,
"learning_rate": 3.895920053280422e-07,
"loss": 0.22435307502746582,
"step": 7145,
"token_acc": 0.9124141209004882
},
{
"epoch": 0.9159620804509352,
"grad_norm": 2.9375,
"learning_rate": 3.837581671435997e-07,
"loss": 0.2232006549835205,
"step": 7150,
"token_acc": 0.911838464199239
},
{
"epoch": 0.9166026133743275,
"grad_norm": 3.671875,
"learning_rate": 3.779674831181701e-07,
"loss": 0.2235502243041992,
"step": 7155,
"token_acc": 0.9120366369999136
},
{
"epoch": 0.9172431462977197,
"grad_norm": 2.71875,
"learning_rate": 3.722199792382164e-07,
"loss": 0.22374234199523926,
"step": 7160,
"token_acc": 0.9131279129984464
},
{
"epoch": 0.9178836792211119,
"grad_norm": 3.828125,
"learning_rate": 3.665156812964221e-07,
"loss": 0.22843289375305176,
"step": 7165,
"token_acc": 0.9109647990360616
},
{
"epoch": 0.9185242121445042,
"grad_norm": 2.609375,
"learning_rate": 3.608546148915804e-07,
"loss": 0.22373640537261963,
"step": 7170,
"token_acc": 0.9105750592289468
},
{
"epoch": 0.9191647450678965,
"grad_norm": 3.453125,
"learning_rate": 3.552368054284772e-07,
"loss": 0.21737513542175294,
"step": 7175,
"token_acc": 0.9161251191404558
},
{
"epoch": 0.9198052779912887,
"grad_norm": 3.515625,
"learning_rate": 3.496622781177761e-07,
"loss": 0.22703731060028076,
"step": 7180,
"token_acc": 0.9097332931190486
},
{
"epoch": 0.920445810914681,
"grad_norm": 3.984375,
"learning_rate": 3.441310579759072e-07,
"loss": 0.22722623348236085,
"step": 7185,
"token_acc": 0.9093100331425128
},
{
"epoch": 0.9210863438380733,
"grad_norm": 3.234375,
"learning_rate": 3.386431698249526e-07,
"loss": 0.2288762092590332,
"step": 7190,
"token_acc": 0.9103326439158911
},
{
"epoch": 0.9217268767614656,
"grad_norm": 3.484375,
"learning_rate": 3.3319863829253895e-07,
"loss": 0.22250890731811523,
"step": 7195,
"token_acc": 0.9124709527498064
},
{
"epoch": 0.9223674096848578,
"grad_norm": 3.390625,
"learning_rate": 3.277974878117207e-07,
"loss": 0.22609634399414064,
"step": 7200,
"token_acc": 0.9101618122977346
},
{
"epoch": 0.9223674096848578,
"eval_loss": 0.3345736861228943,
"eval_runtime": 102.7358,
"eval_samples_per_second": 97.337,
"eval_steps_per_second": 12.167,
"eval_token_acc": 0.8823410534242101,
"step": 7200
},
{
"epoch": 0.92300794260825,
"grad_norm": 3.390625,
"learning_rate": 3.2243974262087805e-07,
"loss": 0.2214569091796875,
"step": 7205,
"token_acc": 0.9122594594594594
},
{
"epoch": 0.9236484755316423,
"grad_norm": 3.125,
"learning_rate": 3.171254267636015e-07,
"loss": 0.23588757514953612,
"step": 7210,
"token_acc": 0.9061101549053356
},
{
"epoch": 0.9242890084550346,
"grad_norm": 5.15625,
"learning_rate": 3.1185456408858505e-07,
"loss": 0.22405190467834474,
"step": 7215,
"token_acc": 0.9125026992010364
},
{
"epoch": 0.9249295413784269,
"grad_norm": 2.921875,
"learning_rate": 3.0662717824952894e-07,
"loss": 0.22633728981018067,
"step": 7220,
"token_acc": 0.9114834596829773
},
{
"epoch": 0.9255700743018191,
"grad_norm": 2.890625,
"learning_rate": 3.014432927050126e-07,
"loss": 0.22840723991394044,
"step": 7225,
"token_acc": 0.910606582801999
},
{
"epoch": 0.9262106072252114,
"grad_norm": 4.09375,
"learning_rate": 2.9630293071841397e-07,
"loss": 0.22615447044372558,
"step": 7230,
"token_acc": 0.9125701943844492
},
{
"epoch": 0.9268511401486036,
"grad_norm": 8.75,
"learning_rate": 2.912061153577872e-07,
"loss": 0.22545180320739747,
"step": 7235,
"token_acc": 0.9107838891294933
},
{
"epoch": 0.9274916730719959,
"grad_norm": 3.375,
"learning_rate": 2.861528694957649e-07,
"loss": 0.22807738780975342,
"step": 7240,
"token_acc": 0.9106092073381793
},
{
"epoch": 0.9281322059953881,
"grad_norm": 3.140625,
"learning_rate": 2.8114321580945846e-07,
"loss": 0.23368797302246094,
"step": 7245,
"token_acc": 0.9072138340431023
},
{
"epoch": 0.9287727389187804,
"grad_norm": 3.359375,
"learning_rate": 2.761771767803512e-07,
"loss": 0.2348182201385498,
"step": 7250,
"token_acc": 0.9079960428405522
},
{
"epoch": 0.9294132718421727,
"grad_norm": 11.25,
"learning_rate": 2.71254774694204e-07,
"loss": 0.22567691802978515,
"step": 7255,
"token_acc": 0.9117279965569185
},
{
"epoch": 0.930053804765565,
"grad_norm": 2.546875,
"learning_rate": 2.6637603164094584e-07,
"loss": 0.2227564811706543,
"step": 7260,
"token_acc": 0.9113984055160526
},
{
"epoch": 0.9306943376889573,
"grad_norm": 12.5625,
"learning_rate": 2.615409695145832e-07,
"loss": 0.22351694107055664,
"step": 7265,
"token_acc": 0.9124437910757524
},
{
"epoch": 0.9313348706123494,
"grad_norm": 3.125,
"learning_rate": 2.567496100130973e-07,
"loss": 0.22547354698181152,
"step": 7270,
"token_acc": 0.9113066735688711
},
{
"epoch": 0.9319754035357417,
"grad_norm": 3.453125,
"learning_rate": 2.5200197463834843e-07,
"loss": 0.23171014785766603,
"step": 7275,
"token_acc": 0.9080668134144763
},
{
"epoch": 0.932615936459134,
"grad_norm": 9.375,
"learning_rate": 2.472980846959794e-07,
"loss": 0.22420947551727294,
"step": 7280,
"token_acc": 0.9112663303582977
},
{
"epoch": 0.9332564693825263,
"grad_norm": 9.25,
"learning_rate": 2.4263796129532e-07,
"loss": 0.22904155254364014,
"step": 7285,
"token_acc": 0.9106152457113376
},
{
"epoch": 0.9338970023059185,
"grad_norm": 4.625,
"learning_rate": 2.3802162534929063e-07,
"loss": 0.22856383323669432,
"step": 7290,
"token_acc": 0.9091379087501615
},
{
"epoch": 0.9345375352293108,
"grad_norm": 3.203125,
"learning_rate": 2.33449097574312e-07,
"loss": 0.23378937244415282,
"step": 7295,
"token_acc": 0.90822689545435
},
{
"epoch": 0.9351780681527031,
"grad_norm": 2.71875,
"learning_rate": 2.2892039849020552e-07,
"loss": 0.22789459228515624,
"step": 7300,
"token_acc": 0.9115628641719539
},
{
"epoch": 0.9351780681527031,
"eval_loss": 0.3358408808708191,
"eval_runtime": 109.8432,
"eval_samples_per_second": 91.039,
"eval_steps_per_second": 11.38,
"eval_token_acc": 0.882407494560132,
"step": 7300
},
{
"epoch": 0.9358186010760953,
"grad_norm": 4.15625,
"learning_rate": 2.2443554842011107e-07,
"loss": 0.22101092338562012,
"step": 7305,
"token_acc": 0.9120034542314335
},
{
"epoch": 0.9364591339994875,
"grad_norm": 2.796875,
"learning_rate": 2.199945674903836e-07,
"loss": 0.22407989501953124,
"step": 7310,
"token_acc": 0.9132890651948948
},
{
"epoch": 0.9370996669228798,
"grad_norm": 3.25,
"learning_rate": 2.155974756305157e-07,
"loss": 0.22648565769195556,
"step": 7315,
"token_acc": 0.9100392258286996
},
{
"epoch": 0.9377401998462721,
"grad_norm": 3.484375,
"learning_rate": 2.112442925730407e-07,
"loss": 0.2312589168548584,
"step": 7320,
"token_acc": 0.9074082027056045
},
{
"epoch": 0.9383807327696644,
"grad_norm": 3.171875,
"learning_rate": 2.0693503785344294e-07,
"loss": 0.2254408359527588,
"step": 7325,
"token_acc": 0.9107181367263317
},
{
"epoch": 0.9390212656930567,
"grad_norm": 4.40625,
"learning_rate": 2.0266973081007335e-07,
"loss": 0.22427871227264404,
"step": 7330,
"token_acc": 0.9129628831314394
},
{
"epoch": 0.9396617986164489,
"grad_norm": 2.53125,
"learning_rate": 1.9844839058406174e-07,
"loss": 0.23152542114257812,
"step": 7335,
"token_acc": 0.9090241796200346
},
{
"epoch": 0.9403023315398411,
"grad_norm": 3.28125,
"learning_rate": 1.9427103611923458e-07,
"loss": 0.23547761440277098,
"step": 7340,
"token_acc": 0.9070280082987552
},
{
"epoch": 0.9409428644632334,
"grad_norm": 3.71875,
"learning_rate": 1.9013768616201856e-07,
"loss": 0.23559412956237794,
"step": 7345,
"token_acc": 0.9068175949040199
},
{
"epoch": 0.9415833973866257,
"grad_norm": 6.5,
"learning_rate": 1.860483592613749e-07,
"loss": 0.23159332275390626,
"step": 7350,
"token_acc": 0.9084540336098337
},
{
"epoch": 0.9422239303100179,
"grad_norm": 3.40625,
"learning_rate": 1.8200307376869396e-07,
"loss": 0.228605318069458,
"step": 7355,
"token_acc": 0.9085082587749483
},
{
"epoch": 0.9428644632334102,
"grad_norm": 3.234375,
"learning_rate": 1.7800184783773433e-07,
"loss": 0.22635889053344727,
"step": 7360,
"token_acc": 0.9109769247358206
},
{
"epoch": 0.9435049961568025,
"grad_norm": 3.484375,
"learning_rate": 1.7404469942452597e-07,
"loss": 0.21885204315185547,
"step": 7365,
"token_acc": 0.9139455635595048
},
{
"epoch": 0.9441455290801947,
"grad_norm": 3.234375,
"learning_rate": 1.7013164628729483e-07,
"loss": 0.22530250549316405,
"step": 7370,
"token_acc": 0.9120013769955678
},
{
"epoch": 0.944786062003587,
"grad_norm": 3.421875,
"learning_rate": 1.6626270598638972e-07,
"loss": 0.23129520416259766,
"step": 7375,
"token_acc": 0.9073307460112118
},
{
"epoch": 0.9454265949269792,
"grad_norm": 3.359375,
"learning_rate": 1.624378958841888e-07,
"loss": 0.23236556053161622,
"step": 7380,
"token_acc": 0.9091888166113815
},
{
"epoch": 0.9460671278503715,
"grad_norm": 4.34375,
"learning_rate": 1.5865723314503535e-07,
"loss": 0.2195420265197754,
"step": 7385,
"token_acc": 0.9148448976064979
},
{
"epoch": 0.9467076607737638,
"grad_norm": 3.5625,
"learning_rate": 1.5492073473515334e-07,
"loss": 0.2299337387084961,
"step": 7390,
"token_acc": 0.9092899459088177
},
{
"epoch": 0.9473481936971561,
"grad_norm": 3.296875,
"learning_rate": 1.5122841742257533e-07,
"loss": 0.2305469512939453,
"step": 7395,
"token_acc": 0.9094121703154628
},
{
"epoch": 0.9479887266205483,
"grad_norm": 3.0,
"learning_rate": 1.475802977770646e-07,
"loss": 0.2353046417236328,
"step": 7400,
"token_acc": 0.9065898637321068
},
{
"epoch": 0.9479887266205483,
"eval_loss": 0.3345721662044525,
"eval_runtime": 102.9085,
"eval_samples_per_second": 97.174,
"eval_steps_per_second": 12.147,
"eval_token_acc": 0.8821223513518003,
"step": 7400
},
{
"epoch": 0.9486292595439405,
"grad_norm": 3.3125,
"learning_rate": 1.43976392170041e-07,
"loss": 0.22667450904846193,
"step": 7405,
"token_acc": 0.9105691056910569
},
{
"epoch": 0.9492697924673328,
"grad_norm": 3.65625,
"learning_rate": 1.404167167745074e-07,
"loss": 0.23315582275390626,
"step": 7410,
"token_acc": 0.9090674355553643
},
{
"epoch": 0.9499103253907251,
"grad_norm": 4.09375,
"learning_rate": 1.3690128756498e-07,
"loss": 0.2316906452178955,
"step": 7415,
"token_acc": 0.9079900017238407
},
{
"epoch": 0.9505508583141173,
"grad_norm": 3.5,
"learning_rate": 1.3343012031741155e-07,
"loss": 0.22472708225250243,
"step": 7420,
"token_acc": 0.9114065659825309
},
{
"epoch": 0.9511913912375096,
"grad_norm": 3.453125,
"learning_rate": 1.30003230609127e-07,
"loss": 0.22815487384796143,
"step": 7425,
"token_acc": 0.9114841828147253
},
{
"epoch": 0.9518319241609019,
"grad_norm": 3.375,
"learning_rate": 1.266206338187448e-07,
"loss": 0.2252589225769043,
"step": 7430,
"token_acc": 0.911378744712078
},
{
"epoch": 0.9524724570842942,
"grad_norm": 4.03125,
"learning_rate": 1.2328234512611893e-07,
"loss": 0.23881807327270507,
"step": 7435,
"token_acc": 0.9063224808865218
},
{
"epoch": 0.9531129900076863,
"grad_norm": 3.765625,
"learning_rate": 1.1998837951226027e-07,
"loss": 0.2236201286315918,
"step": 7440,
"token_acc": 0.9124207256568445
},
{
"epoch": 0.9537535229310786,
"grad_norm": 3.671875,
"learning_rate": 1.1673875175927773e-07,
"loss": 0.22488207817077638,
"step": 7445,
"token_acc": 0.9117697816895332
},
{
"epoch": 0.9543940558544709,
"grad_norm": 15.125,
"learning_rate": 1.1353347645030488e-07,
"loss": 0.23006877899169922,
"step": 7450,
"token_acc": 0.9091143483305402
},
{
"epoch": 0.9550345887778632,
"grad_norm": 3.90625,
"learning_rate": 1.1037256796943896e-07,
"loss": 0.23117449283599853,
"step": 7455,
"token_acc": 0.9106395825246906
},
{
"epoch": 0.9556751217012555,
"grad_norm": 3.453125,
"learning_rate": 1.072560405016776e-07,
"loss": 0.22410707473754882,
"step": 7460,
"token_acc": 0.913257805067889
},
{
"epoch": 0.9563156546246477,
"grad_norm": 3.1875,
"learning_rate": 1.0418390803284772e-07,
"loss": 0.22124795913696288,
"step": 7465,
"token_acc": 0.9126255442044916
},
{
"epoch": 0.95695618754804,
"grad_norm": 3.0,
"learning_rate": 1.0115618434955233e-07,
"loss": 0.22695465087890626,
"step": 7470,
"token_acc": 0.9115334773218142
},
{
"epoch": 0.9575967204714322,
"grad_norm": 2.796875,
"learning_rate": 9.817288303910267e-08,
"loss": 0.22336146831512452,
"step": 7475,
"token_acc": 0.9113754903228587
},
{
"epoch": 0.9582372533948245,
"grad_norm": 2.828125,
"learning_rate": 9.523401748945837e-08,
"loss": 0.22532784938812256,
"step": 7480,
"token_acc": 0.9119910089046425
},
{
"epoch": 0.9588777863182167,
"grad_norm": 5.875,
"learning_rate": 9.233960088916749e-08,
"loss": 0.23188343048095703,
"step": 7485,
"token_acc": 0.9097377954114197
},
{
"epoch": 0.959518319241609,
"grad_norm": 12.875,
"learning_rate": 8.948964622730761e-08,
"loss": 0.22753703594207764,
"step": 7490,
"token_acc": 0.911497176359012
},
{
"epoch": 0.9601588521650013,
"grad_norm": 4.84375,
"learning_rate": 8.668416629342813e-08,
"loss": 0.23263895511627197,
"step": 7495,
"token_acc": 0.9096037898363479
},
{
"epoch": 0.9607993850883936,
"grad_norm": 3.40625,
"learning_rate": 8.392317367749259e-08,
"loss": 0.23171706199645997,
"step": 7500,
"token_acc": 0.9093455125166962
},
{
"epoch": 0.9607993850883936,
"eval_loss": 0.3347827196121216,
"eval_runtime": 102.5334,
"eval_samples_per_second": 97.529,
"eval_steps_per_second": 12.191,
"eval_token_acc": 0.8825099246446784,
"step": 7500
},
{
"epoch": 0.9614399180117859,
"grad_norm": 2.796875,
"learning_rate": 8.120668076982085e-08,
"loss": 0.23077220916748048,
"step": 7505,
"token_acc": 0.9088870682592385
},
{
"epoch": 0.962080450935178,
"grad_norm": 3.296875,
"learning_rate": 7.853469976103367e-08,
"loss": 0.2174984931945801,
"step": 7510,
"token_acc": 0.9130472325360505
},
{
"epoch": 0.9627209838585703,
"grad_norm": 2.546875,
"learning_rate": 7.590724264200044e-08,
"loss": 0.2254131555557251,
"step": 7515,
"token_acc": 0.9099638305201516
},
{
"epoch": 0.9633615167819626,
"grad_norm": 2.828125,
"learning_rate": 7.332432120378263e-08,
"loss": 0.21682121753692626,
"step": 7520,
"token_acc": 0.9132147340172272
},
{
"epoch": 0.9640020497053549,
"grad_norm": 4.53125,
"learning_rate": 7.07859470375838e-08,
"loss": 0.22123939990997316,
"step": 7525,
"token_acc": 0.913344287814581
},
{
"epoch": 0.9646425826287471,
"grad_norm": 4.0,
"learning_rate": 6.829213153469294e-08,
"loss": 0.2257563591003418,
"step": 7530,
"token_acc": 0.9111034244206156
},
{
"epoch": 0.9652831155521394,
"grad_norm": 3.25,
"learning_rate": 6.584288588643795e-08,
"loss": 0.21516809463500977,
"step": 7535,
"token_acc": 0.9163821788168186
},
{
"epoch": 0.9659236484755317,
"grad_norm": 5.46875,
"learning_rate": 6.343822108413111e-08,
"loss": 0.23532419204711913,
"step": 7540,
"token_acc": 0.9062943071965628
},
{
"epoch": 0.9665641813989239,
"grad_norm": 3.515625,
"learning_rate": 6.10781479190281e-08,
"loss": 0.22491927146911622,
"step": 7545,
"token_acc": 0.9103519579545944
},
{
"epoch": 0.9672047143223161,
"grad_norm": 3.078125,
"learning_rate": 5.8762676982265785e-08,
"loss": 0.23122644424438477,
"step": 7550,
"token_acc": 0.9087346024636058
},
{
"epoch": 0.9678452472457084,
"grad_norm": 2.75,
"learning_rate": 5.649181866483e-08,
"loss": 0.22680530548095704,
"step": 7555,
"token_acc": 0.911484593837535
},
{
"epoch": 0.9684857801691007,
"grad_norm": 4.34375,
"learning_rate": 5.426558315749675e-08,
"loss": 0.22133951187133788,
"step": 7560,
"token_acc": 0.9124602287384986
},
{
"epoch": 0.969126313092493,
"grad_norm": 4.71875,
"learning_rate": 5.208398045079222e-08,
"loss": 0.2312103033065796,
"step": 7565,
"token_acc": 0.9101176672678862
},
{
"epoch": 0.9697668460158853,
"grad_norm": 3.078125,
"learning_rate": 4.994702033494947e-08,
"loss": 0.22229225635528566,
"step": 7570,
"token_acc": 0.9136234136234136
},
{
"epoch": 0.9704073789392775,
"grad_norm": 3.125,
"learning_rate": 4.785471239985851e-08,
"loss": 0.2287161111831665,
"step": 7575,
"token_acc": 0.9087663454920853
},
{
"epoch": 0.9710479118626697,
"grad_norm": 2.75,
"learning_rate": 4.5807066035028494e-08,
"loss": 0.226922607421875,
"step": 7580,
"token_acc": 0.9117697816895332
},
{
"epoch": 0.971688444786062,
"grad_norm": 7.21875,
"learning_rate": 4.3804090429543366e-08,
"loss": 0.23184614181518554,
"step": 7585,
"token_acc": 0.9090673798636874
},
{
"epoch": 0.9723289777094543,
"grad_norm": 3.640625,
"learning_rate": 4.184579457202298e-08,
"loss": 0.22905006408691406,
"step": 7590,
"token_acc": 0.9108255451713395
},
{
"epoch": 0.9729695106328465,
"grad_norm": 3.046875,
"learning_rate": 3.993218725057868e-08,
"loss": 0.22608802318572999,
"step": 7595,
"token_acc": 0.911466643667256
},
{
"epoch": 0.9736100435562388,
"grad_norm": 3.140625,
"learning_rate": 3.806327705277557e-08,
"loss": 0.23126821517944335,
"step": 7600,
"token_acc": 0.9085889305897071
},
{
"epoch": 0.9736100435562388,
"eval_loss": 0.33461424708366394,
"eval_runtime": 103.0674,
"eval_samples_per_second": 97.024,
"eval_steps_per_second": 12.128,
"eval_token_acc": 0.8822912225722686,
"step": 7600
},
{
"epoch": 0.9742505764796311,
"grad_norm": 2.890625,
"learning_rate": 3.6239072365596984e-08,
"loss": 0.23053784370422364,
"step": 7605,
"token_acc": 0.9106689064047896
},
{
"epoch": 0.9748911094030234,
"grad_norm": 3.484375,
"learning_rate": 3.4459581375403395e-08,
"loss": 0.22285847663879393,
"step": 7610,
"token_acc": 0.9107173725151253
},
{
"epoch": 0.9755316423264155,
"grad_norm": 3.390625,
"learning_rate": 3.2724812067895795e-08,
"loss": 0.22678759098052978,
"step": 7615,
"token_acc": 0.9103671706263499
},
{
"epoch": 0.9761721752498078,
"grad_norm": 3.125,
"learning_rate": 3.103477222808016e-08,
"loss": 0.22554678916931153,
"step": 7620,
"token_acc": 0.9114635094845094
},
{
"epoch": 0.9768127081732001,
"grad_norm": 3.171875,
"learning_rate": 2.938946944023635e-08,
"loss": 0.2290804386138916,
"step": 7625,
"token_acc": 0.9116782006920415
},
{
"epoch": 0.9774532410965924,
"grad_norm": 2.96875,
"learning_rate": 2.7788911087877067e-08,
"loss": 0.22720465660095215,
"step": 7630,
"token_acc": 0.910641229921192
},
{
"epoch": 0.9780937740199847,
"grad_norm": 3.40625,
"learning_rate": 2.6233104353720063e-08,
"loss": 0.22470946311950685,
"step": 7635,
"token_acc": 0.9122565074987071
},
{
"epoch": 0.9787343069433769,
"grad_norm": 3.4375,
"learning_rate": 2.4722056219654843e-08,
"loss": 0.22875847816467285,
"step": 7640,
"token_acc": 0.9108098773959592
},
{
"epoch": 0.9793748398667691,
"grad_norm": 3.609375,
"learning_rate": 2.3255773466708266e-08,
"loss": 0.2208240509033203,
"step": 7645,
"token_acc": 0.9132234969378072
},
{
"epoch": 0.9800153727901614,
"grad_norm": 4.75,
"learning_rate": 2.1834262675021202e-08,
"loss": 0.2248084306716919,
"step": 7650,
"token_acc": 0.9121955410455775
},
{
"epoch": 0.9806559057135537,
"grad_norm": 3.625,
"learning_rate": 2.0457530223809695e-08,
"loss": 0.2183553695678711,
"step": 7655,
"token_acc": 0.9145100069013112
},
{
"epoch": 0.9812964386369459,
"grad_norm": 12.8125,
"learning_rate": 1.912558229134387e-08,
"loss": 0.22381486892700195,
"step": 7660,
"token_acc": 0.9117545822218398
},
{
"epoch": 0.9819369715603382,
"grad_norm": 2.734375,
"learning_rate": 1.7838424854915714e-08,
"loss": 0.22946505546569823,
"step": 7665,
"token_acc": 0.9092513668259503
},
{
"epoch": 0.9825775044837305,
"grad_norm": 3.125,
"learning_rate": 1.659606369081468e-08,
"loss": 0.23253355026245118,
"step": 7670,
"token_acc": 0.9086559186136736
},
{
"epoch": 0.9832180374071228,
"grad_norm": 4.375,
"learning_rate": 1.5398504374302124e-08,
"loss": 0.21708984375,
"step": 7675,
"token_acc": 0.9138436341694681
},
{
"epoch": 0.9838585703305149,
"grad_norm": 6.65625,
"learning_rate": 1.424575227958358e-08,
"loss": 0.2197282314300537,
"step": 7680,
"token_acc": 0.9130565972671236
},
{
"epoch": 0.9844991032539072,
"grad_norm": 3.65625,
"learning_rate": 1.3137812579785415e-08,
"loss": 0.22876739501953125,
"step": 7685,
"token_acc": 0.9095221666379162
},
{
"epoch": 0.9851396361772995,
"grad_norm": 2.953125,
"learning_rate": 1.2074690246937081e-08,
"loss": 0.22080717086791993,
"step": 7690,
"token_acc": 0.9124427942319316
},
{
"epoch": 0.9857801691006918,
"grad_norm": 5.46875,
"learning_rate": 1.1056390051936705e-08,
"loss": 0.23291680812835694,
"step": 7695,
"token_acc": 0.9067782067782068
},
{
"epoch": 0.986420702024084,
"grad_norm": 3.1875,
"learning_rate": 1.008291656454441e-08,
"loss": 0.22717700004577637,
"step": 7700,
"token_acc": 0.9099401610056395
},
{
"epoch": 0.986420702024084,
"eval_loss": 0.33420565724372864,
"eval_runtime": 102.458,
"eval_samples_per_second": 97.601,
"eval_steps_per_second": 12.2,
"eval_token_acc": 0.8827535421430588,
"step": 7700
},
{
"epoch": 0.9870612349474763,
"grad_norm": 2.84375,
"learning_rate": 9.154274153351239e-09,
"loss": 0.2262244701385498,
"step": 7705,
"token_acc": 0.9095655175389394
},
{
"epoch": 0.9877017678708686,
"grad_norm": 2.875,
"learning_rate": 8.270466985761393e-09,
"loss": 0.22812366485595703,
"step": 7710,
"token_acc": 0.9118508311084317
},
{
"epoch": 0.9883423007942608,
"grad_norm": 2.875,
"learning_rate": 7.431499027976685e-09,
"loss": 0.2235403537750244,
"step": 7715,
"token_acc": 0.9136387118994317
},
{
"epoch": 0.988982833717653,
"grad_norm": 2.84375,
"learning_rate": 6.637374044978772e-09,
"loss": 0.2251359224319458,
"step": 7720,
"token_acc": 0.9123222748815166
},
{
"epoch": 0.9896233666410453,
"grad_norm": 3.171875,
"learning_rate": 5.88809560050696e-09,
"loss": 0.22147438526153565,
"step": 7725,
"token_acc": 0.9122496546961326
},
{
"epoch": 0.9902638995644376,
"grad_norm": 11.375,
"learning_rate": 5.1836670570493135e-09,
"loss": 0.23077549934387206,
"step": 7730,
"token_acc": 0.9092400690846286
},
{
"epoch": 0.9909044324878299,
"grad_norm": 3.09375,
"learning_rate": 4.524091575819345e-09,
"loss": 0.23009955883026123,
"step": 7735,
"token_acc": 0.9085662603901977
},
{
"epoch": 0.9915449654112222,
"grad_norm": 3.125,
"learning_rate": 3.9093721167526854e-09,
"loss": 0.2305884838104248,
"step": 7740,
"token_acc": 0.9105272196462305
},
{
"epoch": 0.9921854983346144,
"grad_norm": 2.859375,
"learning_rate": 3.339511438481546e-09,
"loss": 0.23010706901550293,
"step": 7745,
"token_acc": 0.9100301334481274
},
{
"epoch": 0.9928260312580066,
"grad_norm": 2.671875,
"learning_rate": 2.8145120983336106e-09,
"loss": 0.23720641136169435,
"step": 7750,
"token_acc": 0.9077843280691941
},
{
"epoch": 0.9934665641813989,
"grad_norm": 3.515625,
"learning_rate": 2.334376452310938e-09,
"loss": 0.2344132900238037,
"step": 7755,
"token_acc": 0.9084367459496725
},
{
"epoch": 0.9941070971047912,
"grad_norm": 3.03125,
"learning_rate": 1.899106655087746e-09,
"loss": 0.23155610561370848,
"step": 7760,
"token_acc": 0.9105244966732913
},
{
"epoch": 0.9947476300281834,
"grad_norm": 2.8125,
"learning_rate": 1.5087046599926435e-09,
"loss": 0.22453222274780274,
"step": 7765,
"token_acc": 0.9133017649591046
},
{
"epoch": 0.9953881629515757,
"grad_norm": 2.984375,
"learning_rate": 1.1631722190086348e-09,
"loss": 0.22471303939819337,
"step": 7770,
"token_acc": 0.9115940774092995
},
{
"epoch": 0.996028695874968,
"grad_norm": 3.28125,
"learning_rate": 8.625108827564621e-10,
"loss": 0.22228624820709228,
"step": 7775,
"token_acc": 0.9125803251822142
},
{
"epoch": 0.9966692287983603,
"grad_norm": 4.40625,
"learning_rate": 6.067220004946084e-10,
"loss": 0.226347017288208,
"step": 7780,
"token_acc": 0.9129628831314394
},
{
"epoch": 0.9973097617217525,
"grad_norm": 2.671875,
"learning_rate": 3.958067201093041e-10,
"loss": 0.2226627826690674,
"step": 7785,
"token_acc": 0.91291213533575
},
{
"epoch": 0.9979502946451447,
"grad_norm": 5.34375,
"learning_rate": 2.297659881111969e-10,
"loss": 0.22344522476196288,
"step": 7790,
"token_acc": 0.9124097007223942
},
{
"epoch": 0.998590827568537,
"grad_norm": 2.765625,
"learning_rate": 1.0860054962980038e-10,
"loss": 0.22511889934539794,
"step": 7795,
"token_acc": 0.9111601540525337
},
{
"epoch": 0.9992313604919293,
"grad_norm": 2.859375,
"learning_rate": 3.23109484112738e-11,
"loss": 0.22252602577209474,
"step": 7800,
"token_acc": 0.9127169127169127
},
{
"epoch": 0.9992313604919293,
"eval_loss": 0.3345825672149658,
"eval_runtime": 103.1214,
"eval_samples_per_second": 96.973,
"eval_steps_per_second": 12.122,
"eval_token_acc": 0.8827258583364247,
"step": 7800
},
{
"epoch": 0.9998718934153216,
"grad_norm": 7.1875,
"learning_rate": 8.975268150912541e-13,
"loss": 0.22932782173156738,
"step": 7805,
"token_acc": 0.9116207163102293
},
{
"epoch": 1.0,
"eval_loss": 0.33455130457878113,
"eval_runtime": 101.6743,
"eval_samples_per_second": 98.353,
"eval_steps_per_second": 12.294,
"eval_token_acc": 0.8823687372308442,
"step": 7806
}
],
"logging_steps": 5,
"max_steps": 7806,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.2803424419153183e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}