S1-DeepResearch-32B / trainer_state.json
ScienceOne-AI's picture
Upload folder using huggingface_hub
6d30fe1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032102728731942215,
"grad_norm": 2.4753810438494113,
"learning_rate": 1.4893617021276595e-07,
"loss": 0.7879232168197632,
"step": 1,
"token_acc": 0.7756388598457696
},
{
"epoch": 0.006420545746388443,
"grad_norm": 2.0688547647937128,
"learning_rate": 2.978723404255319e-07,
"loss": 0.8343099355697632,
"step": 2,
"token_acc": 0.7691886054104653
},
{
"epoch": 0.009630818619582664,
"grad_norm": 2.2647251246154494,
"learning_rate": 4.4680851063829783e-07,
"loss": 0.8496907949447632,
"step": 3,
"token_acc": 0.7638933733394063
},
{
"epoch": 0.012841091492776886,
"grad_norm": 2.309927904076833,
"learning_rate": 5.957446808510638e-07,
"loss": 0.84716796875,
"step": 4,
"token_acc": 0.7630275625571407
},
{
"epoch": 0.016051364365971106,
"grad_norm": 2.537086534556216,
"learning_rate": 7.446808510638298e-07,
"loss": 0.85986328125,
"step": 5,
"token_acc": 0.7569718906167684
},
{
"epoch": 0.019261637239165328,
"grad_norm": 2.4172209878112727,
"learning_rate": 8.936170212765957e-07,
"loss": 0.800048828125,
"step": 6,
"token_acc": 0.7704034280523686
},
{
"epoch": 0.02247191011235955,
"grad_norm": 2.2999894579971696,
"learning_rate": 1.0425531914893615e-06,
"loss": 0.8470052480697632,
"step": 7,
"token_acc": 0.7622465717309417
},
{
"epoch": 0.025682182985553772,
"grad_norm": 1.964783949379289,
"learning_rate": 1.1914893617021276e-06,
"loss": 0.83154296875,
"step": 8,
"token_acc": 0.76941155597467
},
{
"epoch": 0.028892455858747994,
"grad_norm": 1.9180672449757712,
"learning_rate": 1.3404255319148935e-06,
"loss": 0.7979329824447632,
"step": 9,
"token_acc": 0.7754702921919281
},
{
"epoch": 0.03210272873194221,
"grad_norm": 2.240827763658055,
"learning_rate": 1.4893617021276596e-06,
"loss": 0.8218180537223816,
"step": 10,
"token_acc": 0.7658292813448604
},
{
"epoch": 0.03531300160513644,
"grad_norm": 1.8783322418040913,
"learning_rate": 1.6382978723404255e-06,
"loss": 0.8649088740348816,
"step": 11,
"token_acc": 0.7554619804912056
},
{
"epoch": 0.038523274478330656,
"grad_norm": 1.3644736752145021,
"learning_rate": 1.7872340425531913e-06,
"loss": 0.7535807490348816,
"step": 12,
"token_acc": 0.7855808513669389
},
{
"epoch": 0.04173354735152488,
"grad_norm": 1.4208147986467878,
"learning_rate": 1.9361702127659576e-06,
"loss": 0.7366536855697632,
"step": 13,
"token_acc": 0.7883947769962283
},
{
"epoch": 0.0449438202247191,
"grad_norm": 1.421550488421433,
"learning_rate": 2.085106382978723e-06,
"loss": 0.742919921875,
"step": 14,
"token_acc": 0.7863493121733124
},
{
"epoch": 0.048154093097913325,
"grad_norm": 1.3022390396539427,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.7919921875,
"step": 15,
"token_acc": 0.7746175480275829
},
{
"epoch": 0.051364365971107544,
"grad_norm": 1.1231071316042414,
"learning_rate": 2.3829787234042553e-06,
"loss": 0.7566325068473816,
"step": 16,
"token_acc": 0.7794038646429666
},
{
"epoch": 0.05457463884430177,
"grad_norm": 0.9333351622252422,
"learning_rate": 2.5319148936170216e-06,
"loss": 0.7769368886947632,
"step": 17,
"token_acc": 0.777008609403375
},
{
"epoch": 0.05778491171749599,
"grad_norm": 0.929802928146088,
"learning_rate": 2.680851063829787e-06,
"loss": 0.7835286855697632,
"step": 18,
"token_acc": 0.772856360370574
},
{
"epoch": 0.060995184590690206,
"grad_norm": 0.9066023006988848,
"learning_rate": 2.829787234042553e-06,
"loss": 0.7682291865348816,
"step": 19,
"token_acc": 0.7768975575262956
},
{
"epoch": 0.06420545746388442,
"grad_norm": 0.7225896156448947,
"learning_rate": 2.978723404255319e-06,
"loss": 0.7460123896598816,
"step": 20,
"token_acc": 0.7847207901601844
},
{
"epoch": 0.06741573033707865,
"grad_norm": 0.7109085666571867,
"learning_rate": 3.127659574468085e-06,
"loss": 0.7208659052848816,
"step": 21,
"token_acc": 0.7857597824218354
},
{
"epoch": 0.07062600321027288,
"grad_norm": 0.8150780472279927,
"learning_rate": 3.276595744680851e-06,
"loss": 0.7197265625,
"step": 22,
"token_acc": 0.7923169108386342
},
{
"epoch": 0.0738362760834671,
"grad_norm": 0.7753241464505493,
"learning_rate": 3.425531914893617e-06,
"loss": 0.6572672724723816,
"step": 23,
"token_acc": 0.8042959454201616
},
{
"epoch": 0.07704654895666131,
"grad_norm": 1.0500219315630146,
"learning_rate": 3.5744680851063827e-06,
"loss": 0.7864583730697632,
"step": 24,
"token_acc": 0.7682198177819095
},
{
"epoch": 0.08025682182985554,
"grad_norm": 1.0637751453192557,
"learning_rate": 3.723404255319149e-06,
"loss": 0.7196452021598816,
"step": 25,
"token_acc": 0.7878780656644498
},
{
"epoch": 0.08346709470304976,
"grad_norm": 1.122751448745593,
"learning_rate": 3.872340425531915e-06,
"loss": 0.763916015625,
"step": 26,
"token_acc": 0.7736624491344681
},
{
"epoch": 0.08667736757624397,
"grad_norm": 0.8175549093574106,
"learning_rate": 4.0212765957446816e-06,
"loss": 0.7271322011947632,
"step": 27,
"token_acc": 0.7838902363141947
},
{
"epoch": 0.0898876404494382,
"grad_norm": 0.8938896614780193,
"learning_rate": 4.170212765957446e-06,
"loss": 0.710205078125,
"step": 28,
"token_acc": 0.7874908688244899
},
{
"epoch": 0.09309791332263243,
"grad_norm": 0.7111591256473304,
"learning_rate": 4.3191489361702125e-06,
"loss": 0.680419921875,
"step": 29,
"token_acc": 0.7951899206740415
},
{
"epoch": 0.09630818619582665,
"grad_norm": 0.5553474125446014,
"learning_rate": 4.468085106382979e-06,
"loss": 0.715576171875,
"step": 30,
"token_acc": 0.78816810172129
},
{
"epoch": 0.09951845906902086,
"grad_norm": 0.6125224035794444,
"learning_rate": 4.617021276595744e-06,
"loss": 0.7332357168197632,
"step": 31,
"token_acc": 0.7818265572355082
},
{
"epoch": 0.10272873194221509,
"grad_norm": 0.5248667594940402,
"learning_rate": 4.7659574468085105e-06,
"loss": 0.6513671875,
"step": 32,
"token_acc": 0.8066042872298029
},
{
"epoch": 0.10593900481540931,
"grad_norm": 0.6190819259396853,
"learning_rate": 4.914893617021277e-06,
"loss": 0.7200521230697632,
"step": 33,
"token_acc": 0.7850674209358465
},
{
"epoch": 0.10914927768860354,
"grad_norm": 0.6451198104159461,
"learning_rate": 5.063829787234043e-06,
"loss": 0.6764323115348816,
"step": 34,
"token_acc": 0.7948570707957826
},
{
"epoch": 0.11235955056179775,
"grad_norm": 0.5786823323345861,
"learning_rate": 5.2127659574468086e-06,
"loss": 0.6573486328125,
"step": 35,
"token_acc": 0.7999205209200293
},
{
"epoch": 0.11556982343499198,
"grad_norm": 0.5871758694058516,
"learning_rate": 5.361702127659574e-06,
"loss": 0.7178548574447632,
"step": 36,
"token_acc": 0.7870625485393392
},
{
"epoch": 0.1187800963081862,
"grad_norm": 0.5369227527562553,
"learning_rate": 5.51063829787234e-06,
"loss": 0.6741536855697632,
"step": 37,
"token_acc": 0.795958329652928
},
{
"epoch": 0.12199036918138041,
"grad_norm": 0.42895258820168175,
"learning_rate": 5.659574468085106e-06,
"loss": 0.706787109375,
"step": 38,
"token_acc": 0.788144228221681
},
{
"epoch": 0.12520064205457465,
"grad_norm": 0.43361997762214843,
"learning_rate": 5.808510638297872e-06,
"loss": 0.7215983271598816,
"step": 39,
"token_acc": 0.7811117204862373
},
{
"epoch": 0.12841091492776885,
"grad_norm": 0.39452497235838196,
"learning_rate": 5.957446808510638e-06,
"loss": 0.6888021230697632,
"step": 40,
"token_acc": 0.7913482530959579
},
{
"epoch": 0.13162118780096307,
"grad_norm": 0.38163409173267143,
"learning_rate": 6.106382978723405e-06,
"loss": 0.7011312246322632,
"step": 41,
"token_acc": 0.7881589276009903
},
{
"epoch": 0.1348314606741573,
"grad_norm": 0.4436482274993076,
"learning_rate": 6.25531914893617e-06,
"loss": 0.6253255605697632,
"step": 42,
"token_acc": 0.8081617238255353
},
{
"epoch": 0.13804173354735153,
"grad_norm": 0.4375956803307934,
"learning_rate": 6.404255319148936e-06,
"loss": 0.6582845449447632,
"step": 43,
"token_acc": 0.7959535510226482
},
{
"epoch": 0.14125200642054575,
"grad_norm": 0.39330235188333057,
"learning_rate": 6.553191489361702e-06,
"loss": 0.6446126699447632,
"step": 44,
"token_acc": 0.8014317040118041
},
{
"epoch": 0.14446227929373998,
"grad_norm": 0.37194650846262567,
"learning_rate": 6.702127659574468e-06,
"loss": 0.6810709834098816,
"step": 45,
"token_acc": 0.792782252006574
},
{
"epoch": 0.1476725521669342,
"grad_norm": 0.31530649176304015,
"learning_rate": 6.851063829787234e-06,
"loss": 0.6333822011947632,
"step": 46,
"token_acc": 0.8059375415995946
},
{
"epoch": 0.1508828250401284,
"grad_norm": 0.31437722737003226,
"learning_rate": 7e-06,
"loss": 0.630126953125,
"step": 47,
"token_acc": 0.8044129458240332
},
{
"epoch": 0.15409309791332262,
"grad_norm": 0.274516902775598,
"learning_rate": 6.999992454990655e-06,
"loss": 0.6744791865348816,
"step": 48,
"token_acc": 0.7951283361151459
},
{
"epoch": 0.15730337078651685,
"grad_norm": 0.3108158442861642,
"learning_rate": 6.999969819995152e-06,
"loss": 0.6378580927848816,
"step": 49,
"token_acc": 0.8030468177870123
},
{
"epoch": 0.16051364365971107,
"grad_norm": 0.3116005863059165,
"learning_rate": 6.999932095111077e-06,
"loss": 0.6702067255973816,
"step": 50,
"token_acc": 0.7972457736243002
},
{
"epoch": 0.1637239165329053,
"grad_norm": 0.32992462397943517,
"learning_rate": 6.999879280501081e-06,
"loss": 0.572021484375,
"step": 51,
"token_acc": 0.823265086079219
},
{
"epoch": 0.16693418940609953,
"grad_norm": 0.3633482687659926,
"learning_rate": 6.999811376392871e-06,
"loss": 0.6148681640625,
"step": 52,
"token_acc": 0.809984832111816
},
{
"epoch": 0.17014446227929375,
"grad_norm": 0.30599321045708705,
"learning_rate": 6.999728383079208e-06,
"loss": 0.62841796875,
"step": 53,
"token_acc": 0.8064036729664908
},
{
"epoch": 0.17335473515248795,
"grad_norm": 0.25905438447963114,
"learning_rate": 6.999630300917915e-06,
"loss": 0.6336263418197632,
"step": 54,
"token_acc": 0.8045163559291338
},
{
"epoch": 0.17656500802568217,
"grad_norm": 0.2447879175964714,
"learning_rate": 6.999517130331867e-06,
"loss": 0.5997314453125,
"step": 55,
"token_acc": 0.8159714374370309
},
{
"epoch": 0.1797752808988764,
"grad_norm": 0.2974010229219283,
"learning_rate": 6.999388871808989e-06,
"loss": 0.6444498896598816,
"step": 56,
"token_acc": 0.8016444560621508
},
{
"epoch": 0.18298555377207062,
"grad_norm": 0.27346434417319015,
"learning_rate": 6.999245525902262e-06,
"loss": 0.6520182490348816,
"step": 57,
"token_acc": 0.8006354831734891
},
{
"epoch": 0.18619582664526485,
"grad_norm": 0.24593285011001234,
"learning_rate": 6.9990870932297095e-06,
"loss": 0.6388346552848816,
"step": 58,
"token_acc": 0.804055135767979
},
{
"epoch": 0.18940609951845908,
"grad_norm": 0.2748750065623674,
"learning_rate": 6.998913574474406e-06,
"loss": 0.65362548828125,
"step": 59,
"token_acc": 0.8014784291270444
},
{
"epoch": 0.1926163723916533,
"grad_norm": 0.2968798961595123,
"learning_rate": 6.998724970384465e-06,
"loss": 0.6461588740348816,
"step": 60,
"token_acc": 0.7991732757932588
},
{
"epoch": 0.1958266452648475,
"grad_norm": 0.23922264245631886,
"learning_rate": 6.998521281773041e-06,
"loss": 0.6253255605697632,
"step": 61,
"token_acc": 0.8057149527805801
},
{
"epoch": 0.19903691813804172,
"grad_norm": 0.22117484958176203,
"learning_rate": 6.998302509518322e-06,
"loss": 0.5834554433822632,
"step": 62,
"token_acc": 0.8194585867590003
},
{
"epoch": 0.20224719101123595,
"grad_norm": 0.2369306420451283,
"learning_rate": 6.998068654563534e-06,
"loss": 0.6092122793197632,
"step": 63,
"token_acc": 0.8114008548966387
},
{
"epoch": 0.20545746388443017,
"grad_norm": 0.2374452306327778,
"learning_rate": 6.997819717916924e-06,
"loss": 0.57958984375,
"step": 64,
"token_acc": 0.8179867806451919
},
{
"epoch": 0.2086677367576244,
"grad_norm": 0.24491102842825901,
"learning_rate": 6.997555700651767e-06,
"loss": 0.6301676630973816,
"step": 65,
"token_acc": 0.8041523895074651
},
{
"epoch": 0.21187800963081863,
"grad_norm": 0.2533635108541559,
"learning_rate": 6.997276603906356e-06,
"loss": 0.6229248046875,
"step": 66,
"token_acc": 0.8068026927120842
},
{
"epoch": 0.21508828250401285,
"grad_norm": 0.25272404960266603,
"learning_rate": 6.996982428883997e-06,
"loss": 0.6161295771598816,
"step": 67,
"token_acc": 0.8068630844776081
},
{
"epoch": 0.21829855537720708,
"grad_norm": 0.24472278025695055,
"learning_rate": 6.996673176853009e-06,
"loss": 0.64013671875,
"step": 68,
"token_acc": 0.8036322295114298
},
{
"epoch": 0.22150882825040127,
"grad_norm": 0.2854371340953023,
"learning_rate": 6.9963488491467085e-06,
"loss": 0.6022135615348816,
"step": 69,
"token_acc": 0.8134928438501238
},
{
"epoch": 0.2247191011235955,
"grad_norm": 0.25148079147298236,
"learning_rate": 6.996009447163415e-06,
"loss": 0.6416015625,
"step": 70,
"token_acc": 0.8002263049431778
},
{
"epoch": 0.22792937399678972,
"grad_norm": 0.3285179445881002,
"learning_rate": 6.995654972366437e-06,
"loss": 0.6038411855697632,
"step": 71,
"token_acc": 0.8102099753673148
},
{
"epoch": 0.23113964686998395,
"grad_norm": 0.27484980697196754,
"learning_rate": 6.995285426284069e-06,
"loss": 0.6334635615348816,
"step": 72,
"token_acc": 0.8013070425989607
},
{
"epoch": 0.23434991974317818,
"grad_norm": 0.22943171657225794,
"learning_rate": 6.994900810509586e-06,
"loss": 0.62158203125,
"step": 73,
"token_acc": 0.8064510438728842
},
{
"epoch": 0.2375601926163724,
"grad_norm": 0.22114969362856388,
"learning_rate": 6.994501126701231e-06,
"loss": 0.607666015625,
"step": 74,
"token_acc": 0.8105192034063484
},
{
"epoch": 0.24077046548956663,
"grad_norm": 0.21411939799484717,
"learning_rate": 6.994086376582216e-06,
"loss": 0.64404296875,
"step": 75,
"token_acc": 0.8001576820735192
},
{
"epoch": 0.24398073836276082,
"grad_norm": 0.30133931257007796,
"learning_rate": 6.993656561940708e-06,
"loss": 0.6025797724723816,
"step": 76,
"token_acc": 0.8135537136285721
},
{
"epoch": 0.24719101123595505,
"grad_norm": 0.23684167938690207,
"learning_rate": 6.993211684629825e-06,
"loss": 0.628662109375,
"step": 77,
"token_acc": 0.8000817371987206
},
{
"epoch": 0.2504012841091493,
"grad_norm": 0.22858724073281955,
"learning_rate": 6.992751746567627e-06,
"loss": 0.58447265625,
"step": 78,
"token_acc": 0.8176294504797399
},
{
"epoch": 0.2536115569823435,
"grad_norm": 0.2266475406878526,
"learning_rate": 6.9922767497371035e-06,
"loss": 0.6127523183822632,
"step": 79,
"token_acc": 0.8079507163086846
},
{
"epoch": 0.2568218298555377,
"grad_norm": 0.2103005674438925,
"learning_rate": 6.991786696186174e-06,
"loss": 0.5852457880973816,
"step": 80,
"token_acc": 0.8153684854626521
},
{
"epoch": 0.26003210272873195,
"grad_norm": 0.22723109993760274,
"learning_rate": 6.9912815880276726e-06,
"loss": 0.6097819209098816,
"step": 81,
"token_acc": 0.8103663148795159
},
{
"epoch": 0.26324237560192615,
"grad_norm": 0.29984159529039645,
"learning_rate": 6.990761427439339e-06,
"loss": 0.6161702871322632,
"step": 82,
"token_acc": 0.807772867606998
},
{
"epoch": 0.2664526484751204,
"grad_norm": 0.20165801770814,
"learning_rate": 6.990226216663812e-06,
"loss": 0.6199544668197632,
"step": 83,
"token_acc": 0.8052925761371553
},
{
"epoch": 0.2696629213483146,
"grad_norm": 0.20292966725426143,
"learning_rate": 6.989675958008616e-06,
"loss": 0.6083984375,
"step": 84,
"token_acc": 0.8112003737594581
},
{
"epoch": 0.27287319422150885,
"grad_norm": 0.23864143651314107,
"learning_rate": 6.9891106538461556e-06,
"loss": 0.6287435293197632,
"step": 85,
"token_acc": 0.8035761022475708
},
{
"epoch": 0.27608346709470305,
"grad_norm": 0.20165640241083152,
"learning_rate": 6.988530306613702e-06,
"loss": 0.562744140625,
"step": 86,
"token_acc": 0.8244872851494846
},
{
"epoch": 0.27929373996789725,
"grad_norm": 0.24403191645122582,
"learning_rate": 6.987934918813385e-06,
"loss": 0.6338704824447632,
"step": 87,
"token_acc": 0.7988280640081371
},
{
"epoch": 0.2825040128410915,
"grad_norm": 0.19545947759314103,
"learning_rate": 6.987324493012178e-06,
"loss": 0.5802001953125,
"step": 88,
"token_acc": 0.8199500271690323
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.2253804481388312,
"learning_rate": 6.986699031841892e-06,
"loss": 0.595947265625,
"step": 89,
"token_acc": 0.8126973130192251
},
{
"epoch": 0.28892455858747995,
"grad_norm": 0.21931866178043172,
"learning_rate": 6.986058537999162e-06,
"loss": 0.5707194209098816,
"step": 90,
"token_acc": 0.8200324964871083
},
{
"epoch": 0.29213483146067415,
"grad_norm": 0.20499347869349804,
"learning_rate": 6.9854030142454365e-06,
"loss": 0.6116536855697632,
"step": 91,
"token_acc": 0.8094465472802599
},
{
"epoch": 0.2953451043338684,
"grad_norm": 0.2250303266125255,
"learning_rate": 6.98473246340696e-06,
"loss": 0.599365234375,
"step": 92,
"token_acc": 0.811629323867537
},
{
"epoch": 0.2985553772070626,
"grad_norm": 0.2085257553005371,
"learning_rate": 6.98404688837477e-06,
"loss": 0.579345703125,
"step": 93,
"token_acc": 0.8167167856323254
},
{
"epoch": 0.3017656500802568,
"grad_norm": 0.2245820178674568,
"learning_rate": 6.983346292104677e-06,
"loss": 0.6009928584098816,
"step": 94,
"token_acc": 0.8108060415463085
},
{
"epoch": 0.30497592295345105,
"grad_norm": 0.23427956495661934,
"learning_rate": 6.982630677617255e-06,
"loss": 0.6172689199447632,
"step": 95,
"token_acc": 0.8067812881737619
},
{
"epoch": 0.30818619582664525,
"grad_norm": 0.2338919293838274,
"learning_rate": 6.98190004799783e-06,
"loss": 0.5944417715072632,
"step": 96,
"token_acc": 0.8127140785908364
},
{
"epoch": 0.3113964686998395,
"grad_norm": 0.24889797189768018,
"learning_rate": 6.981154406396462e-06,
"loss": 0.5865072011947632,
"step": 97,
"token_acc": 0.8161886338728965
},
{
"epoch": 0.3146067415730337,
"grad_norm": 0.20962406790387186,
"learning_rate": 6.980393756027937e-06,
"loss": 0.5,
"step": 98,
"token_acc": 0.8442178208592472
},
{
"epoch": 0.31781701444622795,
"grad_norm": 0.2289159905663413,
"learning_rate": 6.979618100171748e-06,
"loss": 0.5677490234375,
"step": 99,
"token_acc": 0.8194265030485797
},
{
"epoch": 0.32102728731942215,
"grad_norm": 0.24122198762820238,
"learning_rate": 6.978827442172083e-06,
"loss": 0.5986735224723816,
"step": 100,
"token_acc": 0.8116517932311837
},
{
"epoch": 0.32423756019261635,
"grad_norm": 0.22373246065875144,
"learning_rate": 6.978021785437813e-06,
"loss": 0.5494791865348816,
"step": 101,
"token_acc": 0.8246630012780631
},
{
"epoch": 0.3274478330658106,
"grad_norm": 0.23636873692563587,
"learning_rate": 6.9772011334424736e-06,
"loss": 0.633056640625,
"step": 102,
"token_acc": 0.8022693088313757
},
{
"epoch": 0.3306581059390048,
"grad_norm": 0.22038734453112976,
"learning_rate": 6.976365489724251e-06,
"loss": 0.5868327021598816,
"step": 103,
"token_acc": 0.8138595932069553
},
{
"epoch": 0.33386837881219905,
"grad_norm": 0.22408692701933014,
"learning_rate": 6.975514857885968e-06,
"loss": 0.5525309443473816,
"step": 104,
"token_acc": 0.8250399138957633
},
{
"epoch": 0.33707865168539325,
"grad_norm": 0.23880530373560283,
"learning_rate": 6.974649241595068e-06,
"loss": 0.5885009765625,
"step": 105,
"token_acc": 0.8135449856905108
},
{
"epoch": 0.3402889245585875,
"grad_norm": 0.2219089889348867,
"learning_rate": 6.973768644583598e-06,
"loss": 0.5853678584098816,
"step": 106,
"token_acc": 0.8156692860923206
},
{
"epoch": 0.3434991974317817,
"grad_norm": 0.24434844798833436,
"learning_rate": 6.972873070648195e-06,
"loss": 0.6019694209098816,
"step": 107,
"token_acc": 0.8092352417680213
},
{
"epoch": 0.3467094703049759,
"grad_norm": 0.24131505642984427,
"learning_rate": 6.971962523650066e-06,
"loss": 0.5548909902572632,
"step": 108,
"token_acc": 0.8241489043461055
},
{
"epoch": 0.34991974317817015,
"grad_norm": 0.21370848840195006,
"learning_rate": 6.971037007514973e-06,
"loss": 0.5423991084098816,
"step": 109,
"token_acc": 0.8294367863882048
},
{
"epoch": 0.35313001605136435,
"grad_norm": 0.21529534769229164,
"learning_rate": 6.970096526233219e-06,
"loss": 0.5777181386947632,
"step": 110,
"token_acc": 0.8161277117442256
},
{
"epoch": 0.3563402889245586,
"grad_norm": 0.2126697222990689,
"learning_rate": 6.9691410838596274e-06,
"loss": 0.539794921875,
"step": 111,
"token_acc": 0.8310754970924679
},
{
"epoch": 0.3595505617977528,
"grad_norm": 0.2099849961286063,
"learning_rate": 6.9681706845135235e-06,
"loss": 0.54931640625,
"step": 112,
"token_acc": 0.8269279755101857
},
{
"epoch": 0.36276083467094705,
"grad_norm": 0.19964465289795394,
"learning_rate": 6.96718533237872e-06,
"loss": 0.5533854365348816,
"step": 113,
"token_acc": 0.8252085016943435
},
{
"epoch": 0.36597110754414125,
"grad_norm": 0.22495124874675887,
"learning_rate": 6.9661850317035e-06,
"loss": 0.5847982168197632,
"step": 114,
"token_acc": 0.8134031162952646
},
{
"epoch": 0.36918138041733545,
"grad_norm": 0.21442270445925285,
"learning_rate": 6.96516978680059e-06,
"loss": 0.567138671875,
"step": 115,
"token_acc": 0.819186331796326
},
{
"epoch": 0.3723916532905297,
"grad_norm": 0.23733459205129076,
"learning_rate": 6.964139602047153e-06,
"loss": 0.552734375,
"step": 116,
"token_acc": 0.8262689282403223
},
{
"epoch": 0.3756019261637239,
"grad_norm": 0.22871584287933092,
"learning_rate": 6.963094481884764e-06,
"loss": 0.5907389521598816,
"step": 117,
"token_acc": 0.8136596935605276
},
{
"epoch": 0.37881219903691815,
"grad_norm": 0.27109815884075683,
"learning_rate": 6.962034430819388e-06,
"loss": 0.603759765625,
"step": 118,
"token_acc": 0.807743927578312
},
{
"epoch": 0.38202247191011235,
"grad_norm": 0.22684991921359934,
"learning_rate": 6.960959453421364e-06,
"loss": 0.5983480215072632,
"step": 119,
"token_acc": 0.8135154163800631
},
{
"epoch": 0.3852327447833066,
"grad_norm": 0.26065427109275385,
"learning_rate": 6.959869554325387e-06,
"loss": 0.6192220449447632,
"step": 120,
"token_acc": 0.8068024339349146
},
{
"epoch": 0.3884430176565008,
"grad_norm": 0.23060650144915434,
"learning_rate": 6.958764738230486e-06,
"loss": 0.59130859375,
"step": 121,
"token_acc": 0.8157852810433464
},
{
"epoch": 0.391653290529695,
"grad_norm": 0.23337115115264662,
"learning_rate": 6.957645009900002e-06,
"loss": 0.5838623046875,
"step": 122,
"token_acc": 0.8170180478634359
},
{
"epoch": 0.39486356340288925,
"grad_norm": 0.22749336370908144,
"learning_rate": 6.95651037416157e-06,
"loss": 0.542236328125,
"step": 123,
"token_acc": 0.8281357519840995
},
{
"epoch": 0.39807383627608345,
"grad_norm": 0.26777825600197724,
"learning_rate": 6.955360835907094e-06,
"loss": 0.5811361074447632,
"step": 124,
"token_acc": 0.8172125757480218
},
{
"epoch": 0.4012841091492777,
"grad_norm": 0.21125480157250714,
"learning_rate": 6.9541964000927365e-06,
"loss": 0.5238444209098816,
"step": 125,
"token_acc": 0.8307404356732712
},
{
"epoch": 0.4044943820224719,
"grad_norm": 0.28968235140244575,
"learning_rate": 6.953017071738884e-06,
"loss": 0.5297445058822632,
"step": 126,
"token_acc": 0.8288457580216044
},
{
"epoch": 0.40770465489566615,
"grad_norm": 0.22532405020503135,
"learning_rate": 6.951822855930132e-06,
"loss": 0.6102702021598816,
"step": 127,
"token_acc": 0.8099563264673997
},
{
"epoch": 0.41091492776886035,
"grad_norm": 0.24207770615521643,
"learning_rate": 6.950613757815262e-06,
"loss": 0.5835775136947632,
"step": 128,
"token_acc": 0.8142721823466087
},
{
"epoch": 0.41412520064205455,
"grad_norm": 0.2818806687612919,
"learning_rate": 6.949389782607224e-06,
"loss": 0.5836181640625,
"step": 129,
"token_acc": 0.8151410447642872
},
{
"epoch": 0.4173354735152488,
"grad_norm": 0.22555436718218647,
"learning_rate": 6.948150935583104e-06,
"loss": 0.5754801630973816,
"step": 130,
"token_acc": 0.8165735381099055
},
{
"epoch": 0.420545746388443,
"grad_norm": 0.22108665350881793,
"learning_rate": 6.946897222084108e-06,
"loss": 0.495849609375,
"step": 131,
"token_acc": 0.8415052834998561
},
{
"epoch": 0.42375601926163725,
"grad_norm": 0.24997735771174617,
"learning_rate": 6.945628647515542e-06,
"loss": 0.5591227412223816,
"step": 132,
"token_acc": 0.8239088365246904
},
{
"epoch": 0.42696629213483145,
"grad_norm": 0.24031477779236896,
"learning_rate": 6.944345217346779e-06,
"loss": 0.5630697011947632,
"step": 133,
"token_acc": 0.8211002632665639
},
{
"epoch": 0.4301765650080257,
"grad_norm": 0.22334587736808414,
"learning_rate": 6.943046937111243e-06,
"loss": 0.5450846552848816,
"step": 134,
"token_acc": 0.8270578092652001
},
{
"epoch": 0.4333868378812199,
"grad_norm": 0.2535913704396145,
"learning_rate": 6.9417338124063855e-06,
"loss": 0.5626220703125,
"step": 135,
"token_acc": 0.8230574363105163
},
{
"epoch": 0.43659711075441415,
"grad_norm": 0.2360783896662543,
"learning_rate": 6.940405848893657e-06,
"loss": 0.5987955927848816,
"step": 136,
"token_acc": 0.8105675443353312
},
{
"epoch": 0.43980738362760835,
"grad_norm": 0.227552351357898,
"learning_rate": 6.939063052298481e-06,
"loss": 0.5629069209098816,
"step": 137,
"token_acc": 0.8191981932274853
},
{
"epoch": 0.44301765650080255,
"grad_norm": 0.23651265721469195,
"learning_rate": 6.9377054284102395e-06,
"loss": 0.572021484375,
"step": 138,
"token_acc": 0.8184342833848558
},
{
"epoch": 0.4462279293739968,
"grad_norm": 0.23035871036509648,
"learning_rate": 6.936332983082238e-06,
"loss": 0.5441080927848816,
"step": 139,
"token_acc": 0.8275835179057841
},
{
"epoch": 0.449438202247191,
"grad_norm": 0.2241414851384322,
"learning_rate": 6.934945722231681e-06,
"loss": 0.5607503652572632,
"step": 140,
"token_acc": 0.8231049158892737
},
{
"epoch": 0.45264847512038525,
"grad_norm": 0.20522800991476106,
"learning_rate": 6.933543651839656e-06,
"loss": 0.4959309995174408,
"step": 141,
"token_acc": 0.8439340400471143
},
{
"epoch": 0.45585874799357945,
"grad_norm": 0.28087736110337097,
"learning_rate": 6.932126777951095e-06,
"loss": 0.5668131709098816,
"step": 142,
"token_acc": 0.8209694360085737
},
{
"epoch": 0.4590690208667737,
"grad_norm": 0.24579741713184028,
"learning_rate": 6.930695106674754e-06,
"loss": 0.6044921875,
"step": 143,
"token_acc": 0.8082383181728449
},
{
"epoch": 0.4622792937399679,
"grad_norm": 0.2186687760996141,
"learning_rate": 6.929248644183193e-06,
"loss": 0.5465494990348816,
"step": 144,
"token_acc": 0.8264947550565516
},
{
"epoch": 0.4654895666131621,
"grad_norm": 0.21880386821151576,
"learning_rate": 6.927787396712737e-06,
"loss": 0.556884765625,
"step": 145,
"token_acc": 0.8225188628651489
},
{
"epoch": 0.46869983948635635,
"grad_norm": 0.2181719869340069,
"learning_rate": 6.926311370563459e-06,
"loss": 0.5694987177848816,
"step": 146,
"token_acc": 0.8188421765901659
},
{
"epoch": 0.47191011235955055,
"grad_norm": 0.24663948215716094,
"learning_rate": 6.924820572099146e-06,
"loss": 0.5752360224723816,
"step": 147,
"token_acc": 0.8163516597284418
},
{
"epoch": 0.4751203852327448,
"grad_norm": 0.21405980314840437,
"learning_rate": 6.92331500774728e-06,
"loss": 0.5713704824447632,
"step": 148,
"token_acc": 0.8194312733495944
},
{
"epoch": 0.478330658105939,
"grad_norm": 0.2288108170237036,
"learning_rate": 6.921794683999001e-06,
"loss": 0.553466796875,
"step": 149,
"token_acc": 0.8241651875080771
},
{
"epoch": 0.48154093097913325,
"grad_norm": 0.22572327698038352,
"learning_rate": 6.920259607409083e-06,
"loss": 0.5987142324447632,
"step": 150,
"token_acc": 0.8091493502900298
},
{
"epoch": 0.48475120385232745,
"grad_norm": 0.23659179885041934,
"learning_rate": 6.918709784595909e-06,
"loss": 0.5702311396598816,
"step": 151,
"token_acc": 0.8184588832975306
},
{
"epoch": 0.48796147672552165,
"grad_norm": 0.2172415460157859,
"learning_rate": 6.917145222241438e-06,
"loss": 0.55322265625,
"step": 152,
"token_acc": 0.8255652742967593
},
{
"epoch": 0.4911717495987159,
"grad_norm": 0.2415770177141649,
"learning_rate": 6.915565927091175e-06,
"loss": 0.569091796875,
"step": 153,
"token_acc": 0.8180283259225865
},
{
"epoch": 0.4943820224719101,
"grad_norm": 0.21994415876302964,
"learning_rate": 6.913971905954148e-06,
"loss": 0.5682780146598816,
"step": 154,
"token_acc": 0.8188937695372782
},
{
"epoch": 0.49759229534510435,
"grad_norm": 0.22183078942159312,
"learning_rate": 6.912363165702875e-06,
"loss": 0.5708822011947632,
"step": 155,
"token_acc": 0.820008484414334
},
{
"epoch": 0.5008025682182986,
"grad_norm": 0.2301899962788731,
"learning_rate": 6.910739713273332e-06,
"loss": 0.5730794668197632,
"step": 156,
"token_acc": 0.8183678267873312
},
{
"epoch": 0.5040128410914928,
"grad_norm": 0.24344159329348375,
"learning_rate": 6.909101555664925e-06,
"loss": 0.52001953125,
"step": 157,
"token_acc": 0.8352379923752359
},
{
"epoch": 0.507223113964687,
"grad_norm": 0.24562350866351212,
"learning_rate": 6.907448699940466e-06,
"loss": 0.593017578125,
"step": 158,
"token_acc": 0.8103424645897165
},
{
"epoch": 0.5104333868378812,
"grad_norm": 0.23059316629978308,
"learning_rate": 6.90578115322613e-06,
"loss": 0.5559896230697632,
"step": 159,
"token_acc": 0.8221401735275833
},
{
"epoch": 0.5136436597110754,
"grad_norm": 0.19449172151921634,
"learning_rate": 6.904098922711437e-06,
"loss": 0.5581868886947632,
"step": 160,
"token_acc": 0.821773430889453
},
{
"epoch": 0.5168539325842697,
"grad_norm": 0.2583978315849545,
"learning_rate": 6.902402015649211e-06,
"loss": 0.595703125,
"step": 161,
"token_acc": 0.8105830700080301
},
{
"epoch": 0.5200642054574639,
"grad_norm": 0.22692930990663654,
"learning_rate": 6.900690439355556e-06,
"loss": 0.5867513418197632,
"step": 162,
"token_acc": 0.8146894100986661
},
{
"epoch": 0.5232744783306581,
"grad_norm": 0.28318555908117266,
"learning_rate": 6.898964201209819e-06,
"loss": 0.60546875,
"step": 163,
"token_acc": 0.8075459800056409
},
{
"epoch": 0.5264847512038523,
"grad_norm": 0.24970083895982967,
"learning_rate": 6.897223308654561e-06,
"loss": 0.5707194209098816,
"step": 164,
"token_acc": 0.8176165295145137
},
{
"epoch": 0.5296950240770465,
"grad_norm": 0.25340900406989025,
"learning_rate": 6.895467769195527e-06,
"loss": 0.548583984375,
"step": 165,
"token_acc": 0.825177801435191
},
{
"epoch": 0.5329052969502408,
"grad_norm": 0.32645258457685933,
"learning_rate": 6.8936975904016085e-06,
"loss": 0.5934244990348816,
"step": 166,
"token_acc": 0.8111397321354924
},
{
"epoch": 0.536115569823435,
"grad_norm": 0.2407030461325421,
"learning_rate": 6.891912779904814e-06,
"loss": 0.5758463740348816,
"step": 167,
"token_acc": 0.8148208275034046
},
{
"epoch": 0.5393258426966292,
"grad_norm": 0.22576411609848393,
"learning_rate": 6.8901133454002365e-06,
"loss": 0.6165364980697632,
"step": 168,
"token_acc": 0.8023810162399925
},
{
"epoch": 0.5425361155698234,
"grad_norm": 0.25395984417353595,
"learning_rate": 6.888299294646019e-06,
"loss": 0.5718587636947632,
"step": 169,
"token_acc": 0.81836172332396
},
{
"epoch": 0.5457463884430177,
"grad_norm": 0.2224281422448542,
"learning_rate": 6.8864706354633215e-06,
"loss": 0.546630859375,
"step": 170,
"token_acc": 0.8248056872037914
},
{
"epoch": 0.5489566613162119,
"grad_norm": 0.22587769476968814,
"learning_rate": 6.884627375736286e-06,
"loss": 0.5841471552848816,
"step": 171,
"token_acc": 0.8147992586599647
},
{
"epoch": 0.5521669341894061,
"grad_norm": 0.2170054892285852,
"learning_rate": 6.882769523412006e-06,
"loss": 0.5365804433822632,
"step": 172,
"token_acc": 0.828471935090668
},
{
"epoch": 0.5553772070626003,
"grad_norm": 0.23690655653269174,
"learning_rate": 6.88089708650049e-06,
"loss": 0.5306803584098816,
"step": 173,
"token_acc": 0.8298377031583641
},
{
"epoch": 0.5585874799357945,
"grad_norm": 0.2314215800076655,
"learning_rate": 6.879010073074624e-06,
"loss": 0.6065267324447632,
"step": 174,
"token_acc": 0.8071848070756561
},
{
"epoch": 0.5617977528089888,
"grad_norm": 0.22361031206178827,
"learning_rate": 6.8771084912701436e-06,
"loss": 0.5253092646598816,
"step": 175,
"token_acc": 0.8319281568387744
},
{
"epoch": 0.565008025682183,
"grad_norm": 0.22176935468268558,
"learning_rate": 6.8751923492855915e-06,
"loss": 0.5660807490348816,
"step": 176,
"token_acc": 0.8193021750652316
},
{
"epoch": 0.5682182985553772,
"grad_norm": 0.24144561116957808,
"learning_rate": 6.873261655382288e-06,
"loss": 0.5496826171875,
"step": 177,
"token_acc": 0.8220418344678725
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.21286758327817357,
"learning_rate": 6.8713164178842926e-06,
"loss": 0.5799967646598816,
"step": 178,
"token_acc": 0.8143100972023347
},
{
"epoch": 0.5746388443017657,
"grad_norm": 0.21782450812586338,
"learning_rate": 6.8693566451783665e-06,
"loss": 0.5998942255973816,
"step": 179,
"token_acc": 0.8102601393648907
},
{
"epoch": 0.5778491171749599,
"grad_norm": 0.2324683480162456,
"learning_rate": 6.867382345713942e-06,
"loss": 0.5740560293197632,
"step": 180,
"token_acc": 0.8173885189328248
},
{
"epoch": 0.5810593900481541,
"grad_norm": 0.20533388731698282,
"learning_rate": 6.86539352800308e-06,
"loss": 0.4949137568473816,
"step": 181,
"token_acc": 0.8424438091193526
},
{
"epoch": 0.5842696629213483,
"grad_norm": 0.20592976059215887,
"learning_rate": 6.8633902006204375e-06,
"loss": 0.5462239980697632,
"step": 182,
"token_acc": 0.8261973157867761
},
{
"epoch": 0.5874799357945425,
"grad_norm": 0.2113373512227851,
"learning_rate": 6.861372372203227e-06,
"loss": 0.5481771230697632,
"step": 183,
"token_acc": 0.8251484703826323
},
{
"epoch": 0.5906902086677368,
"grad_norm": 0.1933295692622174,
"learning_rate": 6.859340051451183e-06,
"loss": 0.5323079824447632,
"step": 184,
"token_acc": 0.8299652429059305
},
{
"epoch": 0.593900481540931,
"grad_norm": 0.23548376959220121,
"learning_rate": 6.857293247126522e-06,
"loss": 0.5689290761947632,
"step": 185,
"token_acc": 0.8176760652421292
},
{
"epoch": 0.5971107544141252,
"grad_norm": 0.20081126428895105,
"learning_rate": 6.855231968053906e-06,
"loss": 0.5013834834098816,
"step": 186,
"token_acc": 0.8375914147549965
},
{
"epoch": 0.6003210272873194,
"grad_norm": 0.20025396799297465,
"learning_rate": 6.8531562231204015e-06,
"loss": 0.511474609375,
"step": 187,
"token_acc": 0.8372179968386327
},
{
"epoch": 0.6035313001605136,
"grad_norm": 0.27351283743607113,
"learning_rate": 6.851066021275448e-06,
"loss": 0.5250651240348816,
"step": 188,
"token_acc": 0.8299763504644978
},
{
"epoch": 0.6067415730337079,
"grad_norm": 0.2426189973247391,
"learning_rate": 6.8489613715308116e-06,
"loss": 0.5476888418197632,
"step": 189,
"token_acc": 0.825513469139694
},
{
"epoch": 0.6099518459069021,
"grad_norm": 0.2400231078753033,
"learning_rate": 6.846842282960551e-06,
"loss": 0.5440673828125,
"step": 190,
"token_acc": 0.8253329204087952
},
{
"epoch": 0.6131621187800963,
"grad_norm": 0.25681990063139026,
"learning_rate": 6.8447087647009756e-06,
"loss": 0.5882975459098816,
"step": 191,
"token_acc": 0.8140240697564722
},
{
"epoch": 0.6163723916532905,
"grad_norm": 0.21533682569585577,
"learning_rate": 6.842560825950609e-06,
"loss": 0.5162353515625,
"step": 192,
"token_acc": 0.8349828905724699
},
{
"epoch": 0.6195826645264848,
"grad_norm": 0.224070847170404,
"learning_rate": 6.840398475970147e-06,
"loss": 0.5556640625,
"step": 193,
"token_acc": 0.8219916672593991
},
{
"epoch": 0.622792937399679,
"grad_norm": 0.20086320548268272,
"learning_rate": 6.838221724082419e-06,
"loss": 0.55078125,
"step": 194,
"token_acc": 0.8239912087816152
},
{
"epoch": 0.6260032102728732,
"grad_norm": 0.22337205031208743,
"learning_rate": 6.836030579672347e-06,
"loss": 0.5475260615348816,
"step": 195,
"token_acc": 0.8235302077295884
},
{
"epoch": 0.6292134831460674,
"grad_norm": 0.22668574083983906,
"learning_rate": 6.833825052186905e-06,
"loss": 0.5760091543197632,
"step": 196,
"token_acc": 0.8175243891128943
},
{
"epoch": 0.6324237560192616,
"grad_norm": 0.19877544092977742,
"learning_rate": 6.8316051511350786e-06,
"loss": 0.536376953125,
"step": 197,
"token_acc": 0.8281901004896317
},
{
"epoch": 0.6356340288924559,
"grad_norm": 0.20395951641675453,
"learning_rate": 6.8293708860878245e-06,
"loss": 0.5526530146598816,
"step": 198,
"token_acc": 0.8250786404619416
},
{
"epoch": 0.6388443017656501,
"grad_norm": 0.21289607589577456,
"learning_rate": 6.82712226667803e-06,
"loss": 0.548095703125,
"step": 199,
"token_acc": 0.8248148446845975
},
{
"epoch": 0.6420545746388443,
"grad_norm": 0.24152612355007852,
"learning_rate": 6.824859302600468e-06,
"loss": 0.5478109121322632,
"step": 200,
"token_acc": 0.8234885791051114
},
{
"epoch": 0.6452648475120385,
"grad_norm": 0.22741295294045016,
"learning_rate": 6.822582003611759e-06,
"loss": 0.54248046875,
"step": 201,
"token_acc": 0.8279165289983051
},
{
"epoch": 0.6484751203852327,
"grad_norm": 0.21542183542521456,
"learning_rate": 6.820290379530326e-06,
"loss": 0.5347900390625,
"step": 202,
"token_acc": 0.8269245689989603
},
{
"epoch": 0.651685393258427,
"grad_norm": 0.21563815091926725,
"learning_rate": 6.817984440236357e-06,
"loss": 0.5167643427848816,
"step": 203,
"token_acc": 0.8355193840088763
},
{
"epoch": 0.6548956661316212,
"grad_norm": 0.21492257068771026,
"learning_rate": 6.8156641956717535e-06,
"loss": 0.5599772334098816,
"step": 204,
"token_acc": 0.8229220720036458
},
{
"epoch": 0.6581059390048154,
"grad_norm": 0.21079929662710306,
"learning_rate": 6.8133296558401e-06,
"loss": 0.5957845449447632,
"step": 205,
"token_acc": 0.8099493419818274
},
{
"epoch": 0.6613162118780096,
"grad_norm": 0.28263826007405335,
"learning_rate": 6.81098083080661e-06,
"loss": 0.5642904043197632,
"step": 206,
"token_acc": 0.8202062771524219
},
{
"epoch": 0.6645264847512039,
"grad_norm": 0.23663061523679904,
"learning_rate": 6.808617730698085e-06,
"loss": 0.5949300527572632,
"step": 207,
"token_acc": 0.8093328906747198
},
{
"epoch": 0.6677367576243981,
"grad_norm": 0.21481492269788427,
"learning_rate": 6.806240365702877e-06,
"loss": 0.5576986074447632,
"step": 208,
"token_acc": 0.8225621783463226
},
{
"epoch": 0.6709470304975923,
"grad_norm": 0.2596134684640292,
"learning_rate": 6.803848746070839e-06,
"loss": 0.5392252802848816,
"step": 209,
"token_acc": 0.8259330772465088
},
{
"epoch": 0.6741573033707865,
"grad_norm": 0.24809961814052503,
"learning_rate": 6.801442882113278e-06,
"loss": 0.52099609375,
"step": 210,
"token_acc": 0.8323869477105627
},
{
"epoch": 0.6773675762439807,
"grad_norm": 0.2443118739139523,
"learning_rate": 6.79902278420292e-06,
"loss": 0.5388997793197632,
"step": 211,
"token_acc": 0.8263394986714556
},
{
"epoch": 0.680577849117175,
"grad_norm": 0.21098866976666678,
"learning_rate": 6.796588462773857e-06,
"loss": 0.5052286982536316,
"step": 212,
"token_acc": 0.8393440527484701
},
{
"epoch": 0.6837881219903692,
"grad_norm": 0.24102593312514728,
"learning_rate": 6.794139928321504e-06,
"loss": 0.552490234375,
"step": 213,
"token_acc": 0.820462355331019
},
{
"epoch": 0.6869983948635634,
"grad_norm": 0.19538302780441605,
"learning_rate": 6.791677191402555e-06,
"loss": 0.5192464590072632,
"step": 214,
"token_acc": 0.8333250621849146
},
{
"epoch": 0.6902086677367576,
"grad_norm": 0.21456432126279212,
"learning_rate": 6.789200262634939e-06,
"loss": 0.5003662109375,
"step": 215,
"token_acc": 0.8399372573500391
},
{
"epoch": 0.6934189406099518,
"grad_norm": 0.2153404027287284,
"learning_rate": 6.7867091526977696e-06,
"loss": 0.5514323115348816,
"step": 216,
"token_acc": 0.8233947895282249
},
{
"epoch": 0.6966292134831461,
"grad_norm": 0.21079426317177405,
"learning_rate": 6.784203872331302e-06,
"loss": 0.5425618886947632,
"step": 217,
"token_acc": 0.8267645216064796
},
{
"epoch": 0.6998394863563403,
"grad_norm": 0.22090843136929758,
"learning_rate": 6.7816844323368905e-06,
"loss": 0.5508829951286316,
"step": 218,
"token_acc": 0.8237542129810146
},
{
"epoch": 0.7030497592295345,
"grad_norm": 0.30110343371175113,
"learning_rate": 6.77915084357693e-06,
"loss": 0.546875,
"step": 219,
"token_acc": 0.8256972124018396
},
{
"epoch": 0.7062600321027287,
"grad_norm": 0.2404307206347699,
"learning_rate": 6.776603116974823e-06,
"loss": 0.599609375,
"step": 220,
"token_acc": 0.8071373867727709
},
{
"epoch": 0.709470304975923,
"grad_norm": 0.2704381577683076,
"learning_rate": 6.7740412635149225e-06,
"loss": 0.57293701171875,
"step": 221,
"token_acc": 0.8164267936370523
},
{
"epoch": 0.7126805778491172,
"grad_norm": 0.21919175912057806,
"learning_rate": 6.771465294242493e-06,
"loss": 0.5669759511947632,
"step": 222,
"token_acc": 0.8191622319226122
},
{
"epoch": 0.7158908507223114,
"grad_norm": 0.21425950203620117,
"learning_rate": 6.768875220263655e-06,
"loss": 0.559814453125,
"step": 223,
"token_acc": 0.8224226963471117
},
{
"epoch": 0.7191011235955056,
"grad_norm": 0.21223607679357182,
"learning_rate": 6.76627105274534e-06,
"loss": 0.5444743037223816,
"step": 224,
"token_acc": 0.8234702086507026
},
{
"epoch": 0.7223113964686998,
"grad_norm": 0.20942358002404907,
"learning_rate": 6.763652802915245e-06,
"loss": 0.5237223505973816,
"step": 225,
"token_acc": 0.8334582114515168
},
{
"epoch": 0.7255216693418941,
"grad_norm": 0.24723183006765018,
"learning_rate": 6.761020482061782e-06,
"loss": 0.56414794921875,
"step": 226,
"token_acc": 0.8203677260789879
},
{
"epoch": 0.7287319422150883,
"grad_norm": 0.25030847996600203,
"learning_rate": 6.758374101534027e-06,
"loss": 0.5577799677848816,
"step": 227,
"token_acc": 0.8203356490268309
},
{
"epoch": 0.7319422150882825,
"grad_norm": 0.3205086458960867,
"learning_rate": 6.755713672741676e-06,
"loss": 0.5999755859375,
"step": 228,
"token_acc": 0.8054511124060968
},
{
"epoch": 0.7351524879614767,
"grad_norm": 0.24550784670563833,
"learning_rate": 6.753039207154989e-06,
"loss": 0.57373046875,
"step": 229,
"token_acc": 0.8176859273836078
},
{
"epoch": 0.7383627608346709,
"grad_norm": 0.19001728402352852,
"learning_rate": 6.750350716304752e-06,
"loss": 0.5404459834098816,
"step": 230,
"token_acc": 0.8257092883215871
},
{
"epoch": 0.7415730337078652,
"grad_norm": 0.20564095580716163,
"learning_rate": 6.747648211782212e-06,
"loss": 0.5472005605697632,
"step": 231,
"token_acc": 0.8256463868333908
},
{
"epoch": 0.7447833065810594,
"grad_norm": 0.25323339440670445,
"learning_rate": 6.74493170523904e-06,
"loss": 0.5516764521598816,
"step": 232,
"token_acc": 0.8231926897249683
},
{
"epoch": 0.7479935794542536,
"grad_norm": 0.23805514044333884,
"learning_rate": 6.742201208387276e-06,
"loss": 0.52392578125,
"step": 233,
"token_acc": 0.8286431336834839
},
{
"epoch": 0.7512038523274478,
"grad_norm": 0.20890219888317152,
"learning_rate": 6.739456732999274e-06,
"loss": 0.5518392324447632,
"step": 234,
"token_acc": 0.8245726725848885
},
{
"epoch": 0.7544141252006421,
"grad_norm": 0.20338522734836773,
"learning_rate": 6.73669829090766e-06,
"loss": 0.53076171875,
"step": 235,
"token_acc": 0.8298385429058719
},
{
"epoch": 0.7576243980738363,
"grad_norm": 0.2643362744904908,
"learning_rate": 6.733925894005273e-06,
"loss": 0.5685221552848816,
"step": 236,
"token_acc": 0.8183341855576388
},
{
"epoch": 0.7608346709470305,
"grad_norm": 0.2287330654705364,
"learning_rate": 6.731139554245122e-06,
"loss": 0.5322672724723816,
"step": 237,
"token_acc": 0.8259961312934699
},
{
"epoch": 0.7640449438202247,
"grad_norm": 0.2275245612718834,
"learning_rate": 6.728339283640325e-06,
"loss": 0.4906412959098816,
"step": 238,
"token_acc": 0.8402331584922113
},
{
"epoch": 0.7672552166934189,
"grad_norm": 0.24729750718952795,
"learning_rate": 6.7255250942640625e-06,
"loss": 0.5785726308822632,
"step": 239,
"token_acc": 0.8139347537406585
},
{
"epoch": 0.7704654895666132,
"grad_norm": 0.22470429416037957,
"learning_rate": 6.722696998249527e-06,
"loss": 0.5373942255973816,
"step": 240,
"token_acc": 0.8278915767997669
},
{
"epoch": 0.7736757624398074,
"grad_norm": 0.21124522226813244,
"learning_rate": 6.719855007789868e-06,
"loss": 0.5575765371322632,
"step": 241,
"token_acc": 0.8219577750122666
},
{
"epoch": 0.7768860353130016,
"grad_norm": 0.22522506304223616,
"learning_rate": 6.71699913513814e-06,
"loss": 0.5452474355697632,
"step": 242,
"token_acc": 0.8278975153651517
},
{
"epoch": 0.7800963081861958,
"grad_norm": 0.21054417790079533,
"learning_rate": 6.714129392607248e-06,
"loss": 0.5528157949447632,
"step": 243,
"token_acc": 0.8216659366800207
},
{
"epoch": 0.78330658105939,
"grad_norm": 0.2168513347663671,
"learning_rate": 6.7112457925698985e-06,
"loss": 0.5669759511947632,
"step": 244,
"token_acc": 0.8197795876351085
},
{
"epoch": 0.7865168539325843,
"grad_norm": 0.271555617520745,
"learning_rate": 6.7083483474585395e-06,
"loss": 0.5638834834098816,
"step": 245,
"token_acc": 0.8143064734116244
},
{
"epoch": 0.7897271268057785,
"grad_norm": 0.21704184254450343,
"learning_rate": 6.705437069765319e-06,
"loss": 0.56658935546875,
"step": 246,
"token_acc": 0.817545748116254
},
{
"epoch": 0.7929373996789727,
"grad_norm": 0.22862825396828354,
"learning_rate": 6.702511972042014e-06,
"loss": 0.5729166865348816,
"step": 247,
"token_acc": 0.8182453879142039
},
{
"epoch": 0.7961476725521669,
"grad_norm": 0.2265701155951088,
"learning_rate": 6.6995730668999925e-06,
"loss": 0.5604655146598816,
"step": 248,
"token_acc": 0.8183653526421868
},
{
"epoch": 0.7993579454253612,
"grad_norm": 0.23197032599583667,
"learning_rate": 6.696620367010148e-06,
"loss": 0.5447591543197632,
"step": 249,
"token_acc": 0.8233240936035472
},
{
"epoch": 0.8025682182985554,
"grad_norm": 0.23312218134153492,
"learning_rate": 6.693653885102853e-06,
"loss": 0.5588786005973816,
"step": 250,
"token_acc": 0.819455498267914
},
{
"epoch": 0.8057784911717496,
"grad_norm": 0.2151929612075134,
"learning_rate": 6.690673633967896e-06,
"loss": 0.5681559443473816,
"step": 251,
"token_acc": 0.8187757678531563
},
{
"epoch": 0.8089887640449438,
"grad_norm": 0.21115778956555162,
"learning_rate": 6.687679626454435e-06,
"loss": 0.5596517324447632,
"step": 252,
"token_acc": 0.8215622369212267
},
{
"epoch": 0.812199036918138,
"grad_norm": 0.2174992394129607,
"learning_rate": 6.684671875470934e-06,
"loss": 0.5286458730697632,
"step": 253,
"token_acc": 0.8295877099816256
},
{
"epoch": 0.8154093097913323,
"grad_norm": 0.2983534969364284,
"learning_rate": 6.6816503939851136e-06,
"loss": 0.5350748896598816,
"step": 254,
"token_acc": 0.825895675553894
},
{
"epoch": 0.8186195826645265,
"grad_norm": 0.2085815626895177,
"learning_rate": 6.678615195023891e-06,
"loss": 0.5171305537223816,
"step": 255,
"token_acc": 0.8348302581547042
},
{
"epoch": 0.8218298555377207,
"grad_norm": 0.21559592718354698,
"learning_rate": 6.675566291673325e-06,
"loss": 0.5474853515625,
"step": 256,
"token_acc": 0.824765407128191
},
{
"epoch": 0.8250401284109149,
"grad_norm": 0.24128741129795617,
"learning_rate": 6.672503697078562e-06,
"loss": 0.5852864980697632,
"step": 257,
"token_acc": 0.8155020986063753
},
{
"epoch": 0.8282504012841091,
"grad_norm": 0.2293702550872797,
"learning_rate": 6.669427424443776e-06,
"loss": 0.5548502802848816,
"step": 258,
"token_acc": 0.8226406443632063
},
{
"epoch": 0.8314606741573034,
"grad_norm": 0.25674300678141526,
"learning_rate": 6.666337487032113e-06,
"loss": 0.5816243886947632,
"step": 259,
"token_acc": 0.8115385153930885
},
{
"epoch": 0.8346709470304976,
"grad_norm": 0.21850937742829685,
"learning_rate": 6.663233898165635e-06,
"loss": 0.5177409052848816,
"step": 260,
"token_acc": 0.8329880272119501
},
{
"epoch": 0.8378812199036918,
"grad_norm": 0.3843277677133425,
"learning_rate": 6.660116671225258e-06,
"loss": 0.5751953125,
"step": 261,
"token_acc": 0.8143645310289503
},
{
"epoch": 0.841091492776886,
"grad_norm": 0.21134853251628777,
"learning_rate": 6.656985819650703e-06,
"loss": 0.45391845703125,
"step": 262,
"token_acc": 0.8531797499880263
},
{
"epoch": 0.8443017656500803,
"grad_norm": 0.25633734763605176,
"learning_rate": 6.653841356940426e-06,
"loss": 0.5741373896598816,
"step": 263,
"token_acc": 0.8163117184245008
},
{
"epoch": 0.8475120385232745,
"grad_norm": 0.20414507237731114,
"learning_rate": 6.650683296651573e-06,
"loss": 0.5130208730697632,
"step": 264,
"token_acc": 0.8335838411053109
},
{
"epoch": 0.8507223113964687,
"grad_norm": 0.23368657687253075,
"learning_rate": 6.647511652399912e-06,
"loss": 0.576904296875,
"step": 265,
"token_acc": 0.8158995625019959
},
{
"epoch": 0.8539325842696629,
"grad_norm": 0.2823726018701455,
"learning_rate": 6.6443264378597775e-06,
"loss": 0.5238851308822632,
"step": 266,
"token_acc": 0.8303428680614855
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.2660958114189501,
"learning_rate": 6.641127666764009e-06,
"loss": 0.5625,
"step": 267,
"token_acc": 0.8188886633069231
},
{
"epoch": 0.8603531300160514,
"grad_norm": 0.23790729136692979,
"learning_rate": 6.6379153529038996e-06,
"loss": 0.5369466543197632,
"step": 268,
"token_acc": 0.8278342392587025
},
{
"epoch": 0.8635634028892456,
"grad_norm": 0.22080856072472654,
"learning_rate": 6.634689510129127e-06,
"loss": 0.4977213740348816,
"step": 269,
"token_acc": 0.8393823857083582
},
{
"epoch": 0.8667736757624398,
"grad_norm": 0.2412716818325412,
"learning_rate": 6.6314501523477e-06,
"loss": 0.5535482168197632,
"step": 270,
"token_acc": 0.8220067709239879
},
{
"epoch": 0.869983948635634,
"grad_norm": 0.2396795881940935,
"learning_rate": 6.628197293525894e-06,
"loss": 0.56494140625,
"step": 271,
"token_acc": 0.8181763386789053
},
{
"epoch": 0.8731942215088283,
"grad_norm": 0.26498431498454633,
"learning_rate": 6.624930947688197e-06,
"loss": 0.5183919668197632,
"step": 272,
"token_acc": 0.8312001392731633
},
{
"epoch": 0.8764044943820225,
"grad_norm": 0.1927504153360831,
"learning_rate": 6.6216511289172395e-06,
"loss": 0.5640462636947632,
"step": 273,
"token_acc": 0.8176485799109028
},
{
"epoch": 0.8796147672552167,
"grad_norm": 0.19474833122907118,
"learning_rate": 6.618357851353749e-06,
"loss": 0.4883219599723816,
"step": 274,
"token_acc": 0.844173214072538
},
{
"epoch": 0.8828250401284109,
"grad_norm": 0.24092291673702348,
"learning_rate": 6.615051129196469e-06,
"loss": 0.5417073965072632,
"step": 275,
"token_acc": 0.8236458084139298
},
{
"epoch": 0.8860353130016051,
"grad_norm": 0.21413177551118223,
"learning_rate": 6.611730976702116e-06,
"loss": 0.4571940302848816,
"step": 276,
"token_acc": 0.8514311032529894
},
{
"epoch": 0.8892455858747994,
"grad_norm": 0.20494712262771816,
"learning_rate": 6.608397408185307e-06,
"loss": 0.5249837636947632,
"step": 277,
"token_acc": 0.8307591432318229
},
{
"epoch": 0.8924558587479936,
"grad_norm": 0.227860155385193,
"learning_rate": 6.605050438018503e-06,
"loss": 0.5576171875,
"step": 278,
"token_acc": 0.8215625938536182
},
{
"epoch": 0.8956661316211878,
"grad_norm": 0.20272894805165137,
"learning_rate": 6.6016900806319444e-06,
"loss": 0.5760091543197632,
"step": 279,
"token_acc": 0.8148033546019078
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.21292167264886414,
"learning_rate": 6.598316350513591e-06,
"loss": 0.5145263671875,
"step": 280,
"token_acc": 0.8337830790752344
},
{
"epoch": 0.9020866773675762,
"grad_norm": 0.2211511021106543,
"learning_rate": 6.594929262209055e-06,
"loss": 0.5369466543197632,
"step": 281,
"token_acc": 0.8276449799280647
},
{
"epoch": 0.9052969502407705,
"grad_norm": 0.2047148842889143,
"learning_rate": 6.591528830321546e-06,
"loss": 0.5138346552848816,
"step": 282,
"token_acc": 0.8341773490893749
},
{
"epoch": 0.9085072231139647,
"grad_norm": 0.24542746740430346,
"learning_rate": 6.5881150695118e-06,
"loss": 0.539306640625,
"step": 283,
"token_acc": 0.8257388871001344
},
{
"epoch": 0.9117174959871589,
"grad_norm": 0.23317430848356677,
"learning_rate": 6.5846879944980224e-06,
"loss": 0.5642904043197632,
"step": 284,
"token_acc": 0.8206747364141221
},
{
"epoch": 0.9149277688603531,
"grad_norm": 0.22878109419510925,
"learning_rate": 6.58124762005582e-06,
"loss": 0.5784912109375,
"step": 285,
"token_acc": 0.8153904333455296
},
{
"epoch": 0.9181380417335474,
"grad_norm": 0.23490487636437477,
"learning_rate": 6.577793961018139e-06,
"loss": 0.56982421875,
"step": 286,
"token_acc": 0.8179441732969294
},
{
"epoch": 0.9213483146067416,
"grad_norm": 0.2213625777811698,
"learning_rate": 6.574327032275203e-06,
"loss": 0.5587565302848816,
"step": 287,
"token_acc": 0.8212527035117336
},
{
"epoch": 0.9245585874799358,
"grad_norm": 0.23982512719414933,
"learning_rate": 6.570846848774445e-06,
"loss": 0.51416015625,
"step": 288,
"token_acc": 0.8342100757147085
},
{
"epoch": 0.92776886035313,
"grad_norm": 0.24863035113160709,
"learning_rate": 6.567353425520448e-06,
"loss": 0.5484212636947632,
"step": 289,
"token_acc": 0.823044297514873
},
{
"epoch": 0.9309791332263242,
"grad_norm": 0.24695733139934517,
"learning_rate": 6.563846777574875e-06,
"loss": 0.5726318359375,
"step": 290,
"token_acc": 0.8166731715915243
},
{
"epoch": 0.9341894060995185,
"grad_norm": 0.22335565967464163,
"learning_rate": 6.5603269200564055e-06,
"loss": 0.5408529043197632,
"step": 291,
"token_acc": 0.8273028089404346
},
{
"epoch": 0.9373996789727127,
"grad_norm": 0.18186030967506026,
"learning_rate": 6.556793868140674e-06,
"loss": 0.4903157651424408,
"step": 292,
"token_acc": 0.8423202288740971
},
{
"epoch": 0.9406099518459069,
"grad_norm": 0.22464903635996314,
"learning_rate": 6.5532476370602e-06,
"loss": 0.5187174677848816,
"step": 293,
"token_acc": 0.8333880337392103
},
{
"epoch": 0.9438202247191011,
"grad_norm": 0.2003623985055459,
"learning_rate": 6.549688242104324e-06,
"loss": 0.4782308042049408,
"step": 294,
"token_acc": 0.8428174821070171
},
{
"epoch": 0.9470304975922953,
"grad_norm": 0.3648452937041888,
"learning_rate": 6.546115698619143e-06,
"loss": 0.5069173574447632,
"step": 295,
"token_acc": 0.8375849390096171
},
{
"epoch": 0.9502407704654896,
"grad_norm": 0.2668698493414911,
"learning_rate": 6.54253002200744e-06,
"loss": 0.5812174677848816,
"step": 296,
"token_acc": 0.8119673669275481
},
{
"epoch": 0.9534510433386838,
"grad_norm": 0.23425175593147715,
"learning_rate": 6.538931227728625e-06,
"loss": 0.5435384511947632,
"step": 297,
"token_acc": 0.823641333000032
},
{
"epoch": 0.956661316211878,
"grad_norm": 0.21316620580353451,
"learning_rate": 6.535319331298662e-06,
"loss": 0.5441080927848816,
"step": 298,
"token_acc": 0.8267760735277975
},
{
"epoch": 0.9598715890850722,
"grad_norm": 0.24898603165299715,
"learning_rate": 6.531694348290001e-06,
"loss": 0.5554606318473816,
"step": 299,
"token_acc": 0.8209872341754991
},
{
"epoch": 0.9630818619582665,
"grad_norm": 0.2554315265827971,
"learning_rate": 6.528056294331519e-06,
"loss": 0.4842122495174408,
"step": 300,
"token_acc": 0.8440150411488848
},
{
"epoch": 0.9662921348314607,
"grad_norm": 0.22974814057192716,
"learning_rate": 6.524405185108444e-06,
"loss": 0.5605875849723816,
"step": 301,
"token_acc": 0.8200306844214243
},
{
"epoch": 0.9695024077046549,
"grad_norm": 0.2596599946392832,
"learning_rate": 6.520741036362294e-06,
"loss": 0.5032145380973816,
"step": 302,
"token_acc": 0.8374895144901848
},
{
"epoch": 0.9727126805778491,
"grad_norm": 0.2402742203312852,
"learning_rate": 6.517063863890802e-06,
"loss": 0.5245768427848816,
"step": 303,
"token_acc": 0.8304467561823956
},
{
"epoch": 0.9759229534510433,
"grad_norm": 0.23484754575093275,
"learning_rate": 6.513373683547856e-06,
"loss": 0.5390218496322632,
"step": 304,
"token_acc": 0.8251677096206059
},
{
"epoch": 0.9791332263242376,
"grad_norm": 0.22608927358046563,
"learning_rate": 6.509670511243424e-06,
"loss": 0.4981282651424408,
"step": 305,
"token_acc": 0.8377231819118646
},
{
"epoch": 0.9823434991974318,
"grad_norm": 0.19516925536917554,
"learning_rate": 6.505954362943486e-06,
"loss": 0.4888509213924408,
"step": 306,
"token_acc": 0.8426081587359
},
{
"epoch": 0.985553772070626,
"grad_norm": 0.2890322962206889,
"learning_rate": 6.502225254669973e-06,
"loss": 0.5541178584098816,
"step": 307,
"token_acc": 0.8245721970122185
},
{
"epoch": 0.9887640449438202,
"grad_norm": 0.2439684361938674,
"learning_rate": 6.498483202500689e-06,
"loss": 0.5735677480697632,
"step": 308,
"token_acc": 0.815568343528531
},
{
"epoch": 0.9919743178170144,
"grad_norm": 0.20662452726270084,
"learning_rate": 6.4947282225692425e-06,
"loss": 0.5161539912223816,
"step": 309,
"token_acc": 0.8335114537040171
},
{
"epoch": 0.9951845906902087,
"grad_norm": 0.30566444126626124,
"learning_rate": 6.490960331064983e-06,
"loss": 0.5284830927848816,
"step": 310,
"token_acc": 0.8267753724083531
},
{
"epoch": 0.9983948635634029,
"grad_norm": 0.20539089555886036,
"learning_rate": 6.487179544232924e-06,
"loss": 0.5338541865348816,
"step": 311,
"token_acc": 0.8280595721254926
},
{
"epoch": 1.0,
"grad_norm": 0.26945301378123576,
"learning_rate": 6.48338587837368e-06,
"loss": 0.53369140625,
"step": 312,
"token_acc": 0.8281918169384378
},
{
"epoch": 1.0032102728731942,
"grad_norm": 0.2228766062119388,
"learning_rate": 6.47957934984339e-06,
"loss": 0.5152994990348816,
"step": 313,
"token_acc": 0.8322323489910715
},
{
"epoch": 1.0064205457463884,
"grad_norm": 0.2202353695498369,
"learning_rate": 6.4757599750536495e-06,
"loss": 0.5165609121322632,
"step": 314,
"token_acc": 0.8306170672021838
},
{
"epoch": 1.0096308186195826,
"grad_norm": 0.21944860025769794,
"learning_rate": 6.471927770471441e-06,
"loss": 0.506591796875,
"step": 315,
"token_acc": 0.8360700243853959
},
{
"epoch": 1.0128410914927768,
"grad_norm": 0.19852273162473266,
"learning_rate": 6.468082752619062e-06,
"loss": 0.4534912109375,
"step": 316,
"token_acc": 0.8498115059824072
},
{
"epoch": 1.0160513643659712,
"grad_norm": 0.1950395060592887,
"learning_rate": 6.464224938074051e-06,
"loss": 0.462646484375,
"step": 317,
"token_acc": 0.849263431737473
},
{
"epoch": 1.0192616372391654,
"grad_norm": 0.22619155870689964,
"learning_rate": 6.460354343469121e-06,
"loss": 0.5100911855697632,
"step": 318,
"token_acc": 0.8333788909396087
},
{
"epoch": 1.0224719101123596,
"grad_norm": 0.1966958888107114,
"learning_rate": 6.456470985492086e-06,
"loss": 0.4920247495174408,
"step": 319,
"token_acc": 0.8367696940757496
},
{
"epoch": 1.0256821829855538,
"grad_norm": 0.19779145399479958,
"learning_rate": 6.452574880885788e-06,
"loss": 0.520263671875,
"step": 320,
"token_acc": 0.8299368954007383
},
{
"epoch": 1.028892455858748,
"grad_norm": 0.21033363002221026,
"learning_rate": 6.4486660464480225e-06,
"loss": 0.50634765625,
"step": 321,
"token_acc": 0.8340756326797889
},
{
"epoch": 1.0321027287319422,
"grad_norm": 0.18963872381058564,
"learning_rate": 6.4447444990314716e-06,
"loss": 0.4986572265625,
"step": 322,
"token_acc": 0.8362821544068159
},
{
"epoch": 1.0353130016051364,
"grad_norm": 0.2209164173614081,
"learning_rate": 6.4408102555436264e-06,
"loss": 0.4700114130973816,
"step": 323,
"token_acc": 0.8459083202661721
},
{
"epoch": 1.0385232744783306,
"grad_norm": 0.33229018679196365,
"learning_rate": 6.436863332946721e-06,
"loss": 0.5262451171875,
"step": 324,
"token_acc": 0.8283396932868827
},
{
"epoch": 1.0417335473515248,
"grad_norm": 0.20218430124439193,
"learning_rate": 6.432903748257647e-06,
"loss": 0.5170491933822632,
"step": 325,
"token_acc": 0.8299572023876562
},
{
"epoch": 1.0449438202247192,
"grad_norm": 0.22146372328593175,
"learning_rate": 6.428931518547896e-06,
"loss": 0.5284830927848816,
"step": 326,
"token_acc": 0.828352607066829
},
{
"epoch": 1.0481540930979134,
"grad_norm": 0.22608426879553686,
"learning_rate": 6.424946660943472e-06,
"loss": 0.4761556088924408,
"step": 327,
"token_acc": 0.8440760758481691
},
{
"epoch": 1.0513643659711076,
"grad_norm": 0.22334666046146132,
"learning_rate": 6.420949192624826e-06,
"loss": 0.5010172724723816,
"step": 328,
"token_acc": 0.8354667860206632
},
{
"epoch": 1.0545746388443018,
"grad_norm": 0.21680735707121704,
"learning_rate": 6.416939130826778e-06,
"loss": 0.473876953125,
"step": 329,
"token_acc": 0.844303615954024
},
{
"epoch": 1.057784911717496,
"grad_norm": 0.24793959036723234,
"learning_rate": 6.412916492838444e-06,
"loss": 0.5398763418197632,
"step": 330,
"token_acc": 0.824620045732701
},
{
"epoch": 1.0609951845906902,
"grad_norm": 0.21297159564652285,
"learning_rate": 6.4088812960031625e-06,
"loss": 0.5343017578125,
"step": 331,
"token_acc": 0.8273545743431096
},
{
"epoch": 1.0642054574638844,
"grad_norm": 0.20927854637993462,
"learning_rate": 6.404833557718418e-06,
"loss": 0.4844157099723816,
"step": 332,
"token_acc": 0.8403367191581549
},
{
"epoch": 1.0674157303370786,
"grad_norm": 0.22252358860857668,
"learning_rate": 6.400773295435766e-06,
"loss": 0.5533854365348816,
"step": 333,
"token_acc": 0.8207041890009662
},
{
"epoch": 1.0706260032102728,
"grad_norm": 0.19982580110995563,
"learning_rate": 6.396700526660759e-06,
"loss": 0.4607340693473816,
"step": 334,
"token_acc": 0.8489513299287037
},
{
"epoch": 1.0738362760834672,
"grad_norm": 0.2175771128151161,
"learning_rate": 6.392615268952871e-06,
"loss": 0.494140625,
"step": 335,
"token_acc": 0.8397883541365987
},
{
"epoch": 1.0770465489566614,
"grad_norm": 0.24194777122457137,
"learning_rate": 6.388517539925422e-06,
"loss": 0.5191243886947632,
"step": 336,
"token_acc": 0.8303176430366812
},
{
"epoch": 1.0802568218298556,
"grad_norm": 0.21096680669019466,
"learning_rate": 6.384407357245495e-06,
"loss": 0.5267741084098816,
"step": 337,
"token_acc": 0.8252223007261563
},
{
"epoch": 1.0834670947030498,
"grad_norm": 0.23444580996675599,
"learning_rate": 6.380284738633876e-06,
"loss": 0.5298665761947632,
"step": 338,
"token_acc": 0.8276450107105229
},
{
"epoch": 1.086677367576244,
"grad_norm": 0.22588709190384268,
"learning_rate": 6.376149701864961e-06,
"loss": 0.4950765073299408,
"step": 339,
"token_acc": 0.8365316647599914
},
{
"epoch": 1.0898876404494382,
"grad_norm": 0.25313796690731927,
"learning_rate": 6.372002264766688e-06,
"loss": 0.5378011465072632,
"step": 340,
"token_acc": 0.8255440705674001
},
{
"epoch": 1.0930979133226324,
"grad_norm": 0.21913878331030762,
"learning_rate": 6.367842445220458e-06,
"loss": 0.537109375,
"step": 341,
"token_acc": 0.8250007981544305
},
{
"epoch": 1.0963081861958266,
"grad_norm": 0.22448672487090238,
"learning_rate": 6.363670261161057e-06,
"loss": 0.4685872495174408,
"step": 342,
"token_acc": 0.8437887567778065
},
{
"epoch": 1.0995184590690208,
"grad_norm": 0.19124533755087705,
"learning_rate": 6.359485730576581e-06,
"loss": 0.4889729917049408,
"step": 343,
"token_acc": 0.840408457946595
},
{
"epoch": 1.102728731942215,
"grad_norm": 0.2168277664403285,
"learning_rate": 6.355288871508358e-06,
"loss": 0.5118408203125,
"step": 344,
"token_acc": 0.8335477930980931
},
{
"epoch": 1.1059390048154094,
"grad_norm": 0.22068343255790987,
"learning_rate": 6.351079702050868e-06,
"loss": 0.5257161855697632,
"step": 345,
"token_acc": 0.8293478332683655
},
{
"epoch": 1.1091492776886036,
"grad_norm": 0.17736870716031858,
"learning_rate": 6.346858240351667e-06,
"loss": 0.4034017026424408,
"step": 346,
"token_acc": 0.8668660975357989
},
{
"epoch": 1.1123595505617978,
"grad_norm": 0.22649924176834732,
"learning_rate": 6.342624504611308e-06,
"loss": 0.4998779296875,
"step": 347,
"token_acc": 0.8358005764012703
},
{
"epoch": 1.115569823434992,
"grad_norm": 0.21562170580193807,
"learning_rate": 6.338378513083264e-06,
"loss": 0.5145670771598816,
"step": 348,
"token_acc": 0.8321265570415991
},
{
"epoch": 1.1187800963081862,
"grad_norm": 0.2002318871361869,
"learning_rate": 6.334120284073845e-06,
"loss": 0.5229899287223816,
"step": 349,
"token_acc": 0.8315942248719818
},
{
"epoch": 1.1219903691813804,
"grad_norm": 0.23677754906933055,
"learning_rate": 6.329849835942125e-06,
"loss": 0.5107828974723816,
"step": 350,
"token_acc": 0.8333928908270134
},
{
"epoch": 1.1252006420545746,
"grad_norm": 0.25803905866296484,
"learning_rate": 6.325567187099859e-06,
"loss": 0.5131022334098816,
"step": 351,
"token_acc": 0.8336294261255828
},
{
"epoch": 1.1284109149277688,
"grad_norm": 0.29486525496945437,
"learning_rate": 6.321272356011404e-06,
"loss": 0.4559326171875,
"step": 352,
"token_acc": 0.8493671388941911
},
{
"epoch": 1.131621187800963,
"grad_norm": 0.20294298663958116,
"learning_rate": 6.31696536119364e-06,
"loss": 0.51611328125,
"step": 353,
"token_acc": 0.8324970433783843
},
{
"epoch": 1.1348314606741572,
"grad_norm": 0.21077888101344128,
"learning_rate": 6.312646221215891e-06,
"loss": 0.5026448965072632,
"step": 354,
"token_acc": 0.8361661983842797
},
{
"epoch": 1.1380417335473516,
"grad_norm": 0.22827729326842286,
"learning_rate": 6.308314954699845e-06,
"loss": 0.5194091796875,
"step": 355,
"token_acc": 0.8296480882851123
},
{
"epoch": 1.1412520064205458,
"grad_norm": 0.2092342197897483,
"learning_rate": 6.303971580319469e-06,
"loss": 0.5154622793197632,
"step": 356,
"token_acc": 0.8319102496742222
},
{
"epoch": 1.14446227929374,
"grad_norm": 0.19947979635092572,
"learning_rate": 6.299616116800936e-06,
"loss": 0.4569498896598816,
"step": 357,
"token_acc": 0.848671552549395
},
{
"epoch": 1.1476725521669342,
"grad_norm": 0.23453848382593498,
"learning_rate": 6.295248582922538e-06,
"loss": 0.4909261167049408,
"step": 358,
"token_acc": 0.8384680608615377
},
{
"epoch": 1.1508828250401284,
"grad_norm": 0.954323915842282,
"learning_rate": 6.290868997514609e-06,
"loss": 0.5150553584098816,
"step": 359,
"token_acc": 0.8319439384775722
},
{
"epoch": 1.1540930979133226,
"grad_norm": 0.23039156354638235,
"learning_rate": 6.2864773794594435e-06,
"loss": 0.5189616084098816,
"step": 360,
"token_acc": 0.8291666480554228
},
{
"epoch": 1.1573033707865168,
"grad_norm": 0.2108602621989691,
"learning_rate": 6.28207374769121e-06,
"loss": 0.48828125,
"step": 361,
"token_acc": 0.8417036259188337
},
{
"epoch": 1.160513643659711,
"grad_norm": 0.2274478694052877,
"learning_rate": 6.277658121195879e-06,
"loss": 0.5316569209098816,
"step": 362,
"token_acc": 0.8250233183762744
},
{
"epoch": 1.1637239165329052,
"grad_norm": 0.19314602070313494,
"learning_rate": 6.273230519011129e-06,
"loss": 0.5065511465072632,
"step": 363,
"token_acc": 0.8337296713987661
},
{
"epoch": 1.1669341894060996,
"grad_norm": 0.19156656647439638,
"learning_rate": 6.2687909602262775e-06,
"loss": 0.4707845151424408,
"step": 364,
"token_acc": 0.8435817672766602
},
{
"epoch": 1.1701444622792938,
"grad_norm": 0.1983194701465737,
"learning_rate": 6.2643394639821855e-06,
"loss": 0.4794921875,
"step": 365,
"token_acc": 0.8451988430219622
},
{
"epoch": 1.173354735152488,
"grad_norm": 0.2140867347077317,
"learning_rate": 6.2598760494711865e-06,
"loss": 0.5105184316635132,
"step": 366,
"token_acc": 0.8330230414412061
},
{
"epoch": 1.1765650080256822,
"grad_norm": 0.21944917336007386,
"learning_rate": 6.255400735936998e-06,
"loss": 0.4921468198299408,
"step": 367,
"token_acc": 0.8382150994132302
},
{
"epoch": 1.1797752808988764,
"grad_norm": 0.23449231199354112,
"learning_rate": 6.250913542674637e-06,
"loss": 0.5512288808822632,
"step": 368,
"token_acc": 0.8213648698273107
},
{
"epoch": 1.1829855537720706,
"grad_norm": 0.2031330605981291,
"learning_rate": 6.246414489030342e-06,
"loss": 0.50830078125,
"step": 369,
"token_acc": 0.8335424810046952
},
{
"epoch": 1.1861958266452648,
"grad_norm": 0.23655728339205756,
"learning_rate": 6.241903594401484e-06,
"loss": 0.535888671875,
"step": 370,
"token_acc": 0.825756292557336
},
{
"epoch": 1.189406099518459,
"grad_norm": 2.107458585311116,
"learning_rate": 6.237380878236488e-06,
"loss": 0.481201171875,
"step": 371,
"token_acc": 0.8415841111341924
},
{
"epoch": 1.1926163723916532,
"grad_norm": 0.20710561438383962,
"learning_rate": 6.2328463600347465e-06,
"loss": 0.4458414912223816,
"step": 372,
"token_acc": 0.8520495554363945
},
{
"epoch": 1.1958266452648476,
"grad_norm": 0.20708739457868908,
"learning_rate": 6.228300059346533e-06,
"loss": 0.5088704824447632,
"step": 373,
"token_acc": 0.8342107871219757
},
{
"epoch": 1.1990369181380418,
"grad_norm": 0.20213338810098505,
"learning_rate": 6.223741995772923e-06,
"loss": 0.522705078125,
"step": 374,
"token_acc": 0.8303547302535117
},
{
"epoch": 1.202247191011236,
"grad_norm": 0.255102267938197,
"learning_rate": 6.219172188965709e-06,
"loss": 0.4881591796875,
"step": 375,
"token_acc": 0.8406651832875708
},
{
"epoch": 1.2054574638844302,
"grad_norm": 0.19269656731911167,
"learning_rate": 6.214590658627308e-06,
"loss": 0.4801228940486908,
"step": 376,
"token_acc": 0.8424494317329764
},
{
"epoch": 1.2086677367576244,
"grad_norm": 0.2078396922394465,
"learning_rate": 6.209997424510687e-06,
"loss": 0.4632975459098816,
"step": 377,
"token_acc": 0.8472165411231166
},
{
"epoch": 1.2118780096308186,
"grad_norm": 0.27323577905583074,
"learning_rate": 6.205392506419271e-06,
"loss": 0.4674479365348816,
"step": 378,
"token_acc": 0.847078453706124
},
{
"epoch": 1.2150882825040128,
"grad_norm": 0.2110978207882355,
"learning_rate": 6.2007759242068585e-06,
"loss": 0.4643961787223816,
"step": 379,
"token_acc": 0.8469605268875274
},
{
"epoch": 1.218298555377207,
"grad_norm": 0.22639346046683828,
"learning_rate": 6.196147697777541e-06,
"loss": 0.5506998896598816,
"step": 380,
"token_acc": 0.8201897189997169
},
{
"epoch": 1.2215088282504012,
"grad_norm": 0.2475511079473807,
"learning_rate": 6.191507847085608e-06,
"loss": 0.4781901240348816,
"step": 381,
"token_acc": 0.8447922066935496
},
{
"epoch": 1.2247191011235956,
"grad_norm": 0.24040115062575793,
"learning_rate": 6.186856392135472e-06,
"loss": 0.5130208730697632,
"step": 382,
"token_acc": 0.8315499466295303
},
{
"epoch": 1.2279293739967898,
"grad_norm": 0.1815363847276098,
"learning_rate": 6.182193352981573e-06,
"loss": 0.4622802734375,
"step": 383,
"token_acc": 0.8477780786218334
},
{
"epoch": 1.231139646869984,
"grad_norm": 0.20482340368559337,
"learning_rate": 6.177518749728295e-06,
"loss": 0.5057780146598816,
"step": 384,
"token_acc": 0.8352884439770376
},
{
"epoch": 1.2343499197431782,
"grad_norm": 0.19762870907244168,
"learning_rate": 6.172832602529881e-06,
"loss": 0.49853515625,
"step": 385,
"token_acc": 0.8363799989361367
},
{
"epoch": 1.2375601926163724,
"grad_norm": 0.2227216191441955,
"learning_rate": 6.168134931590346e-06,
"loss": 0.5113932490348816,
"step": 386,
"token_acc": 0.8322664894148927
},
{
"epoch": 1.2407704654895666,
"grad_norm": 0.20112034748752594,
"learning_rate": 6.163425757163387e-06,
"loss": 0.4695638120174408,
"step": 387,
"token_acc": 0.8444868529687521
},
{
"epoch": 1.2439807383627608,
"grad_norm": 0.22492795778104924,
"learning_rate": 6.158705099552299e-06,
"loss": 0.466064453125,
"step": 388,
"token_acc": 0.848569682910181
},
{
"epoch": 1.247191011235955,
"grad_norm": 0.18710929793776945,
"learning_rate": 6.153972979109884e-06,
"loss": 0.4784342646598816,
"step": 389,
"token_acc": 0.842638103523271
},
{
"epoch": 1.2504012841091492,
"grad_norm": 0.19948237667815355,
"learning_rate": 6.149229416238368e-06,
"loss": 0.4717610776424408,
"step": 390,
"token_acc": 0.8457017327855467
},
{
"epoch": 1.2536115569823436,
"grad_norm": 0.24721986040967323,
"learning_rate": 6.144474431389309e-06,
"loss": 0.5409342646598816,
"step": 391,
"token_acc": 0.8249820153645141
},
{
"epoch": 1.2568218298555376,
"grad_norm": 0.20262551772341117,
"learning_rate": 6.139708045063508e-06,
"loss": 0.537353515625,
"step": 392,
"token_acc": 0.8233082420552443
},
{
"epoch": 1.260032102728732,
"grad_norm": 0.20651343606655426,
"learning_rate": 6.134930277810927e-06,
"loss": 0.5173746943473816,
"step": 393,
"token_acc": 0.8314509609945887
},
{
"epoch": 1.2632423756019262,
"grad_norm": 0.21318989113773756,
"learning_rate": 6.1301411502305915e-06,
"loss": 0.4995931088924408,
"step": 394,
"token_acc": 0.836391847497132
},
{
"epoch": 1.2664526484751204,
"grad_norm": 0.19507151642627543,
"learning_rate": 6.1253406829705105e-06,
"loss": 0.4830729365348816,
"step": 395,
"token_acc": 0.8431513934296263
},
{
"epoch": 1.2696629213483146,
"grad_norm": 0.21904835008775547,
"learning_rate": 6.12052889672758e-06,
"loss": 0.5050048828125,
"step": 396,
"token_acc": 0.8356795815804924
},
{
"epoch": 1.2728731942215088,
"grad_norm": 0.2014526276910395,
"learning_rate": 6.115705812247499e-06,
"loss": 0.5210775136947632,
"step": 397,
"token_acc": 0.8303255017672488
},
{
"epoch": 1.276083467094703,
"grad_norm": 0.2212609849779949,
"learning_rate": 6.110871450324678e-06,
"loss": 0.4835612177848816,
"step": 398,
"token_acc": 0.8395820907570261
},
{
"epoch": 1.2792937399678972,
"grad_norm": 0.21543834649039986,
"learning_rate": 6.106025831802148e-06,
"loss": 0.4977620542049408,
"step": 399,
"token_acc": 0.8376060380030089
},
{
"epoch": 1.2825040128410916,
"grad_norm": 0.2240179499546737,
"learning_rate": 6.101168977571472e-06,
"loss": 0.5069173574447632,
"step": 400,
"token_acc": 0.8339093844348606
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.1957158009734539,
"learning_rate": 6.096300908572658e-06,
"loss": 0.515380859375,
"step": 401,
"token_acc": 0.8308960711150132
},
{
"epoch": 1.28892455858748,
"grad_norm": 0.18337021533975237,
"learning_rate": 6.091421645794063e-06,
"loss": 0.4827880859375,
"step": 402,
"token_acc": 0.8408876549431548
},
{
"epoch": 1.2921348314606742,
"grad_norm": 0.201454794503546,
"learning_rate": 6.086531210272307e-06,
"loss": 0.48809814453125,
"step": 403,
"token_acc": 0.8390145324144904
},
{
"epoch": 1.2953451043338684,
"grad_norm": 0.21590878833530364,
"learning_rate": 6.0816296230921774e-06,
"loss": 0.5361328125,
"step": 404,
"token_acc": 0.8257730768061761
},
{
"epoch": 1.2985553772070626,
"grad_norm": 0.18895177189761647,
"learning_rate": 6.076716905386546e-06,
"loss": 0.4814453125,
"step": 405,
"token_acc": 0.8436404513000572
},
{
"epoch": 1.3017656500802568,
"grad_norm": 0.19402420650392968,
"learning_rate": 6.071793078336268e-06,
"loss": 0.4874267578125,
"step": 406,
"token_acc": 0.8412702912442461
},
{
"epoch": 1.304975922953451,
"grad_norm": 0.19851857071944187,
"learning_rate": 6.0668581631701e-06,
"loss": 0.51611328125,
"step": 407,
"token_acc": 0.8293106083450718
},
{
"epoch": 1.3081861958266452,
"grad_norm": 0.18981501741141846,
"learning_rate": 6.061912181164602e-06,
"loss": 0.4933675229549408,
"step": 408,
"token_acc": 0.8380443881201454
},
{
"epoch": 1.3113964686998396,
"grad_norm": 0.26280017805298755,
"learning_rate": 6.056955153644048e-06,
"loss": 0.5085042715072632,
"step": 409,
"token_acc": 0.8320101659806012
},
{
"epoch": 1.3146067415730336,
"grad_norm": 0.20834143665165283,
"learning_rate": 6.051987101980336e-06,
"loss": 0.4640299677848816,
"step": 410,
"token_acc": 0.8469984896533973
},
{
"epoch": 1.317817014446228,
"grad_norm": 0.20377462902343046,
"learning_rate": 6.047008047592892e-06,
"loss": 0.4606119990348816,
"step": 411,
"token_acc": 0.8498952878946993
},
{
"epoch": 1.3210272873194222,
"grad_norm": 0.2256058408114048,
"learning_rate": 6.042018011948578e-06,
"loss": 0.5301920771598816,
"step": 412,
"token_acc": 0.8273486921025606
},
{
"epoch": 1.3242375601926164,
"grad_norm": 0.2039682659306468,
"learning_rate": 6.0370170165616056e-06,
"loss": 0.4898274838924408,
"step": 413,
"token_acc": 0.8401188260484118
},
{
"epoch": 1.3274478330658106,
"grad_norm": 0.19661549991262828,
"learning_rate": 6.0320050829934346e-06,
"loss": 0.501953125,
"step": 414,
"token_acc": 0.8351441080350566
},
{
"epoch": 1.3306581059390048,
"grad_norm": 0.2009643860247893,
"learning_rate": 6.026982232852684e-06,
"loss": 0.5013834834098816,
"step": 415,
"token_acc": 0.8359275770809191
},
{
"epoch": 1.333868378812199,
"grad_norm": 0.21808790105770282,
"learning_rate": 6.021948487795043e-06,
"loss": 0.4978841245174408,
"step": 416,
"token_acc": 0.8399703730053248
},
{
"epoch": 1.3370786516853932,
"grad_norm": 0.20526463177341095,
"learning_rate": 6.016903869523169e-06,
"loss": 0.5048828125,
"step": 417,
"token_acc": 0.834811573183773
},
{
"epoch": 1.3402889245585876,
"grad_norm": 0.21299152332644122,
"learning_rate": 6.011848399786601e-06,
"loss": 0.49658203125,
"step": 418,
"token_acc": 0.8379083588445921
},
{
"epoch": 1.3434991974317816,
"grad_norm": 0.24218971921492033,
"learning_rate": 6.0067821003816626e-06,
"loss": 0.5063883662223816,
"step": 419,
"token_acc": 0.8342907607753908
},
{
"epoch": 1.346709470304976,
"grad_norm": 0.23838925650407486,
"learning_rate": 6.0017049931513685e-06,
"loss": 0.4720052182674408,
"step": 420,
"token_acc": 0.8453532915568743
},
{
"epoch": 1.3499197431781702,
"grad_norm": 0.18940147678462618,
"learning_rate": 5.996617099985331e-06,
"loss": 0.4628499448299408,
"step": 421,
"token_acc": 0.8485520945983281
},
{
"epoch": 1.3531300160513644,
"grad_norm": 0.21754239145548293,
"learning_rate": 5.991518442819664e-06,
"loss": 0.503662109375,
"step": 422,
"token_acc": 0.8317651653677501
},
{
"epoch": 1.3563402889245586,
"grad_norm": 0.20677671370390927,
"learning_rate": 5.986409043636894e-06,
"loss": 0.4958903193473816,
"step": 423,
"token_acc": 0.8347628821195424
},
{
"epoch": 1.3595505617977528,
"grad_norm": 0.21125756075422883,
"learning_rate": 5.981288924465855e-06,
"loss": 0.4697265625,
"step": 424,
"token_acc": 0.8464578221460343
},
{
"epoch": 1.362760834670947,
"grad_norm": 0.1845227895272064,
"learning_rate": 5.9761581073816055e-06,
"loss": 0.4358724057674408,
"step": 425,
"token_acc": 0.856153648909988
},
{
"epoch": 1.3659711075441412,
"grad_norm": 0.22226226474729194,
"learning_rate": 5.971016614505321e-06,
"loss": 0.48388671875,
"step": 426,
"token_acc": 0.841594074255456
},
{
"epoch": 1.3691813804173354,
"grad_norm": 0.3627275381953577,
"learning_rate": 5.965864468004209e-06,
"loss": 0.5054525136947632,
"step": 427,
"token_acc": 0.8334647244765889
},
{
"epoch": 1.3723916532905296,
"grad_norm": 0.2446819517168817,
"learning_rate": 5.96070169009141e-06,
"loss": 0.5396932363510132,
"step": 428,
"token_acc": 0.8251210462343914
},
{
"epoch": 1.375601926163724,
"grad_norm": 0.18480486993834852,
"learning_rate": 5.955528303025899e-06,
"loss": 0.492919921875,
"step": 429,
"token_acc": 0.838200488309484
},
{
"epoch": 1.3788121990369182,
"grad_norm": 0.1996582980430958,
"learning_rate": 5.950344329112392e-06,
"loss": 0.4872233271598816,
"step": 430,
"token_acc": 0.8402870290077712
},
{
"epoch": 1.3820224719101124,
"grad_norm": 0.19915530342516288,
"learning_rate": 5.94514979070125e-06,
"loss": 0.4737955927848816,
"step": 431,
"token_acc": 0.8434756493995772
},
{
"epoch": 1.3852327447833066,
"grad_norm": 0.21306869814823742,
"learning_rate": 5.939944710188383e-06,
"loss": 0.4843343198299408,
"step": 432,
"token_acc": 0.840401274727809
},
{
"epoch": 1.3884430176565008,
"grad_norm": 0.21153154196928545,
"learning_rate": 5.934729110015151e-06,
"loss": 0.4997355341911316,
"step": 433,
"token_acc": 0.8352082065345394
},
{
"epoch": 1.391653290529695,
"grad_norm": 0.1948748437790448,
"learning_rate": 5.929503012668269e-06,
"loss": 0.4604085385799408,
"step": 434,
"token_acc": 0.8470840873900802
},
{
"epoch": 1.3948635634028892,
"grad_norm": 0.19686780321241873,
"learning_rate": 5.924266440679711e-06,
"loss": 0.4471028745174408,
"step": 435,
"token_acc": 0.8513346822951796
},
{
"epoch": 1.3980738362760834,
"grad_norm": 0.23488653128023249,
"learning_rate": 5.919019416626611e-06,
"loss": 0.4792887568473816,
"step": 436,
"token_acc": 0.8414461663137132
},
{
"epoch": 1.4012841091492776,
"grad_norm": 0.21251232363695088,
"learning_rate": 5.913761963131167e-06,
"loss": 0.5232747793197632,
"step": 437,
"token_acc": 0.8290875849854311
},
{
"epoch": 1.404494382022472,
"grad_norm": 0.22499625939133328,
"learning_rate": 5.908494102860541e-06,
"loss": 0.5137939453125,
"step": 438,
"token_acc": 0.8317246352356477
},
{
"epoch": 1.4077046548956662,
"grad_norm": 0.1950334456413267,
"learning_rate": 5.903215858526765e-06,
"loss": 0.5267741084098816,
"step": 439,
"token_acc": 0.8279046746612088
},
{
"epoch": 1.4109149277688604,
"grad_norm": 0.22516438572620065,
"learning_rate": 5.89792725288664e-06,
"loss": 0.4964192807674408,
"step": 440,
"token_acc": 0.8369873037845605
},
{
"epoch": 1.4141252006420546,
"grad_norm": 0.24835908416757446,
"learning_rate": 5.892628308741642e-06,
"loss": 0.515869140625,
"step": 441,
"token_acc": 0.828441323656562
},
{
"epoch": 1.4173354735152488,
"grad_norm": 0.20400606035164545,
"learning_rate": 5.8873190489378146e-06,
"loss": 0.5088704824447632,
"step": 442,
"token_acc": 0.8347255754640608
},
{
"epoch": 1.420545746388443,
"grad_norm": 0.1986845262037535,
"learning_rate": 5.881999496365684e-06,
"loss": 0.4674479365348816,
"step": 443,
"token_acc": 0.846090615198355
},
{
"epoch": 1.4237560192616372,
"grad_norm": 0.267957898844908,
"learning_rate": 5.876669673960148e-06,
"loss": 0.5269572138786316,
"step": 444,
"token_acc": 0.8272518904385817
},
{
"epoch": 1.4269662921348314,
"grad_norm": 0.1954436329565384,
"learning_rate": 5.871329604700384e-06,
"loss": 0.5074869990348816,
"step": 445,
"token_acc": 0.8332683584946571
},
{
"epoch": 1.4301765650080256,
"grad_norm": 0.2367051809395509,
"learning_rate": 5.865979311609748e-06,
"loss": 0.53369140625,
"step": 446,
"token_acc": 0.8254907318262998
},
{
"epoch": 1.43338683788122,
"grad_norm": 0.20668963218170677,
"learning_rate": 5.860618817755674e-06,
"loss": 0.4967448115348816,
"step": 447,
"token_acc": 0.8361870607490132
},
{
"epoch": 1.4365971107544142,
"grad_norm": 0.19895590716270828,
"learning_rate": 5.8552481462495785e-06,
"loss": 0.5145670771598816,
"step": 448,
"token_acc": 0.8321965107826509
},
{
"epoch": 1.4398073836276084,
"grad_norm": 0.30832311716577054,
"learning_rate": 5.849867320246756e-06,
"loss": 0.5150553584098816,
"step": 449,
"token_acc": 0.8315047127468582
},
{
"epoch": 1.4430176565008026,
"grad_norm": 0.20139986527570522,
"learning_rate": 5.844476362946282e-06,
"loss": 0.5021159052848816,
"step": 450,
"token_acc": 0.8357744885476138
},
{
"epoch": 1.4462279293739968,
"grad_norm": 0.20570084102187688,
"learning_rate": 5.8390752975909116e-06,
"loss": 0.4951171875,
"step": 451,
"token_acc": 0.8379211979893351
},
{
"epoch": 1.449438202247191,
"grad_norm": 0.35580604726544524,
"learning_rate": 5.833664147466983e-06,
"loss": 0.4720458984375,
"step": 452,
"token_acc": 0.8447361847752629
},
{
"epoch": 1.4526484751203852,
"grad_norm": 0.21944795964910477,
"learning_rate": 5.828242935904313e-06,
"loss": 0.51861572265625,
"step": 453,
"token_acc": 0.8304687110031447
},
{
"epoch": 1.4558587479935794,
"grad_norm": 0.216206526603621,
"learning_rate": 5.8228116862760936e-06,
"loss": 0.519775390625,
"step": 454,
"token_acc": 0.8273322676894389
},
{
"epoch": 1.4590690208667736,
"grad_norm": 0.23632585935268247,
"learning_rate": 5.8173704219988015e-06,
"loss": 0.518798828125,
"step": 455,
"token_acc": 0.8292118566043417
},
{
"epoch": 1.462279293739968,
"grad_norm": 0.2133500391661819,
"learning_rate": 5.811919166532087e-06,
"loss": 0.4931640625,
"step": 456,
"token_acc": 0.8363678333960607
},
{
"epoch": 1.465489566613162,
"grad_norm": 0.20470785515570017,
"learning_rate": 5.806457943378678e-06,
"loss": 0.4711100459098816,
"step": 457,
"token_acc": 0.8461975940270282
},
{
"epoch": 1.4686998394863564,
"grad_norm": 0.2195825179878936,
"learning_rate": 5.8009867760842776e-06,
"loss": 0.4962972104549408,
"step": 458,
"token_acc": 0.8377250322321346
},
{
"epoch": 1.4719101123595506,
"grad_norm": 0.2198457397287571,
"learning_rate": 5.795505688237461e-06,
"loss": 0.5032552480697632,
"step": 459,
"token_acc": 0.8350467062766113
},
{
"epoch": 1.4751203852327448,
"grad_norm": 0.2337449787419216,
"learning_rate": 5.790014703469577e-06,
"loss": 0.5010172724723816,
"step": 460,
"token_acc": 0.8338096031025574
},
{
"epoch": 1.478330658105939,
"grad_norm": 0.21231226916932114,
"learning_rate": 5.7845138454546445e-06,
"loss": 0.5118001699447632,
"step": 461,
"token_acc": 0.8346163924673687
},
{
"epoch": 1.4815409309791332,
"grad_norm": 0.2078743299374229,
"learning_rate": 5.779003137909246e-06,
"loss": 0.5108846426010132,
"step": 462,
"token_acc": 0.8348133684777426
},
{
"epoch": 1.4847512038523274,
"grad_norm": 0.2592011427768495,
"learning_rate": 5.773482604592436e-06,
"loss": 0.5184326171875,
"step": 463,
"token_acc": 0.8289103308278782
},
{
"epoch": 1.4879614767255216,
"grad_norm": 0.20072440357092186,
"learning_rate": 5.767952269305628e-06,
"loss": 0.52734375,
"step": 464,
"token_acc": 0.8298738313921822
},
{
"epoch": 1.491171749598716,
"grad_norm": 0.22554287682014237,
"learning_rate": 5.762412155892497e-06,
"loss": 0.5120443105697632,
"step": 465,
"token_acc": 0.8331116711856353
},
{
"epoch": 1.49438202247191,
"grad_norm": 0.19988828001667414,
"learning_rate": 5.756862288238876e-06,
"loss": 0.5431722402572632,
"step": 466,
"token_acc": 0.8247288141532609
},
{
"epoch": 1.4975922953451044,
"grad_norm": 0.21785806141984132,
"learning_rate": 5.751302690272653e-06,
"loss": 0.4822591245174408,
"step": 467,
"token_acc": 0.8409143683909689
},
{
"epoch": 1.5008025682182986,
"grad_norm": 0.2504631057534585,
"learning_rate": 5.745733385963666e-06,
"loss": 0.5161947011947632,
"step": 468,
"token_acc": 0.8309906856663261
},
{
"epoch": 1.5040128410914928,
"grad_norm": 0.22185281387691191,
"learning_rate": 5.740154399323604e-06,
"loss": 0.5100911855697632,
"step": 469,
"token_acc": 0.8328053877676979
},
{
"epoch": 1.507223113964687,
"grad_norm": 0.20799497094441494,
"learning_rate": 5.7345657544058975e-06,
"loss": 0.5203857421875,
"step": 470,
"token_acc": 0.8293026919927851
},
{
"epoch": 1.5104333868378812,
"grad_norm": 0.2731861663055686,
"learning_rate": 5.728967475305622e-06,
"loss": 0.5336100459098816,
"step": 471,
"token_acc": 0.8260466631908238
},
{
"epoch": 1.5136436597110754,
"grad_norm": 0.17070530437224793,
"learning_rate": 5.723359586159385e-06,
"loss": 0.4242350459098816,
"step": 472,
"token_acc": 0.8605290565725756
},
{
"epoch": 1.5168539325842696,
"grad_norm": 0.2212458940736086,
"learning_rate": 5.717742111145232e-06,
"loss": 0.52850341796875,
"step": 473,
"token_acc": 0.829130877534756
},
{
"epoch": 1.520064205457464,
"grad_norm": 0.2414877669947761,
"learning_rate": 5.7121150744825345e-06,
"loss": 0.5250651240348816,
"step": 474,
"token_acc": 0.8277093675602236
},
{
"epoch": 1.523274478330658,
"grad_norm": 0.24570889317680952,
"learning_rate": 5.70647850043189e-06,
"loss": 0.5235189199447632,
"step": 475,
"token_acc": 0.827997714766816
},
{
"epoch": 1.5264847512038524,
"grad_norm": 0.2245178646870182,
"learning_rate": 5.700832413295014e-06,
"loss": 0.5039469599723816,
"step": 476,
"token_acc": 0.8342653694641449
},
{
"epoch": 1.5296950240770464,
"grad_norm": 0.18979828763233803,
"learning_rate": 5.695176837414639e-06,
"loss": 0.44140625,
"step": 477,
"token_acc": 0.8553784055133313
},
{
"epoch": 1.5329052969502408,
"grad_norm": 0.26777629112744844,
"learning_rate": 5.689511797174406e-06,
"loss": 0.4769287109375,
"step": 478,
"token_acc": 0.8443998026767429
},
{
"epoch": 1.536115569823435,
"grad_norm": 0.20772215310261655,
"learning_rate": 5.68383731699876e-06,
"loss": 0.4756266474723816,
"step": 479,
"token_acc": 0.8454203175667018
},
{
"epoch": 1.5393258426966292,
"grad_norm": 0.20996237183201139,
"learning_rate": 5.678153421352851e-06,
"loss": 0.498779296875,
"step": 480,
"token_acc": 0.8373191847204661
},
{
"epoch": 1.5425361155698234,
"grad_norm": 0.22226152308127473,
"learning_rate": 5.672460134742417e-06,
"loss": 0.5416259765625,
"step": 481,
"token_acc": 0.8240733038205491
},
{
"epoch": 1.5457463884430176,
"grad_norm": 0.22278318566681762,
"learning_rate": 5.666757481713687e-06,
"loss": 0.5304362177848816,
"step": 482,
"token_acc": 0.8288821307241506
},
{
"epoch": 1.548956661316212,
"grad_norm": 0.18975820697676749,
"learning_rate": 5.661045486853273e-06,
"loss": 0.4460042417049408,
"step": 483,
"token_acc": 0.8508166917859834
},
{
"epoch": 1.552166934189406,
"grad_norm": 0.23953556254565397,
"learning_rate": 5.655324174788063e-06,
"loss": 0.4954427182674408,
"step": 484,
"token_acc": 0.8386156763424147
},
{
"epoch": 1.5553772070626004,
"grad_norm": 0.21308687243633692,
"learning_rate": 5.649593570185116e-06,
"loss": 0.5013021230697632,
"step": 485,
"token_acc": 0.8365669483628418
},
{
"epoch": 1.5585874799357944,
"grad_norm": 0.2591747558605951,
"learning_rate": 5.643853697751556e-06,
"loss": 0.5220947265625,
"step": 486,
"token_acc": 0.82841962791029
},
{
"epoch": 1.5617977528089888,
"grad_norm": 0.19126924879912047,
"learning_rate": 5.638104582234462e-06,
"loss": 0.4590657651424408,
"step": 487,
"token_acc": 0.8493991109615291
},
{
"epoch": 1.565008025682183,
"grad_norm": 0.21861579173115944,
"learning_rate": 5.6323462484207665e-06,
"loss": 0.5420736074447632,
"step": 488,
"token_acc": 0.8220674942647027
},
{
"epoch": 1.5682182985553772,
"grad_norm": 0.20998260867078428,
"learning_rate": 5.626578721137146e-06,
"loss": 0.4426676630973816,
"step": 489,
"token_acc": 0.8554470581902353
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.22131084892106537,
"learning_rate": 5.6208020252499125e-06,
"loss": 0.5141195058822632,
"step": 490,
"token_acc": 0.8318744388867513
},
{
"epoch": 1.5746388443017656,
"grad_norm": 0.20986941585535346,
"learning_rate": 5.6150161856649075e-06,
"loss": 0.4871826171875,
"step": 491,
"token_acc": 0.8383738855963362
},
{
"epoch": 1.57784911717496,
"grad_norm": 0.25274465913526994,
"learning_rate": 5.609221227327397e-06,
"loss": 0.4795735776424408,
"step": 492,
"token_acc": 0.8422403366168165
},
{
"epoch": 1.581059390048154,
"grad_norm": 0.2097105194028596,
"learning_rate": 5.603417175221961e-06,
"loss": 0.524169921875,
"step": 493,
"token_acc": 0.8295362334704794
},
{
"epoch": 1.5842696629213484,
"grad_norm": 0.20754286664850768,
"learning_rate": 5.597604054372387e-06,
"loss": 0.51318359375,
"step": 494,
"token_acc": 0.8333822870324183
},
{
"epoch": 1.5874799357945424,
"grad_norm": 0.21371753822542933,
"learning_rate": 5.59178188984156e-06,
"loss": 0.52392578125,
"step": 495,
"token_acc": 0.8291340676249795
},
{
"epoch": 1.5906902086677368,
"grad_norm": 0.24999506191109322,
"learning_rate": 5.585950706731359e-06,
"loss": 0.4842122495174408,
"step": 496,
"token_acc": 0.8399531895544081
},
{
"epoch": 1.593900481540931,
"grad_norm": 0.21426267112364675,
"learning_rate": 5.580110530182542e-06,
"loss": 0.5196126699447632,
"step": 497,
"token_acc": 0.8292234827081937
},
{
"epoch": 1.5971107544141252,
"grad_norm": 0.20205484433806986,
"learning_rate": 5.574261385374648e-06,
"loss": 0.4690958857536316,
"step": 498,
"token_acc": 0.8430140522879065
},
{
"epoch": 1.6003210272873194,
"grad_norm": 0.24302697861059383,
"learning_rate": 5.568403297525875e-06,
"loss": 0.496826171875,
"step": 499,
"token_acc": 0.8369248877989177
},
{
"epoch": 1.6035313001605136,
"grad_norm": 0.1950042091838039,
"learning_rate": 5.562536291892984e-06,
"loss": 0.4523112177848816,
"step": 500,
"token_acc": 0.8496571087889186
},
{
"epoch": 1.606741573033708,
"grad_norm": 0.2068802556309059,
"learning_rate": 5.556660393771181e-06,
"loss": 0.4870198667049408,
"step": 501,
"token_acc": 0.8402496542590332
},
{
"epoch": 1.609951845906902,
"grad_norm": 0.2243379981959925,
"learning_rate": 5.550775628494012e-06,
"loss": 0.5233561396598816,
"step": 502,
"token_acc": 0.8274344767729369
},
{
"epoch": 1.6131621187800964,
"grad_norm": 0.1927744421853587,
"learning_rate": 5.544882021433255e-06,
"loss": 0.5045573115348816,
"step": 503,
"token_acc": 0.8343204270724673
},
{
"epoch": 1.6163723916532904,
"grad_norm": 0.21113102526395744,
"learning_rate": 5.5389795979988046e-06,
"loss": 0.5104166865348816,
"step": 504,
"token_acc": 0.8325613633742619
},
{
"epoch": 1.6195826645264848,
"grad_norm": 0.20121076756883371,
"learning_rate": 5.533068383638573e-06,
"loss": 0.4603271484375,
"step": 505,
"token_acc": 0.8479993692296718
},
{
"epoch": 1.622792937399679,
"grad_norm": 0.1935855855657875,
"learning_rate": 5.5271484038383664e-06,
"loss": 0.4717203974723816,
"step": 506,
"token_acc": 0.8432898963659861
},
{
"epoch": 1.6260032102728732,
"grad_norm": 0.1924072294082364,
"learning_rate": 5.52121968412179e-06,
"loss": 0.5055745840072632,
"step": 507,
"token_acc": 0.8345738040983084
},
{
"epoch": 1.6292134831460674,
"grad_norm": 0.21261427100461092,
"learning_rate": 5.515282250050126e-06,
"loss": 0.5099284052848816,
"step": 508,
"token_acc": 0.8314976854366483
},
{
"epoch": 1.6324237560192616,
"grad_norm": 0.2025806990103089,
"learning_rate": 5.509336127222227e-06,
"loss": 0.4560343623161316,
"step": 509,
"token_acc": 0.8485463993320789
},
{
"epoch": 1.635634028892456,
"grad_norm": 0.19793694638195608,
"learning_rate": 5.50338134127441e-06,
"loss": 0.4739583432674408,
"step": 510,
"token_acc": 0.8435436655565424
},
{
"epoch": 1.63884430176565,
"grad_norm": 0.20239792327651074,
"learning_rate": 5.497417917880343e-06,
"loss": 0.4736328125,
"step": 511,
"token_acc": 0.8439130236170116
},
{
"epoch": 1.6420545746388444,
"grad_norm": 0.24438066987043003,
"learning_rate": 5.4914458827509284e-06,
"loss": 0.504638671875,
"step": 512,
"token_acc": 0.8329907322901383
},
{
"epoch": 1.6452648475120384,
"grad_norm": 0.20114343359896492,
"learning_rate": 5.485465261634202e-06,
"loss": 0.4823405146598816,
"step": 513,
"token_acc": 0.8400644581090766
},
{
"epoch": 1.6484751203852328,
"grad_norm": 0.19304136590878382,
"learning_rate": 5.4794760803152185e-06,
"loss": 0.4589436948299408,
"step": 514,
"token_acc": 0.8478929820632646
},
{
"epoch": 1.651685393258427,
"grad_norm": 0.20488526796239803,
"learning_rate": 5.473478364615935e-06,
"loss": 0.509521484375,
"step": 515,
"token_acc": 0.832314781042273
},
{
"epoch": 1.6548956661316212,
"grad_norm": 0.21491797606119356,
"learning_rate": 5.467472140395109e-06,
"loss": 0.5096029043197632,
"step": 516,
"token_acc": 0.8323196545778732
},
{
"epoch": 1.6581059390048154,
"grad_norm": 0.22279949178529096,
"learning_rate": 5.461457433548176e-06,
"loss": 0.509765625,
"step": 517,
"token_acc": 0.8327975491177504
},
{
"epoch": 1.6613162118780096,
"grad_norm": 0.28204149423499697,
"learning_rate": 5.455434270007149e-06,
"loss": 0.4602457880973816,
"step": 518,
"token_acc": 0.8462769014072337
},
{
"epoch": 1.664526484751204,
"grad_norm": 0.19857347221417296,
"learning_rate": 5.449402675740499e-06,
"loss": 0.4959716796875,
"step": 519,
"token_acc": 0.8394384067964914
},
{
"epoch": 1.667736757624398,
"grad_norm": 0.19651498000755524,
"learning_rate": 5.443362676753047e-06,
"loss": 0.5087077021598816,
"step": 520,
"token_acc": 0.8350759960516969
},
{
"epoch": 1.6709470304975924,
"grad_norm": 0.22185821418458507,
"learning_rate": 5.4373142990858475e-06,
"loss": 0.522705078125,
"step": 521,
"token_acc": 0.8278426661895878
},
{
"epoch": 1.6741573033707864,
"grad_norm": 0.21252124551024912,
"learning_rate": 5.4312575688160834e-06,
"loss": 0.4680989682674408,
"step": 522,
"token_acc": 0.8462134699192948
},
{
"epoch": 1.6773675762439808,
"grad_norm": 0.195019164160461,
"learning_rate": 5.4251925120569444e-06,
"loss": 0.4916178584098816,
"step": 523,
"token_acc": 0.838224431417199
},
{
"epoch": 1.680577849117175,
"grad_norm": 0.1954125028857243,
"learning_rate": 5.4191191549575235e-06,
"loss": 0.5023600459098816,
"step": 524,
"token_acc": 0.8380056429926596
},
{
"epoch": 1.6837881219903692,
"grad_norm": 0.20550599221336258,
"learning_rate": 5.4130375237027e-06,
"loss": 0.4671224057674408,
"step": 525,
"token_acc": 0.848831486823923
},
{
"epoch": 1.6869983948635634,
"grad_norm": 0.2248887463696617,
"learning_rate": 5.406947644513022e-06,
"loss": 0.5421549677848816,
"step": 526,
"token_acc": 0.8226678940196518
},
{
"epoch": 1.6902086677367576,
"grad_norm": 0.20967538841981723,
"learning_rate": 5.400849543644603e-06,
"loss": 0.4853108823299408,
"step": 527,
"token_acc": 0.8407424576984543
},
{
"epoch": 1.6934189406099518,
"grad_norm": 0.20696923631557104,
"learning_rate": 5.394743247389001e-06,
"loss": 0.5126139521598816,
"step": 528,
"token_acc": 0.8323176383810414
},
{
"epoch": 1.696629213483146,
"grad_norm": 0.1939717195751301,
"learning_rate": 5.388628782073109e-06,
"loss": 0.4879150390625,
"step": 529,
"token_acc": 0.8364967403475068
},
{
"epoch": 1.6998394863563404,
"grad_norm": 0.20816101945087143,
"learning_rate": 5.382506174059041e-06,
"loss": 0.5115153193473816,
"step": 530,
"token_acc": 0.8326466761482434
},
{
"epoch": 1.7030497592295344,
"grad_norm": 0.2124824253704799,
"learning_rate": 5.376375449744016e-06,
"loss": 0.5000407099723816,
"step": 531,
"token_acc": 0.8352673820710945
},
{
"epoch": 1.7062600321027288,
"grad_norm": 0.20749686944009402,
"learning_rate": 5.370236635560248e-06,
"loss": 0.5111491084098816,
"step": 532,
"token_acc": 0.8330238015681216
},
{
"epoch": 1.709470304975923,
"grad_norm": 0.21515083840345234,
"learning_rate": 5.364089757974825e-06,
"loss": 0.4954427182674408,
"step": 533,
"token_acc": 0.8367462041670953
},
{
"epoch": 1.7126805778491172,
"grad_norm": 0.20415546098643264,
"learning_rate": 5.357934843489607e-06,
"loss": 0.4798177182674408,
"step": 534,
"token_acc": 0.843754797148208
},
{
"epoch": 1.7158908507223114,
"grad_norm": 0.20982427849673252,
"learning_rate": 5.3517719186411e-06,
"loss": 0.5299072265625,
"step": 535,
"token_acc": 0.824616460691853
},
{
"epoch": 1.7191011235955056,
"grad_norm": 0.19471497160695844,
"learning_rate": 5.3456010100003475e-06,
"loss": 0.4965006709098816,
"step": 536,
"token_acc": 0.8370856785490932
},
{
"epoch": 1.7223113964686998,
"grad_norm": 0.21374619126322247,
"learning_rate": 5.339422144172813e-06,
"loss": 0.477294921875,
"step": 537,
"token_acc": 0.8420311047825308
},
{
"epoch": 1.725521669341894,
"grad_norm": 0.2174473719352107,
"learning_rate": 5.333235347798271e-06,
"loss": 0.5260416865348816,
"step": 538,
"token_acc": 0.8281320331805363
},
{
"epoch": 1.7287319422150884,
"grad_norm": 0.18649816958919183,
"learning_rate": 5.327040647550682e-06,
"loss": 0.5035807490348816,
"step": 539,
"token_acc": 0.8344767359437466
},
{
"epoch": 1.7319422150882824,
"grad_norm": 0.2090737717426926,
"learning_rate": 5.320838070138088e-06,
"loss": 0.470703125,
"step": 540,
"token_acc": 0.8442940369697607
},
{
"epoch": 1.7351524879614768,
"grad_norm": 0.19379465591646067,
"learning_rate": 5.3146276423024916e-06,
"loss": 0.4705810546875,
"step": 541,
"token_acc": 0.8456177731583734
},
{
"epoch": 1.7383627608346708,
"grad_norm": 0.2089526337613315,
"learning_rate": 5.308409390819741e-06,
"loss": 0.4715983271598816,
"step": 542,
"token_acc": 0.8441938895802779
},
{
"epoch": 1.7415730337078652,
"grad_norm": 0.18693608307321663,
"learning_rate": 5.30218334249942e-06,
"loss": 0.4944254755973816,
"step": 543,
"token_acc": 0.8356434935135585
},
{
"epoch": 1.7447833065810594,
"grad_norm": 0.20269334957465848,
"learning_rate": 5.295949524184719e-06,
"loss": 0.4752604365348816,
"step": 544,
"token_acc": 0.8436504266306717
},
{
"epoch": 1.7479935794542536,
"grad_norm": 0.2138523875202584,
"learning_rate": 5.289707962752339e-06,
"loss": 0.4871826171875,
"step": 545,
"token_acc": 0.8399199325842195
},
{
"epoch": 1.7512038523274478,
"grad_norm": 0.21718445041799453,
"learning_rate": 5.283458685112356e-06,
"loss": 0.5204671621322632,
"step": 546,
"token_acc": 0.8286762334730732
},
{
"epoch": 1.754414125200642,
"grad_norm": 0.1844341076843839,
"learning_rate": 5.277201718208119e-06,
"loss": 0.46923828125,
"step": 547,
"token_acc": 0.8450318961286986
},
{
"epoch": 1.7576243980738364,
"grad_norm": 0.22487524878241977,
"learning_rate": 5.2709370890161275e-06,
"loss": 0.4814860224723816,
"step": 548,
"token_acc": 0.8417112567809547
},
{
"epoch": 1.7608346709470304,
"grad_norm": 0.33064628612141134,
"learning_rate": 5.264664824545915e-06,
"loss": 0.4855143427848816,
"step": 549,
"token_acc": 0.8410220085334438
},
{
"epoch": 1.7640449438202248,
"grad_norm": 0.23119216688303074,
"learning_rate": 5.258384951839937e-06,
"loss": 0.5033366084098816,
"step": 550,
"token_acc": 0.8337788557407072
},
{
"epoch": 1.7672552166934188,
"grad_norm": 0.18103127956229184,
"learning_rate": 5.252097497973448e-06,
"loss": 0.4745280146598816,
"step": 551,
"token_acc": 0.8419503784039933
},
{
"epoch": 1.7704654895666132,
"grad_norm": 0.23833812707404017,
"learning_rate": 5.245802490054391e-06,
"loss": 0.5206705927848816,
"step": 552,
"token_acc": 0.8301833414810544
},
{
"epoch": 1.7736757624398074,
"grad_norm": 0.19785227845167025,
"learning_rate": 5.239499955223275e-06,
"loss": 0.5011393427848816,
"step": 553,
"token_acc": 0.8353842042141859
},
{
"epoch": 1.7768860353130016,
"grad_norm": 0.19304852583653132,
"learning_rate": 5.233189920653065e-06,
"loss": 0.473388671875,
"step": 554,
"token_acc": 0.8432906503448305
},
{
"epoch": 1.7800963081861958,
"grad_norm": 0.17681531991205873,
"learning_rate": 5.226872413549056e-06,
"loss": 0.4810384213924408,
"step": 555,
"token_acc": 0.8424199560655011
},
{
"epoch": 1.78330658105939,
"grad_norm": 0.222715883875662,
"learning_rate": 5.220547461148762e-06,
"loss": 0.5354411005973816,
"step": 556,
"token_acc": 0.8273922278443526
},
{
"epoch": 1.7865168539325844,
"grad_norm": 0.21206983063444448,
"learning_rate": 5.2142150907217994e-06,
"loss": 0.5223795771598816,
"step": 557,
"token_acc": 0.8301004175651783
},
{
"epoch": 1.7897271268057784,
"grad_norm": 0.22442095897752923,
"learning_rate": 5.207875329569763e-06,
"loss": 0.5176595449447632,
"step": 558,
"token_acc": 0.8284297882890461
},
{
"epoch": 1.7929373996789728,
"grad_norm": 0.1920913997085975,
"learning_rate": 5.201528205026115e-06,
"loss": 0.4936116635799408,
"step": 559,
"token_acc": 0.8377082151513588
},
{
"epoch": 1.7961476725521668,
"grad_norm": 0.20956077689588662,
"learning_rate": 5.195173744456062e-06,
"loss": 0.516357421875,
"step": 560,
"token_acc": 0.8303534127757773
},
{
"epoch": 1.7993579454253612,
"grad_norm": 0.19020157762826823,
"learning_rate": 5.188811975256443e-06,
"loss": 0.4615071713924408,
"step": 561,
"token_acc": 0.8485206394794728
},
{
"epoch": 1.8025682182985554,
"grad_norm": 0.23222755465752573,
"learning_rate": 5.182442924855604e-06,
"loss": 0.5149332880973816,
"step": 562,
"token_acc": 0.8293938407833624
},
{
"epoch": 1.8057784911717496,
"grad_norm": 0.20156009297422847,
"learning_rate": 5.176066620713284e-06,
"loss": 0.4759928584098816,
"step": 563,
"token_acc": 0.8466319757217003
},
{
"epoch": 1.8089887640449438,
"grad_norm": 0.22296232660664472,
"learning_rate": 5.169683090320499e-06,
"loss": 0.4822184443473816,
"step": 564,
"token_acc": 0.8399143768271938
},
{
"epoch": 1.812199036918138,
"grad_norm": 0.21622319508582735,
"learning_rate": 5.163292361199418e-06,
"loss": 0.5107828974723816,
"step": 565,
"token_acc": 0.8304115767143985
},
{
"epoch": 1.8154093097913324,
"grad_norm": 0.18927721817608498,
"learning_rate": 5.156894460903245e-06,
"loss": 0.463134765625,
"step": 566,
"token_acc": 0.8469723153690331
},
{
"epoch": 1.8186195826645264,
"grad_norm": 0.2353979649474993,
"learning_rate": 5.1504894170161064e-06,
"loss": 0.5115560293197632,
"step": 567,
"token_acc": 0.8329049258062717
},
{
"epoch": 1.8218298555377208,
"grad_norm": 0.21250723283268672,
"learning_rate": 5.144077257152926e-06,
"loss": 0.5072428584098816,
"step": 568,
"token_acc": 0.8336668242384212
},
{
"epoch": 1.8250401284109148,
"grad_norm": 0.1973486114398243,
"learning_rate": 5.137658008959306e-06,
"loss": 0.5082194209098816,
"step": 569,
"token_acc": 0.8336676845589859
},
{
"epoch": 1.8282504012841092,
"grad_norm": 0.1891113883288137,
"learning_rate": 5.131231700111412e-06,
"loss": 0.5130615234375,
"step": 570,
"token_acc": 0.8317067866491336
},
{
"epoch": 1.8314606741573034,
"grad_norm": 0.1811753985751082,
"learning_rate": 5.124798358315848e-06,
"loss": 0.4811198115348816,
"step": 571,
"token_acc": 0.8425846984605
},
{
"epoch": 1.8346709470304976,
"grad_norm": 0.2251090035119156,
"learning_rate": 5.118358011309543e-06,
"loss": 0.5309244990348816,
"step": 572,
"token_acc": 0.8256256709608158
},
{
"epoch": 1.8378812199036918,
"grad_norm": 0.21647657132954062,
"learning_rate": 5.1119106868596285e-06,
"loss": 0.5172526240348816,
"step": 573,
"token_acc": 0.8320323617707402
},
{
"epoch": 1.841091492776886,
"grad_norm": 0.21316393483658413,
"learning_rate": 5.105456412763317e-06,
"loss": 0.4793294370174408,
"step": 574,
"token_acc": 0.8424825576937821
},
{
"epoch": 1.8443017656500804,
"grad_norm": 0.19319232333509864,
"learning_rate": 5.0989952168477845e-06,
"loss": 0.5150553584098816,
"step": 575,
"token_acc": 0.8323303300826294
},
{
"epoch": 1.8475120385232744,
"grad_norm": 0.1928503515554526,
"learning_rate": 5.092527126970049e-06,
"loss": 0.4501546323299408,
"step": 576,
"token_acc": 0.8541077268114217
},
{
"epoch": 1.8507223113964688,
"grad_norm": 0.22284305361976314,
"learning_rate": 5.086052171016856e-06,
"loss": 0.5310465693473816,
"step": 577,
"token_acc": 0.8276428803018245
},
{
"epoch": 1.8539325842696628,
"grad_norm": 0.19548158623323453,
"learning_rate": 5.079570376904545e-06,
"loss": 0.4556071162223816,
"step": 578,
"token_acc": 0.8501566517150454
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.20558731330933078,
"learning_rate": 5.073081772578948e-06,
"loss": 0.4671224057674408,
"step": 579,
"token_acc": 0.8471051563225707
},
{
"epoch": 1.8603531300160514,
"grad_norm": 0.24311090611247288,
"learning_rate": 5.06658638601525e-06,
"loss": 0.5386556386947632,
"step": 580,
"token_acc": 0.82159393957155
},
{
"epoch": 1.8635634028892456,
"grad_norm": 0.20978509717502059,
"learning_rate": 5.060084245217884e-06,
"loss": 0.5139974355697632,
"step": 581,
"token_acc": 0.8326921636335957
},
{
"epoch": 1.8667736757624398,
"grad_norm": 0.2685738757132505,
"learning_rate": 5.0535753782203984e-06,
"loss": 0.502685546875,
"step": 582,
"token_acc": 0.8341369158210662
},
{
"epoch": 1.869983948635634,
"grad_norm": 0.22599843111261422,
"learning_rate": 5.047059813085343e-06,
"loss": 0.4997151792049408,
"step": 583,
"token_acc": 0.8349442492783665
},
{
"epoch": 1.8731942215088284,
"grad_norm": 0.2066873416116241,
"learning_rate": 5.040537577904148e-06,
"loss": 0.4751790463924408,
"step": 584,
"token_acc": 0.8440484294631824
},
{
"epoch": 1.8764044943820224,
"grad_norm": 0.20099590959736363,
"learning_rate": 5.034008700796996e-06,
"loss": 0.459716796875,
"step": 585,
"token_acc": 0.8476779365774649
},
{
"epoch": 1.8796147672552168,
"grad_norm": 0.21775156659306671,
"learning_rate": 5.027473209912714e-06,
"loss": 0.5242513418197632,
"step": 586,
"token_acc": 0.8305953601755901
},
{
"epoch": 1.8828250401284108,
"grad_norm": 0.1890495343970011,
"learning_rate": 5.020931133428634e-06,
"loss": 0.4951171875,
"step": 587,
"token_acc": 0.836662561631107
},
{
"epoch": 1.8860353130016052,
"grad_norm": 0.21527350209501137,
"learning_rate": 5.014382499550491e-06,
"loss": 0.53173828125,
"step": 588,
"token_acc": 0.8277492238314829
},
{
"epoch": 1.8892455858747994,
"grad_norm": 0.23227571055684113,
"learning_rate": 5.007827336512283e-06,
"loss": 0.4992268979549408,
"step": 589,
"token_acc": 0.8370985707887444
},
{
"epoch": 1.8924558587479936,
"grad_norm": 0.21042210745504783,
"learning_rate": 5.001265672576164e-06,
"loss": 0.5146484375,
"step": 590,
"token_acc": 0.8314888476811728
},
{
"epoch": 1.8956661316211878,
"grad_norm": 0.23265310716693266,
"learning_rate": 4.994697536032316e-06,
"loss": 0.46502685546875,
"step": 591,
"token_acc": 0.8469708939265049
},
{
"epoch": 1.898876404494382,
"grad_norm": 0.19709877781742627,
"learning_rate": 4.988122955198823e-06,
"loss": 0.526123046875,
"step": 592,
"token_acc": 0.8270824317198007
},
{
"epoch": 1.9020866773675762,
"grad_norm": 0.2112550438235136,
"learning_rate": 4.981541958421558e-06,
"loss": 0.4967854917049408,
"step": 593,
"token_acc": 0.8364650593827906
},
{
"epoch": 1.9052969502407704,
"grad_norm": 0.30968707063177286,
"learning_rate": 4.974954574074051e-06,
"loss": 0.4849446713924408,
"step": 594,
"token_acc": 0.840327030844899
},
{
"epoch": 1.9085072231139648,
"grad_norm": 0.17649448504083676,
"learning_rate": 4.9683608305573775e-06,
"loss": 0.4849853515625,
"step": 595,
"token_acc": 0.8408735274724193
},
{
"epoch": 1.9117174959871588,
"grad_norm": 0.1931223434545565,
"learning_rate": 4.961760756300024e-06,
"loss": 0.488037109375,
"step": 596,
"token_acc": 0.8391003302273251
},
{
"epoch": 1.9149277688603532,
"grad_norm": 0.19289562481645892,
"learning_rate": 4.955154379757776e-06,
"loss": 0.4826253354549408,
"step": 597,
"token_acc": 0.8411694360867358
},
{
"epoch": 1.9181380417335474,
"grad_norm": 0.2027631539141705,
"learning_rate": 4.94854172941359e-06,
"loss": 0.466064453125,
"step": 598,
"token_acc": 0.8464385225571056
},
{
"epoch": 1.9213483146067416,
"grad_norm": 0.19077233268267782,
"learning_rate": 4.94192283377747e-06,
"loss": 0.4860026240348816,
"step": 599,
"token_acc": 0.8392299481728827
},
{
"epoch": 1.9245585874799358,
"grad_norm": 0.42816646022917115,
"learning_rate": 4.935297721386346e-06,
"loss": 0.53564453125,
"step": 600,
"token_acc": 0.8237000288030063
},
{
"epoch": 1.92776886035313,
"grad_norm": 0.18538141827096788,
"learning_rate": 4.928666420803953e-06,
"loss": 0.5026448965072632,
"step": 601,
"token_acc": 0.8352271388377014
},
{
"epoch": 1.9309791332263242,
"grad_norm": 0.18836200455506089,
"learning_rate": 4.922028960620707e-06,
"loss": 0.4625651240348816,
"step": 602,
"token_acc": 0.8463538880118882
},
{
"epoch": 1.9341894060995184,
"grad_norm": 3.8210876490438364,
"learning_rate": 4.915385369453577e-06,
"loss": 0.4749755859375,
"step": 603,
"token_acc": 0.8578914327444173
},
{
"epoch": 1.9373996789727128,
"grad_norm": 0.18920287412411185,
"learning_rate": 4.908735675945967e-06,
"loss": 0.474853515625,
"step": 604,
"token_acc": 0.8450734160217078
},
{
"epoch": 1.9406099518459068,
"grad_norm": 0.18893026029493232,
"learning_rate": 4.902079908767593e-06,
"loss": 0.4932454526424408,
"step": 605,
"token_acc": 0.8372230383233746
},
{
"epoch": 1.9438202247191012,
"grad_norm": 0.19603551127341237,
"learning_rate": 4.895418096614352e-06,
"loss": 0.4828287959098816,
"step": 606,
"token_acc": 0.8403485501639586
},
{
"epoch": 1.9470304975922952,
"grad_norm": 0.19978754345494049,
"learning_rate": 4.888750268208213e-06,
"loss": 0.472900390625,
"step": 607,
"token_acc": 0.8435411698135478
},
{
"epoch": 1.9502407704654896,
"grad_norm": 0.16106835196772315,
"learning_rate": 4.88207645229707e-06,
"loss": 0.4534098505973816,
"step": 608,
"token_acc": 0.8508265038647486
},
{
"epoch": 1.9534510433386838,
"grad_norm": 0.20706264592958304,
"learning_rate": 4.8753966776546435e-06,
"loss": 0.5162353515625,
"step": 609,
"token_acc": 0.8308325069891165
},
{
"epoch": 1.956661316211878,
"grad_norm": 0.20059240987585883,
"learning_rate": 4.868710973080339e-06,
"loss": 0.4964599609375,
"step": 610,
"token_acc": 0.8357304231324441
},
{
"epoch": 1.9598715890850722,
"grad_norm": 0.1861954935618554,
"learning_rate": 4.862019367399132e-06,
"loss": 0.49462890625,
"step": 611,
"token_acc": 0.8372238527765166
},
{
"epoch": 1.9630818619582664,
"grad_norm": 0.18074506091497944,
"learning_rate": 4.855321889461436e-06,
"loss": 0.4484049677848816,
"step": 612,
"token_acc": 0.8514336315420921
},
{
"epoch": 1.9662921348314608,
"grad_norm": 0.19022034618725248,
"learning_rate": 4.848618568142984e-06,
"loss": 0.4744466245174408,
"step": 613,
"token_acc": 0.8430896562046518
},
{
"epoch": 1.9695024077046548,
"grad_norm": 0.2108594040760019,
"learning_rate": 4.841909432344706e-06,
"loss": 0.5034586787223816,
"step": 614,
"token_acc": 0.8326992650209126
},
{
"epoch": 1.9727126805778492,
"grad_norm": 0.21038379203970337,
"learning_rate": 4.8351945109925935e-06,
"loss": 0.485595703125,
"step": 615,
"token_acc": 0.8397556248498602
},
{
"epoch": 1.9759229534510432,
"grad_norm": 0.19789035644639225,
"learning_rate": 4.82847383303759e-06,
"loss": 0.4847412109375,
"step": 616,
"token_acc": 0.840059550300896
},
{
"epoch": 1.9791332263242376,
"grad_norm": 0.18719895439265477,
"learning_rate": 4.821747427455452e-06,
"loss": 0.5332845449447632,
"step": 617,
"token_acc": 0.8263378386364979
},
{
"epoch": 1.9823434991974318,
"grad_norm": 0.20232274397686473,
"learning_rate": 4.815015323246633e-06,
"loss": 0.488525390625,
"step": 618,
"token_acc": 0.8385997224780295
},
{
"epoch": 1.985553772070626,
"grad_norm": 0.204908880630734,
"learning_rate": 4.808277549436157e-06,
"loss": 0.5272623896598816,
"step": 619,
"token_acc": 0.8287716233266886
},
{
"epoch": 1.9887640449438202,
"grad_norm": 0.18923767196243826,
"learning_rate": 4.801534135073487e-06,
"loss": 0.4903971552848816,
"step": 620,
"token_acc": 0.8410040926815862
},
{
"epoch": 1.9919743178170144,
"grad_norm": 0.19604318807580265,
"learning_rate": 4.794785109232412e-06,
"loss": 0.4755045771598816,
"step": 621,
"token_acc": 0.8440249094547587
},
{
"epoch": 1.9951845906902088,
"grad_norm": 0.19360171069988216,
"learning_rate": 4.788030501010908e-06,
"loss": 0.4340413510799408,
"step": 622,
"token_acc": 0.8566109542706771
},
{
"epoch": 1.9983948635634028,
"grad_norm": 0.21169639803746398,
"learning_rate": 4.781270339531025e-06,
"loss": 0.4751790463924408,
"step": 623,
"token_acc": 0.8425490126991452
},
{
"epoch": 2.0,
"grad_norm": 0.32969776523469907,
"learning_rate": 4.774504653938749e-06,
"loss": 0.5048828125,
"step": 624,
"token_acc": 0.835168437064428
},
{
"epoch": 2.0032102728731944,
"grad_norm": 0.2115590717349967,
"learning_rate": 4.767733473403889e-06,
"loss": 0.4543660581111908,
"step": 625,
"token_acc": 0.8494195117786585
},
{
"epoch": 2.0064205457463884,
"grad_norm": 0.20321529746345438,
"learning_rate": 4.760956827119941e-06,
"loss": 0.4671224057674408,
"step": 626,
"token_acc": 0.8450132819713727
},
{
"epoch": 2.009630818619583,
"grad_norm": 0.19687550291202977,
"learning_rate": 4.754174744303969e-06,
"loss": 0.4146728515625,
"step": 627,
"token_acc": 0.8618291589277594
},
{
"epoch": 2.012841091492777,
"grad_norm": 0.2523161476109027,
"learning_rate": 4.747387254196472e-06,
"loss": 0.4659017026424408,
"step": 628,
"token_acc": 0.8450511225888058
},
{
"epoch": 2.016051364365971,
"grad_norm": 0.2275898129138819,
"learning_rate": 4.740594386061269e-06,
"loss": 0.4815673828125,
"step": 629,
"token_acc": 0.8391848027771384
},
{
"epoch": 2.019261637239165,
"grad_norm": 0.20945775756940324,
"learning_rate": 4.733796169185358e-06,
"loss": 0.4023844599723816,
"step": 630,
"token_acc": 0.8670760098186089
},
{
"epoch": 2.0224719101123596,
"grad_norm": 0.18606275376579812,
"learning_rate": 4.726992632878804e-06,
"loss": 0.4026286005973816,
"step": 631,
"token_acc": 0.8687089536923019
},
{
"epoch": 2.0256821829855536,
"grad_norm": 0.24944135985122234,
"learning_rate": 4.7201838064746045e-06,
"loss": 0.4796549677848816,
"step": 632,
"token_acc": 0.8415990151837972
},
{
"epoch": 2.028892455858748,
"grad_norm": 0.21927512660431955,
"learning_rate": 4.713369719328564e-06,
"loss": 0.4185384213924408,
"step": 633,
"token_acc": 0.8599196770434493
},
{
"epoch": 2.0321027287319424,
"grad_norm": 0.20423909469957818,
"learning_rate": 4.706550400819168e-06,
"loss": 0.4786173701286316,
"step": 634,
"token_acc": 0.8406345834834711
},
{
"epoch": 2.0353130016051364,
"grad_norm": 0.2110739290103949,
"learning_rate": 4.699725880347459e-06,
"loss": 0.4677327573299408,
"step": 635,
"token_acc": 0.842199147099176
},
{
"epoch": 2.038523274478331,
"grad_norm": 0.2241130535375935,
"learning_rate": 4.692896187336904e-06,
"loss": 0.45947265625,
"step": 636,
"token_acc": 0.8468612098922518
},
{
"epoch": 2.041733547351525,
"grad_norm": 0.21080453370461158,
"learning_rate": 4.686061351233276e-06,
"loss": 0.4320882260799408,
"step": 637,
"token_acc": 0.8533629402120904
},
{
"epoch": 2.044943820224719,
"grad_norm": 0.32144418793695584,
"learning_rate": 4.6792214015045174e-06,
"loss": 0.4599609375,
"step": 638,
"token_acc": 0.8464602124837394
},
{
"epoch": 2.048154093097913,
"grad_norm": 0.2078479815708183,
"learning_rate": 4.672376367640618e-06,
"loss": 0.44677734375,
"step": 639,
"token_acc": 0.8507954992333089
},
{
"epoch": 2.0513643659711076,
"grad_norm": 0.19777970399487305,
"learning_rate": 4.6655262791534915e-06,
"loss": 0.447021484375,
"step": 640,
"token_acc": 0.8517660703332297
},
{
"epoch": 2.0545746388443016,
"grad_norm": 0.23410952163019927,
"learning_rate": 4.658671165576841e-06,
"loss": 0.48828125,
"step": 641,
"token_acc": 0.8380002042953089
},
{
"epoch": 2.057784911717496,
"grad_norm": 0.23586584692038676,
"learning_rate": 4.6518110564660345e-06,
"loss": 0.50079345703125,
"step": 642,
"token_acc": 0.8353889961913865
},
{
"epoch": 2.0609951845906904,
"grad_norm": 0.18434627206392934,
"learning_rate": 4.644945981397981e-06,
"loss": 0.4385986328125,
"step": 643,
"token_acc": 0.8545233421570331
},
{
"epoch": 2.0642054574638844,
"grad_norm": 0.24034833905211023,
"learning_rate": 4.6380759699709955e-06,
"loss": 0.4774169921875,
"step": 644,
"token_acc": 0.8411591494593856
},
{
"epoch": 2.067415730337079,
"grad_norm": 0.20241650168423786,
"learning_rate": 4.631201051804681e-06,
"loss": 0.4654134213924408,
"step": 645,
"token_acc": 0.8455160407197725
},
{
"epoch": 2.070626003210273,
"grad_norm": 0.2046900122680403,
"learning_rate": 4.6243212565397895e-06,
"loss": 0.473876953125,
"step": 646,
"token_acc": 0.8431059778612698
},
{
"epoch": 2.073836276083467,
"grad_norm": 0.2417909269891525,
"learning_rate": 4.6174366138381075e-06,
"loss": 0.5037841796875,
"step": 647,
"token_acc": 0.8344951828933971
},
{
"epoch": 2.077046548956661,
"grad_norm": 0.22303446814288203,
"learning_rate": 4.610547153382314e-06,
"loss": 0.4219563901424408,
"step": 648,
"token_acc": 0.8592609646904761
},
{
"epoch": 2.0802568218298556,
"grad_norm": 0.18113955215323252,
"learning_rate": 4.6036529048758625e-06,
"loss": 0.4479573667049408,
"step": 649,
"token_acc": 0.8503887564028392
},
{
"epoch": 2.0834670947030496,
"grad_norm": 0.20016249384220633,
"learning_rate": 4.596753898042852e-06,
"loss": 0.4075927734375,
"step": 650,
"token_acc": 0.8642372361853052
},
{
"epoch": 2.086677367576244,
"grad_norm": 0.2758564437294458,
"learning_rate": 4.589850162627892e-06,
"loss": 0.4864909052848816,
"step": 651,
"token_acc": 0.8406675969996761
},
{
"epoch": 2.0898876404494384,
"grad_norm": 0.20650339064451034,
"learning_rate": 4.582941728395984e-06,
"loss": 0.4553629755973816,
"step": 652,
"token_acc": 0.8495991193830745
},
{
"epoch": 2.0930979133226324,
"grad_norm": 0.2135335946405334,
"learning_rate": 4.5760286251323835e-06,
"loss": 0.4881998896598816,
"step": 653,
"token_acc": 0.838402452002496
},
{
"epoch": 2.096308186195827,
"grad_norm": 0.18863240697781536,
"learning_rate": 4.56911088264248e-06,
"loss": 0.4012858271598816,
"step": 654,
"token_acc": 0.8660901925722599
},
{
"epoch": 2.099518459069021,
"grad_norm": 0.1892004302680043,
"learning_rate": 4.562188530751662e-06,
"loss": 0.4156901240348816,
"step": 655,
"token_acc": 0.8613871908997803
},
{
"epoch": 2.102728731942215,
"grad_norm": 0.20815692597784538,
"learning_rate": 4.555261599305191e-06,
"loss": 0.4726969599723816,
"step": 656,
"token_acc": 0.8432399054351278
},
{
"epoch": 2.105939004815409,
"grad_norm": 0.2005084264907335,
"learning_rate": 4.548330118168078e-06,
"loss": 0.453369140625,
"step": 657,
"token_acc": 0.8475700262574954
},
{
"epoch": 2.1091492776886036,
"grad_norm": 0.1976983371473078,
"learning_rate": 4.5413941172249414e-06,
"loss": 0.47509765625,
"step": 658,
"token_acc": 0.8436715912866749
},
{
"epoch": 2.1123595505617976,
"grad_norm": 0.20121465339934608,
"learning_rate": 4.534453626379895e-06,
"loss": 0.4298502802848816,
"step": 659,
"token_acc": 0.8559626604434072
},
{
"epoch": 2.115569823434992,
"grad_norm": 0.2188682255969682,
"learning_rate": 4.527508675556402e-06,
"loss": 0.4773356318473816,
"step": 660,
"token_acc": 0.8424510479284684
},
{
"epoch": 2.1187800963081864,
"grad_norm": 0.196298873218584,
"learning_rate": 4.520559294697162e-06,
"loss": 0.4510905146598816,
"step": 661,
"token_acc": 0.8485553706957581
},
{
"epoch": 2.1219903691813804,
"grad_norm": 0.20127195747756338,
"learning_rate": 4.513605513763971e-06,
"loss": 0.441162109375,
"step": 662,
"token_acc": 0.8537049095552068
},
{
"epoch": 2.125200642054575,
"grad_norm": 0.20422156474354985,
"learning_rate": 4.5066473627375944e-06,
"loss": 0.430908203125,
"step": 663,
"token_acc": 0.8582405961292723
},
{
"epoch": 2.128410914927769,
"grad_norm": 0.19325151540226462,
"learning_rate": 4.499684871617642e-06,
"loss": 0.4597981870174408,
"step": 664,
"token_acc": 0.8459079539061464
},
{
"epoch": 2.131621187800963,
"grad_norm": 0.21773389555672684,
"learning_rate": 4.492718070422433e-06,
"loss": 0.4646809995174408,
"step": 665,
"token_acc": 0.846901257790134
},
{
"epoch": 2.134831460674157,
"grad_norm": 0.21424623915215318,
"learning_rate": 4.4857469891888724e-06,
"loss": 0.4659423828125,
"step": 666,
"token_acc": 0.8441213392519439
},
{
"epoch": 2.1380417335473516,
"grad_norm": 0.2009249890431782,
"learning_rate": 4.4787716579723136e-06,
"loss": 0.4574788510799408,
"step": 667,
"token_acc": 0.8479140475611946
},
{
"epoch": 2.1412520064205456,
"grad_norm": 0.1754109779980525,
"learning_rate": 4.471792106846437e-06,
"loss": 0.4366862177848816,
"step": 668,
"token_acc": 0.8545431885861233
},
{
"epoch": 2.14446227929374,
"grad_norm": 0.19503766537161335,
"learning_rate": 4.4648083659031164e-06,
"loss": 0.4374593198299408,
"step": 669,
"token_acc": 0.8539004313079454
},
{
"epoch": 2.1476725521669344,
"grad_norm": 0.20520643998076005,
"learning_rate": 4.45782046525229e-06,
"loss": 0.4506022334098816,
"step": 670,
"token_acc": 0.8490898021870575
},
{
"epoch": 2.1508828250401284,
"grad_norm": 0.2149478358921993,
"learning_rate": 4.450828435021828e-06,
"loss": 0.4812418818473816,
"step": 671,
"token_acc": 0.8411253322095418
},
{
"epoch": 2.154093097913323,
"grad_norm": 0.24205969670556346,
"learning_rate": 4.443832305357409e-06,
"loss": 0.4271240234375,
"step": 672,
"token_acc": 0.8578997251920647
},
{
"epoch": 2.157303370786517,
"grad_norm": 0.2004422310098208,
"learning_rate": 4.436832106422381e-06,
"loss": 0.4443766474723816,
"step": 673,
"token_acc": 0.8535244828617516
},
{
"epoch": 2.160513643659711,
"grad_norm": 0.21111911297117497,
"learning_rate": 4.429827868397641e-06,
"loss": 0.49969482421875,
"step": 674,
"token_acc": 0.835968496333625
},
{
"epoch": 2.163723916532905,
"grad_norm": 0.19869004633584814,
"learning_rate": 4.422819621481496e-06,
"loss": 0.4911295771598816,
"step": 675,
"token_acc": 0.8370750860720663
},
{
"epoch": 2.1669341894060996,
"grad_norm": 0.18351061549754683,
"learning_rate": 4.415807395889543e-06,
"loss": 0.4704183042049408,
"step": 676,
"token_acc": 0.8436283717276069
},
{
"epoch": 2.1701444622792936,
"grad_norm": 0.20781851776302881,
"learning_rate": 4.408791221854526e-06,
"loss": 0.4776204526424408,
"step": 677,
"token_acc": 0.8419597303153555
},
{
"epoch": 2.173354735152488,
"grad_norm": 0.19484554074620664,
"learning_rate": 4.401771129626217e-06,
"loss": 0.4590250849723816,
"step": 678,
"token_acc": 0.8474181663804423
},
{
"epoch": 2.176565008025682,
"grad_norm": 0.19215576220084854,
"learning_rate": 4.39474714947128e-06,
"loss": 0.4247233271598816,
"step": 679,
"token_acc": 0.857625869274352
},
{
"epoch": 2.1797752808988764,
"grad_norm": 0.19807829984933298,
"learning_rate": 4.38771931167314e-06,
"loss": 0.3979085385799408,
"step": 680,
"token_acc": 0.8667534544040275
},
{
"epoch": 2.182985553772071,
"grad_norm": 0.20487202093763737,
"learning_rate": 4.380687646531856e-06,
"loss": 0.4185791015625,
"step": 681,
"token_acc": 0.8599961127795507
},
{
"epoch": 2.186195826645265,
"grad_norm": 0.2158434149497215,
"learning_rate": 4.373652184363989e-06,
"loss": 0.4339803159236908,
"step": 682,
"token_acc": 0.8547810099872699
},
{
"epoch": 2.189406099518459,
"grad_norm": 0.19450239119508037,
"learning_rate": 4.366612955502466e-06,
"loss": 0.4608968198299408,
"step": 683,
"token_acc": 0.8457622285510433
},
{
"epoch": 2.192616372391653,
"grad_norm": 8.151322593196062,
"learning_rate": 4.35956999029646e-06,
"loss": 0.6525065302848816,
"step": 684,
"token_acc": 0.8290942820194461
},
{
"epoch": 2.1958266452648476,
"grad_norm": 0.2301432405581243,
"learning_rate": 4.352523319111249e-06,
"loss": 0.3920084834098816,
"step": 685,
"token_acc": 0.8705705882287679
},
{
"epoch": 2.1990369181380416,
"grad_norm": 0.19177589079146826,
"learning_rate": 4.34547297232809e-06,
"loss": 0.4669596552848816,
"step": 686,
"token_acc": 0.8443450342652492
},
{
"epoch": 2.202247191011236,
"grad_norm": 0.19407418830797563,
"learning_rate": 4.338418980344086e-06,
"loss": 0.44915771484375,
"step": 687,
"token_acc": 0.849480469239529
},
{
"epoch": 2.20545746388443,
"grad_norm": 0.21316814997568834,
"learning_rate": 4.331361373572058e-06,
"loss": 0.4639892578125,
"step": 688,
"token_acc": 0.8474408172948753
},
{
"epoch": 2.2086677367576244,
"grad_norm": 0.2011592865916337,
"learning_rate": 4.324300182440413e-06,
"loss": 0.4359130859375,
"step": 689,
"token_acc": 0.8551578038190392
},
{
"epoch": 2.211878009630819,
"grad_norm": 0.19686597083465757,
"learning_rate": 4.317235437393007e-06,
"loss": 0.4206950068473816,
"step": 690,
"token_acc": 0.8606955810759271
},
{
"epoch": 2.215088282504013,
"grad_norm": 0.1967845907073751,
"learning_rate": 4.310167168889025e-06,
"loss": 0.460693359375,
"step": 691,
"token_acc": 0.8455688388190843
},
{
"epoch": 2.218298555377207,
"grad_norm": 0.22090097209538617,
"learning_rate": 4.303095407402835e-06,
"loss": 0.4521484375,
"step": 692,
"token_acc": 0.8506517253734294
},
{
"epoch": 2.221508828250401,
"grad_norm": 0.21677968474594916,
"learning_rate": 4.296020183423873e-06,
"loss": 0.458984375,
"step": 693,
"token_acc": 0.8488590619141185
},
{
"epoch": 2.2247191011235956,
"grad_norm": 0.25446383871644274,
"learning_rate": 4.288941527456497e-06,
"loss": 0.47802734375,
"step": 694,
"token_acc": 0.8432776673814599
},
{
"epoch": 2.2279293739967896,
"grad_norm": 0.4021141479368582,
"learning_rate": 4.281859470019866e-06,
"loss": 0.4942220151424408,
"step": 695,
"token_acc": 0.8364409878451377
},
{
"epoch": 2.231139646869984,
"grad_norm": 0.21117603453855527,
"learning_rate": 4.274774041647802e-06,
"loss": 0.4697265625,
"step": 696,
"token_acc": 0.8429563959002171
},
{
"epoch": 2.234349919743178,
"grad_norm": 0.20475132687397243,
"learning_rate": 4.267685272888662e-06,
"loss": 0.4669596552848816,
"step": 697,
"token_acc": 0.8455216817521362
},
{
"epoch": 2.2375601926163724,
"grad_norm": 0.21314161795787429,
"learning_rate": 4.260593194305204e-06,
"loss": 0.4713541865348816,
"step": 698,
"token_acc": 0.8424913415995721
},
{
"epoch": 2.240770465489567,
"grad_norm": 0.20683152437768806,
"learning_rate": 4.253497836474453e-06,
"loss": 0.457763671875,
"step": 699,
"token_acc": 0.8489541017992153
},
{
"epoch": 2.243980738362761,
"grad_norm": 0.17876551858602532,
"learning_rate": 4.2463992299875805e-06,
"loss": 0.4122314453125,
"step": 700,
"token_acc": 0.8616135939943479
},
{
"epoch": 2.247191011235955,
"grad_norm": 0.19472552746773453,
"learning_rate": 4.239297405449754e-06,
"loss": 0.4484049677848816,
"step": 701,
"token_acc": 0.8514272976743756
},
{
"epoch": 2.250401284109149,
"grad_norm": 0.4749107558904206,
"learning_rate": 4.232192393480025e-06,
"loss": 0.449462890625,
"step": 702,
"token_acc": 0.8503469105115239
},
{
"epoch": 2.2536115569823436,
"grad_norm": 0.20430373824253736,
"learning_rate": 4.22508422471118e-06,
"loss": 0.4173991084098816,
"step": 703,
"token_acc": 0.8616719451442656
},
{
"epoch": 2.2568218298555376,
"grad_norm": 0.17014621877440333,
"learning_rate": 4.2179729297896215e-06,
"loss": 0.390655517578125,
"step": 704,
"token_acc": 0.8682284140866495
},
{
"epoch": 2.260032102728732,
"grad_norm": 0.2702584195216989,
"learning_rate": 4.210858539375225e-06,
"loss": 0.46490478515625,
"step": 705,
"token_acc": 0.8425552069903721
},
{
"epoch": 2.263242375601926,
"grad_norm": 0.18780220901872502,
"learning_rate": 4.203741084141217e-06,
"loss": 0.4297078549861908,
"step": 706,
"token_acc": 0.855920493511375
},
{
"epoch": 2.2664526484751204,
"grad_norm": 0.19912047965979465,
"learning_rate": 4.196620594774033e-06,
"loss": 0.4477742612361908,
"step": 707,
"token_acc": 0.8516944724579766
},
{
"epoch": 2.2696629213483144,
"grad_norm": 0.18743590774241778,
"learning_rate": 4.189497101973194e-06,
"loss": 0.4573567807674408,
"step": 708,
"token_acc": 0.8472192071315597
},
{
"epoch": 2.272873194221509,
"grad_norm": 0.196464694698763,
"learning_rate": 4.182370636451168e-06,
"loss": 0.470947265625,
"step": 709,
"token_acc": 0.8416422255356567
},
{
"epoch": 2.276083467094703,
"grad_norm": 0.2003085439880675,
"learning_rate": 4.175241228933239e-06,
"loss": 0.4788818359375,
"step": 710,
"token_acc": 0.840434185236257
},
{
"epoch": 2.279293739967897,
"grad_norm": 0.2707263062539891,
"learning_rate": 4.168108910157378e-06,
"loss": 0.4853515625,
"step": 711,
"token_acc": 0.8384725258346581
},
{
"epoch": 2.2825040128410916,
"grad_norm": 0.20944891376742802,
"learning_rate": 4.160973710874105e-06,
"loss": 0.4217529296875,
"step": 712,
"token_acc": 0.8596543309402678
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.21908698982662267,
"learning_rate": 4.153835661846362e-06,
"loss": 0.4789225459098816,
"step": 713,
"token_acc": 0.8408887490165224
},
{
"epoch": 2.28892455858748,
"grad_norm": 0.20629193977552543,
"learning_rate": 4.146694793849371e-06,
"loss": 0.4779866635799408,
"step": 714,
"token_acc": 0.8418222501838504
},
{
"epoch": 2.292134831460674,
"grad_norm": 0.3562937509884993,
"learning_rate": 4.139551137670518e-06,
"loss": 0.40576171875,
"step": 715,
"token_acc": 0.8659504454433432
},
{
"epoch": 2.2953451043338684,
"grad_norm": 0.20903340846333765,
"learning_rate": 4.132404724109203e-06,
"loss": 0.425048828125,
"step": 716,
"token_acc": 0.8585834387938246
},
{
"epoch": 2.2985553772070624,
"grad_norm": 0.20841632184926223,
"learning_rate": 4.125255583976713e-06,
"loss": 0.489990234375,
"step": 717,
"token_acc": 0.837642594100864
},
{
"epoch": 2.301765650080257,
"grad_norm": 0.19144223083979675,
"learning_rate": 4.118103748096096e-06,
"loss": 0.4186604917049408,
"step": 718,
"token_acc": 0.861624575018568
},
{
"epoch": 2.304975922953451,
"grad_norm": 0.18462822644417673,
"learning_rate": 4.110949247302018e-06,
"loss": 0.4283040463924408,
"step": 719,
"token_acc": 0.8584933901493453
},
{
"epoch": 2.308186195826645,
"grad_norm": 0.23122194639269375,
"learning_rate": 4.103792112440638e-06,
"loss": 0.4649251401424408,
"step": 720,
"token_acc": 0.8475111352464667
},
{
"epoch": 2.3113964686998396,
"grad_norm": 0.20138008057163814,
"learning_rate": 4.096632374369469e-06,
"loss": 0.471435546875,
"step": 721,
"token_acc": 0.8418223324087484
},
{
"epoch": 2.3146067415730336,
"grad_norm": 0.18998635568779773,
"learning_rate": 4.089470063957249e-06,
"loss": 0.4586588740348816,
"step": 722,
"token_acc": 0.8465928619671051
},
{
"epoch": 2.317817014446228,
"grad_norm": 0.20400593017285748,
"learning_rate": 4.082305212083804e-06,
"loss": 0.4299723505973816,
"step": 723,
"token_acc": 0.8568945784409309
},
{
"epoch": 2.321027287319422,
"grad_norm": 0.20762724598838142,
"learning_rate": 4.075137849639922e-06,
"loss": 0.4675700068473816,
"step": 724,
"token_acc": 0.8452025635686357
},
{
"epoch": 2.3242375601926164,
"grad_norm": 0.1880619188409803,
"learning_rate": 4.0679680075272115e-06,
"loss": 0.456787109375,
"step": 725,
"token_acc": 0.8487387689720907
},
{
"epoch": 2.3274478330658104,
"grad_norm": 0.19835165112775277,
"learning_rate": 4.060795716657973e-06,
"loss": 0.4452311396598816,
"step": 726,
"token_acc": 0.8512359014143344
},
{
"epoch": 2.330658105939005,
"grad_norm": 0.2121134079480636,
"learning_rate": 4.053621007955064e-06,
"loss": 0.4849446713924408,
"step": 727,
"token_acc": 0.8393768648219405
},
{
"epoch": 2.333868378812199,
"grad_norm": 0.22909333366098328,
"learning_rate": 4.046443912351768e-06,
"loss": 0.4340413510799408,
"step": 728,
"token_acc": 0.8557074860326944
},
{
"epoch": 2.337078651685393,
"grad_norm": 0.2203957460996992,
"learning_rate": 4.039264460791657e-06,
"loss": 0.4347737729549408,
"step": 729,
"token_acc": 0.8545071956839781
},
{
"epoch": 2.3402889245585876,
"grad_norm": 0.17220469104421543,
"learning_rate": 4.032082684228464e-06,
"loss": 0.458251953125,
"step": 730,
"token_acc": 0.8449290110193061
},
{
"epoch": 2.3434991974317816,
"grad_norm": 0.19203683386704218,
"learning_rate": 4.0248986136259406e-06,
"loss": 0.4306233823299408,
"step": 731,
"token_acc": 0.856137690285591
},
{
"epoch": 2.346709470304976,
"grad_norm": 0.2031216819897101,
"learning_rate": 4.017712279957736e-06,
"loss": 0.442138671875,
"step": 732,
"token_acc": 0.8552035462444415
},
{
"epoch": 2.34991974317817,
"grad_norm": 0.19608268201438736,
"learning_rate": 4.01052371420725e-06,
"loss": 0.4835408627986908,
"step": 733,
"token_acc": 0.8386815758131784
},
{
"epoch": 2.3531300160513644,
"grad_norm": 0.21101181910808853,
"learning_rate": 4.003332947367512e-06,
"loss": 0.4925944209098816,
"step": 734,
"token_acc": 0.8369294385129835
},
{
"epoch": 2.3563402889245584,
"grad_norm": 0.20991820512037176,
"learning_rate": 3.996140010441033e-06,
"loss": 0.4532063901424408,
"step": 735,
"token_acc": 0.849036176457023
},
{
"epoch": 2.359550561797753,
"grad_norm": 0.18535631476063846,
"learning_rate": 3.988944934439692e-06,
"loss": 0.501953125,
"step": 736,
"token_acc": 0.8317749768371802
},
{
"epoch": 2.362760834670947,
"grad_norm": 0.18506516727500621,
"learning_rate": 3.981747750384578e-06,
"loss": 0.4610188901424408,
"step": 737,
"token_acc": 0.845921960373089
},
{
"epoch": 2.365971107544141,
"grad_norm": 0.208056339714116,
"learning_rate": 3.974548489305876e-06,
"loss": 0.4741618037223816,
"step": 738,
"token_acc": 0.8420869682092947
},
{
"epoch": 2.3691813804173356,
"grad_norm": 0.20281908769650198,
"learning_rate": 3.9673471822427244e-06,
"loss": 0.4227702021598816,
"step": 739,
"token_acc": 0.8587991226400339
},
{
"epoch": 2.3723916532905296,
"grad_norm": 0.1996124616676339,
"learning_rate": 3.960143860243085e-06,
"loss": 0.4525553584098816,
"step": 740,
"token_acc": 0.848268109039626
},
{
"epoch": 2.375601926163724,
"grad_norm": 0.20996239997114738,
"learning_rate": 3.952938554363601e-06,
"loss": 0.4862060546875,
"step": 741,
"token_acc": 0.8388721388484253
},
{
"epoch": 2.378812199036918,
"grad_norm": 0.2013275896535193,
"learning_rate": 3.9457312956694736e-06,
"loss": 0.4324544370174408,
"step": 742,
"token_acc": 0.855948261073139
},
{
"epoch": 2.3820224719101124,
"grad_norm": 0.20345530910490214,
"learning_rate": 3.938522115234324e-06,
"loss": 0.486328125,
"step": 743,
"token_acc": 0.8371477100211938
},
{
"epoch": 2.3852327447833064,
"grad_norm": 0.20546084474227075,
"learning_rate": 3.931311044140055e-06,
"loss": 0.4718017578125,
"step": 744,
"token_acc": 0.8443781924795707
},
{
"epoch": 2.388443017656501,
"grad_norm": 0.21296384006021174,
"learning_rate": 3.924098113476726e-06,
"loss": 0.4545084834098816,
"step": 745,
"token_acc": 0.8494246157263653
},
{
"epoch": 2.391653290529695,
"grad_norm": 0.2221018040614752,
"learning_rate": 3.916883354342406e-06,
"loss": 0.5028483271598816,
"step": 746,
"token_acc": 0.8337560616098695
},
{
"epoch": 2.394863563402889,
"grad_norm": 0.3265950556384243,
"learning_rate": 3.9096667978430576e-06,
"loss": 0.4358724057674408,
"step": 747,
"token_acc": 0.8543744663599452
},
{
"epoch": 2.3980738362760836,
"grad_norm": 0.20055502563884153,
"learning_rate": 3.902448475092382e-06,
"loss": 0.4836018979549408,
"step": 748,
"token_acc": 0.8387875378488737
},
{
"epoch": 2.4012841091492776,
"grad_norm": 0.17420122955205622,
"learning_rate": 3.895228417211706e-06,
"loss": 0.420654296875,
"step": 749,
"token_acc": 0.8596989345596846
},
{
"epoch": 2.404494382022472,
"grad_norm": 0.19038474616744408,
"learning_rate": 3.888006655329828e-06,
"loss": 0.4512125849723816,
"step": 750,
"token_acc": 0.8501353108361196
},
{
"epoch": 2.407704654895666,
"grad_norm": 0.1809603835495357,
"learning_rate": 3.880783220582899e-06,
"loss": 0.40673828125,
"step": 751,
"token_acc": 0.865091380129019
},
{
"epoch": 2.4109149277688604,
"grad_norm": 0.19655293867885018,
"learning_rate": 3.87355814411428e-06,
"loss": 0.4643961787223816,
"step": 752,
"token_acc": 0.8461353315018977
},
{
"epoch": 2.4141252006420544,
"grad_norm": 0.21727278049897028,
"learning_rate": 3.86633145707441e-06,
"loss": 0.4464518427848816,
"step": 753,
"token_acc": 0.8512993522371843
},
{
"epoch": 2.417335473515249,
"grad_norm": 0.19141762810827992,
"learning_rate": 3.8591031906206735e-06,
"loss": 0.4381917417049408,
"step": 754,
"token_acc": 0.8541728933456112
},
{
"epoch": 2.420545746388443,
"grad_norm": 0.1989953727497561,
"learning_rate": 3.851873375917263e-06,
"loss": 0.4230143427848816,
"step": 755,
"token_acc": 0.8586672776916691
},
{
"epoch": 2.423756019261637,
"grad_norm": 0.23181582051230495,
"learning_rate": 3.8446420441350484e-06,
"loss": 0.4677327573299408,
"step": 756,
"token_acc": 0.8457994937242906
},
{
"epoch": 2.4269662921348316,
"grad_norm": 0.23951874435691276,
"learning_rate": 3.837409226451436e-06,
"loss": 0.4405517578125,
"step": 757,
"token_acc": 0.854444105779012
},
{
"epoch": 2.4301765650080256,
"grad_norm": 0.2528461108687528,
"learning_rate": 3.830174954050243e-06,
"loss": 0.460693359375,
"step": 758,
"token_acc": 0.8479104216428002
},
{
"epoch": 2.43338683788122,
"grad_norm": 0.1975669389803627,
"learning_rate": 3.822939258121557e-06,
"loss": 0.4350179135799408,
"step": 759,
"token_acc": 0.8552898624331037
},
{
"epoch": 2.436597110754414,
"grad_norm": 0.19115008055065782,
"learning_rate": 3.815702169861602e-06,
"loss": 0.436279296875,
"step": 760,
"token_acc": 0.854701305830316
},
{
"epoch": 2.4398073836276084,
"grad_norm": 0.19506918799904394,
"learning_rate": 3.808463720472607e-06,
"loss": 0.4281005859375,
"step": 761,
"token_acc": 0.8570146038416017
},
{
"epoch": 2.4430176565008024,
"grad_norm": 0.19842708334907894,
"learning_rate": 3.8012239411626655e-06,
"loss": 0.4752604365348816,
"step": 762,
"token_acc": 0.8403435924721215
},
{
"epoch": 2.446227929373997,
"grad_norm": 0.24844771648018837,
"learning_rate": 3.79398286314561e-06,
"loss": 0.4596354365348816,
"step": 763,
"token_acc": 0.8490057426109398
},
{
"epoch": 2.449438202247191,
"grad_norm": 0.19715040303854942,
"learning_rate": 3.7867405176408694e-06,
"loss": 0.4914957880973816,
"step": 764,
"token_acc": 0.8356615134323727
},
{
"epoch": 2.452648475120385,
"grad_norm": 0.2202478281163918,
"learning_rate": 3.7794969358733367e-06,
"loss": 0.4671224057674408,
"step": 765,
"token_acc": 0.8444183398261912
},
{
"epoch": 2.4558587479935796,
"grad_norm": 0.20459951635979012,
"learning_rate": 3.772252149073237e-06,
"loss": 0.4458821713924408,
"step": 766,
"token_acc": 0.8511224546199564
},
{
"epoch": 2.4590690208667736,
"grad_norm": 0.22450217665761618,
"learning_rate": 3.765006188475989e-06,
"loss": 0.4777018427848816,
"step": 767,
"token_acc": 0.8393996073719856
},
{
"epoch": 2.462279293739968,
"grad_norm": 0.19987629475264548,
"learning_rate": 3.7577590853220737e-06,
"loss": 0.463134765625,
"step": 768,
"token_acc": 0.8434089581091296
},
{
"epoch": 2.465489566613162,
"grad_norm": 0.20397409384425116,
"learning_rate": 3.7505108708568964e-06,
"loss": 0.438232421875,
"step": 769,
"token_acc": 0.8527221354299462
},
{
"epoch": 2.4686998394863564,
"grad_norm": 0.18616229570938478,
"learning_rate": 3.7432615763306564e-06,
"loss": 0.4168294370174408,
"step": 770,
"token_acc": 0.8615336290326948
},
{
"epoch": 2.4719101123595504,
"grad_norm": 0.19419952375362187,
"learning_rate": 3.736011232998206e-06,
"loss": 0.4766438901424408,
"step": 771,
"token_acc": 0.8408157756094587
},
{
"epoch": 2.475120385232745,
"grad_norm": 0.22402713438110902,
"learning_rate": 3.7287598721189225e-06,
"loss": 0.4800211787223816,
"step": 772,
"token_acc": 0.8409965346527944
},
{
"epoch": 2.478330658105939,
"grad_norm": 0.218272784858108,
"learning_rate": 3.721507524956569e-06,
"loss": 0.4397786557674408,
"step": 773,
"token_acc": 0.8515345114181647
},
{
"epoch": 2.481540930979133,
"grad_norm": 0.20590338299311356,
"learning_rate": 3.7142542227791597e-06,
"loss": 0.4883626401424408,
"step": 774,
"token_acc": 0.839136971273008
},
{
"epoch": 2.4847512038523276,
"grad_norm": 0.19709327674881771,
"learning_rate": 3.7069999968588315e-06,
"loss": 0.4468587338924408,
"step": 775,
"token_acc": 0.850644285251288
},
{
"epoch": 2.4879614767255216,
"grad_norm": 0.22022884628507708,
"learning_rate": 3.6997448784716943e-06,
"loss": 0.4592692255973816,
"step": 776,
"token_acc": 0.8474834727817374
},
{
"epoch": 2.491171749598716,
"grad_norm": 0.19724246193795616,
"learning_rate": 3.692488898897716e-06,
"loss": 0.4389241635799408,
"step": 777,
"token_acc": 0.8543228524690571
},
{
"epoch": 2.49438202247191,
"grad_norm": 0.1985194744702316,
"learning_rate": 3.6852320894205706e-06,
"loss": 0.4640299677848816,
"step": 778,
"token_acc": 0.8440977155286219
},
{
"epoch": 2.4975922953451044,
"grad_norm": 0.19701491228097626,
"learning_rate": 3.6779744813275153e-06,
"loss": 0.4671224057674408,
"step": 779,
"token_acc": 0.8440456139508346
},
{
"epoch": 2.5008025682182984,
"grad_norm": 0.18974099331427471,
"learning_rate": 3.670716105909243e-06,
"loss": 0.44677734375,
"step": 780,
"token_acc": 0.8511272770027192
},
{
"epoch": 2.504012841091493,
"grad_norm": 0.19515822626448606,
"learning_rate": 3.6634569944597646e-06,
"loss": 0.464599609375,
"step": 781,
"token_acc": 0.8460107523664183
},
{
"epoch": 2.5072231139646872,
"grad_norm": 0.1963284075241601,
"learning_rate": 3.656197178276256e-06,
"loss": 0.4373372495174408,
"step": 782,
"token_acc": 0.8526304538973646
},
{
"epoch": 2.510433386837881,
"grad_norm": 0.20458475147133778,
"learning_rate": 3.648936688658937e-06,
"loss": 0.48388671875,
"step": 783,
"token_acc": 0.8395201014301259
},
{
"epoch": 2.513643659711075,
"grad_norm": 0.1875839660932636,
"learning_rate": 3.641675556910928e-06,
"loss": 0.4583333432674408,
"step": 784,
"token_acc": 0.8477605477347625
},
{
"epoch": 2.5168539325842696,
"grad_norm": 0.20861108437682185,
"learning_rate": 3.634413814338117e-06,
"loss": 0.4627278745174408,
"step": 785,
"token_acc": 0.8475534760608696
},
{
"epoch": 2.520064205457464,
"grad_norm": 0.18447292570032636,
"learning_rate": 3.6271514922490315e-06,
"loss": 0.4423828125,
"step": 786,
"token_acc": 0.8558945701293097
},
{
"epoch": 2.523274478330658,
"grad_norm": 0.183868580124139,
"learning_rate": 3.619888621954688e-06,
"loss": 0.484619140625,
"step": 787,
"token_acc": 0.8412224592966723
},
{
"epoch": 2.5264847512038524,
"grad_norm": 0.2087754611682151,
"learning_rate": 3.612625234768476e-06,
"loss": 0.4520670771598816,
"step": 788,
"token_acc": 0.8494552689221115
},
{
"epoch": 2.5296950240770464,
"grad_norm": 0.1737629291121771,
"learning_rate": 3.6053613620060055e-06,
"loss": 0.4601237177848816,
"step": 789,
"token_acc": 0.8457257104020746
},
{
"epoch": 2.532905296950241,
"grad_norm": 0.17419155995660712,
"learning_rate": 3.5980970349849883e-06,
"loss": 0.404541015625,
"step": 790,
"token_acc": 0.8639127816961982
},
{
"epoch": 2.5361155698234352,
"grad_norm": 0.19772364419461955,
"learning_rate": 3.590832285025086e-06,
"loss": 0.45989990234375,
"step": 791,
"token_acc": 0.8458237277279737
},
{
"epoch": 2.539325842696629,
"grad_norm": 0.22027306794790522,
"learning_rate": 3.58356714344779e-06,
"loss": 0.437255859375,
"step": 792,
"token_acc": 0.8555278541953233
},
{
"epoch": 2.542536115569823,
"grad_norm": 0.3041945916167895,
"learning_rate": 3.576301641576279e-06,
"loss": 0.460693359375,
"step": 793,
"token_acc": 0.8462330337091692
},
{
"epoch": 2.5457463884430176,
"grad_norm": 0.17833110360469118,
"learning_rate": 3.5690358107352828e-06,
"loss": 0.4147135615348816,
"step": 794,
"token_acc": 0.8615169139768822
},
{
"epoch": 2.548956661316212,
"grad_norm": 0.18640330833188645,
"learning_rate": 3.5617696822509507e-06,
"loss": 0.4475911557674408,
"step": 795,
"token_acc": 0.8501959436814666
},
{
"epoch": 2.552166934189406,
"grad_norm": 0.1998174527481229,
"learning_rate": 3.5545032874507157e-06,
"loss": 0.4499104917049408,
"step": 796,
"token_acc": 0.8487911479756449
},
{
"epoch": 2.5553772070626004,
"grad_norm": 0.20594816291408258,
"learning_rate": 3.5472366576631594e-06,
"loss": 0.4593912959098816,
"step": 797,
"token_acc": 0.8464348456884555
},
{
"epoch": 2.5585874799357944,
"grad_norm": 0.18735094753253273,
"learning_rate": 3.539969824217874e-06,
"loss": 0.4056803584098816,
"step": 798,
"token_acc": 0.8645695718537573
},
{
"epoch": 2.561797752808989,
"grad_norm": 0.2318710491909044,
"learning_rate": 3.5327028184453347e-06,
"loss": 0.4589945673942566,
"step": 799,
"token_acc": 0.8473718998422008
},
{
"epoch": 2.5650080256821832,
"grad_norm": 0.20321706431944855,
"learning_rate": 3.525435671676754e-06,
"loss": 0.43487548828125,
"step": 800,
"token_acc": 0.8548923961390004
},
{
"epoch": 2.568218298555377,
"grad_norm": 0.19226811243347322,
"learning_rate": 3.518168415243957e-06,
"loss": 0.4361775815486908,
"step": 801,
"token_acc": 0.8546455004782437
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.20749835339845354,
"learning_rate": 3.510901080479237e-06,
"loss": 0.4540202021598816,
"step": 802,
"token_acc": 0.8498480465746403
},
{
"epoch": 2.5746388443017656,
"grad_norm": 0.19982609438065352,
"learning_rate": 3.5036336987152294e-06,
"loss": 0.4513753354549408,
"step": 803,
"token_acc": 0.8490265800398946
},
{
"epoch": 2.57784911717496,
"grad_norm": 0.1904092351412821,
"learning_rate": 3.4963663012847697e-06,
"loss": 0.3978678584098816,
"step": 804,
"token_acc": 0.8680137044910768
},
{
"epoch": 2.581059390048154,
"grad_norm": 0.21102941528050104,
"learning_rate": 3.4890989195207632e-06,
"loss": 0.4449462890625,
"step": 805,
"token_acc": 0.8526128110191377
},
{
"epoch": 2.5842696629213484,
"grad_norm": 0.1838912778194567,
"learning_rate": 3.481831584756044e-06,
"loss": 0.4648641049861908,
"step": 806,
"token_acc": 0.8464704322688614
},
{
"epoch": 2.5874799357945424,
"grad_norm": 0.19920244205940324,
"learning_rate": 3.4745643283232463e-06,
"loss": 0.4841105341911316,
"step": 807,
"token_acc": 0.8388059850648198
},
{
"epoch": 2.590690208667737,
"grad_norm": 0.20205278122271417,
"learning_rate": 3.467297181554665e-06,
"loss": 0.4539388120174408,
"step": 808,
"token_acc": 0.850211181323957
},
{
"epoch": 2.5939004815409312,
"grad_norm": 0.2255035918908677,
"learning_rate": 3.4600301757821263e-06,
"loss": 0.4786784052848816,
"step": 809,
"token_acc": 0.8422286899694498
},
{
"epoch": 2.597110754414125,
"grad_norm": 0.20465355855215184,
"learning_rate": 3.452763342336842e-06,
"loss": 0.445556640625,
"step": 810,
"token_acc": 0.8522404470057361
},
{
"epoch": 2.600321027287319,
"grad_norm": 0.19109348036684573,
"learning_rate": 3.4454967125492846e-06,
"loss": 0.4539388120174408,
"step": 811,
"token_acc": 0.8483287395177225
},
{
"epoch": 2.6035313001605136,
"grad_norm": 0.23276755370704322,
"learning_rate": 3.4382303177490496e-06,
"loss": 0.4812825620174408,
"step": 812,
"token_acc": 0.840984396834211
},
{
"epoch": 2.606741573033708,
"grad_norm": 0.2098915307857685,
"learning_rate": 3.430964189264718e-06,
"loss": 0.4757487177848816,
"step": 813,
"token_acc": 0.8404143981582304
},
{
"epoch": 2.609951845906902,
"grad_norm": 0.19399775263086028,
"learning_rate": 3.423698358423722e-06,
"loss": 0.4791666865348816,
"step": 814,
"token_acc": 0.840336714916079
},
{
"epoch": 2.6131621187800964,
"grad_norm": 0.19057683090155922,
"learning_rate": 3.4164328565522094e-06,
"loss": 0.44775390625,
"step": 815,
"token_acc": 0.8495610915312901
},
{
"epoch": 2.6163723916532904,
"grad_norm": 0.2334463421118229,
"learning_rate": 3.409167714974914e-06,
"loss": 0.46875,
"step": 816,
"token_acc": 0.8432766126209109
},
{
"epoch": 2.619582664526485,
"grad_norm": 0.19364829944623,
"learning_rate": 3.401902965015013e-06,
"loss": 0.4464518427848816,
"step": 817,
"token_acc": 0.8517511283500472
},
{
"epoch": 2.6227929373996792,
"grad_norm": 0.20261948803065144,
"learning_rate": 3.394638637993994e-06,
"loss": 0.4549967646598816,
"step": 818,
"token_acc": 0.8495956798893742
},
{
"epoch": 2.626003210272873,
"grad_norm": 0.20631452948450796,
"learning_rate": 3.3873747652315244e-06,
"loss": 0.4468994140625,
"step": 819,
"token_acc": 0.851270244733375
},
{
"epoch": 2.629213483146067,
"grad_norm": 0.20133481061321648,
"learning_rate": 3.3801113780453125e-06,
"loss": 0.4449869990348816,
"step": 820,
"token_acc": 0.8522231643786884
},
{
"epoch": 2.6324237560192616,
"grad_norm": 0.1962973708838001,
"learning_rate": 3.3728485077509697e-06,
"loss": 0.4533284604549408,
"step": 821,
"token_acc": 0.8500484370255164
},
{
"epoch": 2.635634028892456,
"grad_norm": 0.17071948339566242,
"learning_rate": 3.3655861856618823e-06,
"loss": 0.3774007260799408,
"step": 822,
"token_acc": 0.8739059570666708
},
{
"epoch": 2.63884430176565,
"grad_norm": 0.19106041053421854,
"learning_rate": 3.3583244430890726e-06,
"loss": 0.4742838740348816,
"step": 823,
"token_acc": 0.8406814502542903
},
{
"epoch": 2.6420545746388444,
"grad_norm": 0.20086818900021164,
"learning_rate": 3.3510633113410633e-06,
"loss": 0.4076741635799408,
"step": 824,
"token_acc": 0.8653939506998632
},
{
"epoch": 2.6452648475120384,
"grad_norm": 0.16920033356636768,
"learning_rate": 3.343802821723743e-06,
"loss": 0.4669596552848816,
"step": 825,
"token_acc": 0.8433515081125119
},
{
"epoch": 2.648475120385233,
"grad_norm": 0.1995228870044419,
"learning_rate": 3.3365430055402357e-06,
"loss": 0.4744466245174408,
"step": 826,
"token_acc": 0.842604107498794
},
{
"epoch": 2.6516853932584272,
"grad_norm": 0.2049747416218833,
"learning_rate": 3.329283894090757e-06,
"loss": 0.4518229365348816,
"step": 827,
"token_acc": 0.849603820827889
},
{
"epoch": 2.654895666131621,
"grad_norm": 0.1956104974467398,
"learning_rate": 3.3220255186724863e-06,
"loss": 0.4283854365348816,
"step": 828,
"token_acc": 0.8578459150560697
},
{
"epoch": 2.658105939004815,
"grad_norm": 0.19695478096696453,
"learning_rate": 3.314767910579429e-06,
"loss": 0.4253743588924408,
"step": 829,
"token_acc": 0.8580703400546449
},
{
"epoch": 2.6613162118780096,
"grad_norm": 0.20303513871125345,
"learning_rate": 3.307511101102284e-06,
"loss": 0.4791666865348816,
"step": 830,
"token_acc": 0.8423049552177565
},
{
"epoch": 2.664526484751204,
"grad_norm": 0.19433971541630624,
"learning_rate": 3.3002551215283064e-06,
"loss": 0.4288330078125,
"step": 831,
"token_acc": 0.8565091912178404
},
{
"epoch": 2.667736757624398,
"grad_norm": 0.18358183780570975,
"learning_rate": 3.29300000314117e-06,
"loss": 0.457763671875,
"step": 832,
"token_acc": 0.8482874371407758
},
{
"epoch": 2.6709470304975924,
"grad_norm": 0.19470243762283243,
"learning_rate": 3.2857457772208398e-06,
"loss": 0.4110514521598816,
"step": 833,
"token_acc": 0.8618599939575965
},
{
"epoch": 2.6741573033707864,
"grad_norm": 0.18791430734811737,
"learning_rate": 3.278492475043431e-06,
"loss": 0.4446004331111908,
"step": 834,
"token_acc": 0.8527573789212444
},
{
"epoch": 2.677367576243981,
"grad_norm": 0.23214117649891247,
"learning_rate": 3.2712401278810783e-06,
"loss": 0.4806315302848816,
"step": 835,
"token_acc": 0.8403051280497658
},
{
"epoch": 2.6805778491171752,
"grad_norm": 0.18133293043714516,
"learning_rate": 3.2639887670017936e-06,
"loss": 0.476318359375,
"step": 836,
"token_acc": 0.8414659617161572
},
{
"epoch": 2.683788121990369,
"grad_norm": 0.18789082052914685,
"learning_rate": 3.2567384236693443e-06,
"loss": 0.458740234375,
"step": 837,
"token_acc": 0.8453170581614093
},
{
"epoch": 2.686998394863563,
"grad_norm": 0.19806392638633866,
"learning_rate": 3.249489129143104e-06,
"loss": 0.4776204526424408,
"step": 838,
"token_acc": 0.841645773855243
},
{
"epoch": 2.6902086677367576,
"grad_norm": 0.26211907964329023,
"learning_rate": 3.242240914677927e-06,
"loss": 0.4622395932674408,
"step": 839,
"token_acc": 0.8466941454653522
},
{
"epoch": 2.693418940609952,
"grad_norm": 0.2016656550776637,
"learning_rate": 3.234993811524011e-06,
"loss": 0.4531657099723816,
"step": 840,
"token_acc": 0.851422100408659
},
{
"epoch": 2.696629213483146,
"grad_norm": 0.19652448445502282,
"learning_rate": 3.227747850926763e-06,
"loss": 0.4515380859375,
"step": 841,
"token_acc": 0.8508233163907988
},
{
"epoch": 2.6998394863563404,
"grad_norm": 0.19941596229783648,
"learning_rate": 3.2205030641266645e-06,
"loss": 0.4519856870174408,
"step": 842,
"token_acc": 0.8473833129538995
},
{
"epoch": 2.7030497592295344,
"grad_norm": 0.22351326000827032,
"learning_rate": 3.213259482359131e-06,
"loss": 0.4437662959098816,
"step": 843,
"token_acc": 0.8541969557985216
},
{
"epoch": 2.706260032102729,
"grad_norm": 0.19902224213641,
"learning_rate": 3.20601713685439e-06,
"loss": 0.4616495966911316,
"step": 844,
"token_acc": 0.8445843509396473
},
{
"epoch": 2.7094703049759232,
"grad_norm": 0.18117357208751828,
"learning_rate": 3.198776058837335e-06,
"loss": 0.4361979365348816,
"step": 845,
"token_acc": 0.8555711350441413
},
{
"epoch": 2.712680577849117,
"grad_norm": 0.18226417638252332,
"learning_rate": 3.1915362795273947e-06,
"loss": 0.4409586787223816,
"step": 846,
"token_acc": 0.8520998316484529
},
{
"epoch": 2.715890850722311,
"grad_norm": 0.20695418343201752,
"learning_rate": 3.1842978301383973e-06,
"loss": 0.4451497495174408,
"step": 847,
"token_acc": 0.8534579934212124
},
{
"epoch": 2.7191011235955056,
"grad_norm": 0.20765131898028952,
"learning_rate": 3.1770607418784433e-06,
"loss": 0.4486897885799408,
"step": 848,
"token_acc": 0.851849861967661
},
{
"epoch": 2.7223113964687,
"grad_norm": 0.28670311782786223,
"learning_rate": 3.169825045949757e-06,
"loss": 0.4382731318473816,
"step": 849,
"token_acc": 0.8550369051504041
},
{
"epoch": 2.725521669341894,
"grad_norm": 0.19885506160440816,
"learning_rate": 3.162590773548564e-06,
"loss": 0.4499918818473816,
"step": 850,
"token_acc": 0.8506404746304723
},
{
"epoch": 2.7287319422150884,
"grad_norm": 0.6588682730611575,
"learning_rate": 3.1553579558649523e-06,
"loss": 0.4435628354549408,
"step": 851,
"token_acc": 0.850721246130347
},
{
"epoch": 2.7319422150882824,
"grad_norm": 0.21443528224992153,
"learning_rate": 3.1481266240827373e-06,
"loss": 0.4580892026424408,
"step": 852,
"token_acc": 0.8484892268236739
},
{
"epoch": 2.735152487961477,
"grad_norm": 0.18428221838796965,
"learning_rate": 3.1408968093793272e-06,
"loss": 0.4455973505973816,
"step": 853,
"token_acc": 0.8506541039353359
},
{
"epoch": 2.738362760834671,
"grad_norm": 0.21296547405294453,
"learning_rate": 3.1336685429255904e-06,
"loss": 0.4669596552848816,
"step": 854,
"token_acc": 0.844635929237041
},
{
"epoch": 2.741573033707865,
"grad_norm": 0.1970673523338949,
"learning_rate": 3.126441855885721e-06,
"loss": 0.4756673276424408,
"step": 855,
"token_acc": 0.8407943698334166
},
{
"epoch": 2.744783306581059,
"grad_norm": 0.18533116824438595,
"learning_rate": 3.1192167794171016e-06,
"loss": 0.4705810546875,
"step": 856,
"token_acc": 0.8437527360868543
},
{
"epoch": 2.7479935794542536,
"grad_norm": 0.20673945150207626,
"learning_rate": 3.111993344670173e-06,
"loss": 0.4440104365348816,
"step": 857,
"token_acc": 0.8527011318719799
},
{
"epoch": 2.751203852327448,
"grad_norm": 0.20432971411016607,
"learning_rate": 3.104771582788294e-06,
"loss": 0.4159749448299408,
"step": 858,
"token_acc": 0.8613609396227064
},
{
"epoch": 2.754414125200642,
"grad_norm": 0.20358212567304018,
"learning_rate": 3.0975515249076175e-06,
"loss": 0.47308349609375,
"step": 859,
"token_acc": 0.8437423208886284
},
{
"epoch": 2.7576243980738364,
"grad_norm": 0.23790587834419183,
"learning_rate": 3.0903332021569436e-06,
"loss": 0.4655354917049408,
"step": 860,
"token_acc": 0.8450927179357717
},
{
"epoch": 2.7608346709470304,
"grad_norm": 0.18307641448187867,
"learning_rate": 3.083116645657593e-06,
"loss": 0.43994140625,
"step": 861,
"token_acc": 0.8525941455216446
},
{
"epoch": 2.764044943820225,
"grad_norm": 0.1811725024850713,
"learning_rate": 3.075901886523275e-06,
"loss": 0.4523112177848816,
"step": 862,
"token_acc": 0.85006509693893
},
{
"epoch": 2.767255216693419,
"grad_norm": 0.1989735727923974,
"learning_rate": 3.068688955859945e-06,
"loss": 0.453857421875,
"step": 863,
"token_acc": 0.8496101461606868
},
{
"epoch": 2.770465489566613,
"grad_norm": 0.19077714887441913,
"learning_rate": 3.0614778847656763e-06,
"loss": 0.4422200620174408,
"step": 864,
"token_acc": 0.8537910905751728
},
{
"epoch": 2.773675762439807,
"grad_norm": 0.1977640130828424,
"learning_rate": 3.054268704330526e-06,
"loss": 0.4406535029411316,
"step": 865,
"token_acc": 0.8514452702525254
},
{
"epoch": 2.7768860353130016,
"grad_norm": 0.1808731306349855,
"learning_rate": 3.047061445636399e-06,
"loss": 0.4327799677848816,
"step": 866,
"token_acc": 0.8544081155438911
},
{
"epoch": 2.780096308186196,
"grad_norm": 0.1851079324071504,
"learning_rate": 3.039856139756916e-06,
"loss": 0.4536946713924408,
"step": 867,
"token_acc": 0.8493277191079274
},
{
"epoch": 2.78330658105939,
"grad_norm": 0.1925088846332903,
"learning_rate": 3.032652817757274e-06,
"loss": 0.44561767578125,
"step": 868,
"token_acc": 0.8513310539364469
},
{
"epoch": 2.7865168539325844,
"grad_norm": 0.17806624631080792,
"learning_rate": 3.0254515106941246e-06,
"loss": 0.4298909604549408,
"step": 869,
"token_acc": 0.8550168629794302
},
{
"epoch": 2.7897271268057784,
"grad_norm": 0.20298500989104132,
"learning_rate": 3.018252249615423e-06,
"loss": 0.4869384765625,
"step": 870,
"token_acc": 0.8397122828775442
},
{
"epoch": 2.792937399678973,
"grad_norm": 0.20918645966745378,
"learning_rate": 3.0110550655603096e-06,
"loss": 0.4236246943473816,
"step": 871,
"token_acc": 0.8599267937563877
},
{
"epoch": 2.796147672552167,
"grad_norm": 0.1949185958594448,
"learning_rate": 3.0038599895589657e-06,
"loss": 0.4535319209098816,
"step": 872,
"token_acc": 0.8489785408365976
},
{
"epoch": 2.799357945425361,
"grad_norm": 0.2025989275985742,
"learning_rate": 2.9966670526324888e-06,
"loss": 0.4698486328125,
"step": 873,
"token_acc": 0.8440375618917535
},
{
"epoch": 2.802568218298555,
"grad_norm": 0.17858666316037558,
"learning_rate": 2.9894762857927506e-06,
"loss": 0.4878743588924408,
"step": 874,
"token_acc": 0.8400573840707599
},
{
"epoch": 2.8057784911717496,
"grad_norm": 0.1916535937295437,
"learning_rate": 2.982287720042266e-06,
"loss": 0.437744140625,
"step": 875,
"token_acc": 0.8551442274926736
},
{
"epoch": 2.808988764044944,
"grad_norm": 0.1802093570224421,
"learning_rate": 2.9751013863740598e-06,
"loss": 0.4361165463924408,
"step": 876,
"token_acc": 0.8539694867975819
},
{
"epoch": 2.812199036918138,
"grad_norm": 0.18341640952769703,
"learning_rate": 2.9679173157715376e-06,
"loss": 0.4659423828125,
"step": 877,
"token_acc": 0.8473923909641549
},
{
"epoch": 2.8154093097913324,
"grad_norm": 0.19926651946578333,
"learning_rate": 2.960735539208344e-06,
"loss": 0.4440104365348816,
"step": 878,
"token_acc": 0.8531331105177661
},
{
"epoch": 2.8186195826645264,
"grad_norm": 0.18126871767280156,
"learning_rate": 2.953556087648232e-06,
"loss": 0.4615478515625,
"step": 879,
"token_acc": 0.846012540905167
},
{
"epoch": 2.821829855537721,
"grad_norm": 0.2048869147921809,
"learning_rate": 2.9463789920449363e-06,
"loss": 0.45703125,
"step": 880,
"token_acc": 0.848266982006728
},
{
"epoch": 2.825040128410915,
"grad_norm": 0.21465195556789202,
"learning_rate": 2.9392042833420274e-06,
"loss": 0.4917399287223816,
"step": 881,
"token_acc": 0.8380246428627718
},
{
"epoch": 2.828250401284109,
"grad_norm": 0.20000902524393988,
"learning_rate": 2.9320319924727893e-06,
"loss": 0.4508056640625,
"step": 882,
"token_acc": 0.8502047138063448
},
{
"epoch": 2.831460674157303,
"grad_norm": 0.1963068923956562,
"learning_rate": 2.924862150360078e-06,
"loss": 0.4275716245174408,
"step": 883,
"token_acc": 0.8565266918083189
},
{
"epoch": 2.8346709470304976,
"grad_norm": 0.193862012915289,
"learning_rate": 2.9176947879161956e-06,
"loss": 0.4766438901424408,
"step": 884,
"token_acc": 0.843147802741985
},
{
"epoch": 2.837881219903692,
"grad_norm": 0.17750407315632236,
"learning_rate": 2.9105299360427524e-06,
"loss": 0.4188639521598816,
"step": 885,
"token_acc": 0.8615091938541838
},
{
"epoch": 2.841091492776886,
"grad_norm": 0.20567900106548453,
"learning_rate": 2.903367625630531e-06,
"loss": 0.4632568359375,
"step": 886,
"token_acc": 0.8458096462571061
},
{
"epoch": 2.8443017656500804,
"grad_norm": 0.20028858738778038,
"learning_rate": 2.8962078875593617e-06,
"loss": 0.4229329526424408,
"step": 887,
"token_acc": 0.8594632037806461
},
{
"epoch": 2.8475120385232744,
"grad_norm": 0.19611757309044633,
"learning_rate": 2.889050752697982e-06,
"loss": 0.4334309995174408,
"step": 888,
"token_acc": 0.8550386869639498
},
{
"epoch": 2.850722311396469,
"grad_norm": 0.17221350932947482,
"learning_rate": 2.8818962519039052e-06,
"loss": 0.4227091670036316,
"step": 889,
"token_acc": 0.8588126879425745
},
{
"epoch": 2.853932584269663,
"grad_norm": 0.19068808416823477,
"learning_rate": 2.874744416023286e-06,
"loss": 0.443115234375,
"step": 890,
"token_acc": 0.8531313450132613
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.18792769221448774,
"learning_rate": 2.8675952758907976e-06,
"loss": 0.4404296875,
"step": 891,
"token_acc": 0.8532470114456258
},
{
"epoch": 2.860353130016051,
"grad_norm": 0.18774354635019833,
"learning_rate": 2.8604488623294816e-06,
"loss": 0.4722086787223816,
"step": 892,
"token_acc": 0.8409748913356431
},
{
"epoch": 2.8635634028892456,
"grad_norm": 0.17303731091005972,
"learning_rate": 2.8533052061506273e-06,
"loss": 0.4560546875,
"step": 893,
"token_acc": 0.8479154513444437
},
{
"epoch": 2.86677367576244,
"grad_norm": 0.20252387350469153,
"learning_rate": 2.8461643381536386e-06,
"loss": 0.4105224609375,
"step": 894,
"token_acc": 0.8620219153015483
},
{
"epoch": 2.869983948635634,
"grad_norm": 0.23064818047776917,
"learning_rate": 2.8390262891258956e-06,
"loss": 0.44384765625,
"step": 895,
"token_acc": 0.852279850676447
},
{
"epoch": 2.8731942215088284,
"grad_norm": 0.2039541978845269,
"learning_rate": 2.831891089842623e-06,
"loss": 0.4642741084098816,
"step": 896,
"token_acc": 0.8480350311624173
},
{
"epoch": 2.8764044943820224,
"grad_norm": 0.19298964547659045,
"learning_rate": 2.824758771066761e-06,
"loss": 0.47314453125,
"step": 897,
"token_acc": 0.8419148311180962
},
{
"epoch": 2.879614767255217,
"grad_norm": 0.22735774599571984,
"learning_rate": 2.817629363548833e-06,
"loss": 0.4599202573299408,
"step": 898,
"token_acc": 0.8460708646745579
},
{
"epoch": 2.882825040128411,
"grad_norm": 0.17385809855249318,
"learning_rate": 2.8105028980268066e-06,
"loss": 0.4387614130973816,
"step": 899,
"token_acc": 0.8537094497487598
},
{
"epoch": 2.886035313001605,
"grad_norm": 0.21561442777127388,
"learning_rate": 2.8033794052259683e-06,
"loss": 0.4597981870174408,
"step": 900,
"token_acc": 0.8463050783653008
},
{
"epoch": 2.889245585874799,
"grad_norm": 0.1925141857167567,
"learning_rate": 2.796258915858783e-06,
"loss": 0.4646809995174408,
"step": 901,
"token_acc": 0.8462298777070343
},
{
"epoch": 2.8924558587479936,
"grad_norm": 0.1967049507414003,
"learning_rate": 2.789141460624775e-06,
"loss": 0.4554036557674408,
"step": 902,
"token_acc": 0.8482254803972719
},
{
"epoch": 2.895666131621188,
"grad_norm": 0.19182500378328768,
"learning_rate": 2.782027070210379e-06,
"loss": 0.4325358271598816,
"step": 903,
"token_acc": 0.8540655882553516
},
{
"epoch": 2.898876404494382,
"grad_norm": 0.20492626814952483,
"learning_rate": 2.7749157752888192e-06,
"loss": 0.447021484375,
"step": 904,
"token_acc": 0.8528819674309748
},
{
"epoch": 2.902086677367576,
"grad_norm": 0.18973215032814825,
"learning_rate": 2.767807606519975e-06,
"loss": 0.4326985776424408,
"step": 905,
"token_acc": 0.8551310028184196
},
{
"epoch": 2.9052969502407704,
"grad_norm": 0.20870842711893867,
"learning_rate": 2.760702594550246e-06,
"loss": 0.4281412959098816,
"step": 906,
"token_acc": 0.8567996447685152
},
{
"epoch": 2.908507223113965,
"grad_norm": 0.21820108404227115,
"learning_rate": 2.753600770012421e-06,
"loss": 0.4617513120174408,
"step": 907,
"token_acc": 0.846852006172273
},
{
"epoch": 2.911717495987159,
"grad_norm": 0.2082233355287873,
"learning_rate": 2.7465021635255465e-06,
"loss": 0.4466145932674408,
"step": 908,
"token_acc": 0.8515980815438643
},
{
"epoch": 2.914927768860353,
"grad_norm": 0.20502239244120005,
"learning_rate": 2.739406805694797e-06,
"loss": 0.4312540888786316,
"step": 909,
"token_acc": 0.8561211161593144
},
{
"epoch": 2.918138041733547,
"grad_norm": 0.18589225273227217,
"learning_rate": 2.732314727111338e-06,
"loss": 0.457763671875,
"step": 910,
"token_acc": 0.8447814188569882
},
{
"epoch": 2.9213483146067416,
"grad_norm": 0.19325400267429202,
"learning_rate": 2.725225958352197e-06,
"loss": 0.4157511591911316,
"step": 911,
"token_acc": 0.8615710983757084
},
{
"epoch": 2.924558587479936,
"grad_norm": 0.24863658946228606,
"learning_rate": 2.7181405299801342e-06,
"loss": 0.441162109375,
"step": 912,
"token_acc": 0.8537013414150237
},
{
"epoch": 2.92776886035313,
"grad_norm": 0.1679392411358542,
"learning_rate": 2.7110584725435037e-06,
"loss": 0.3821207880973816,
"step": 913,
"token_acc": 0.8727992820526524
},
{
"epoch": 2.930979133226324,
"grad_norm": 0.17898720333894108,
"learning_rate": 2.703979816576128e-06,
"loss": 0.4577229917049408,
"step": 914,
"token_acc": 0.8485272656543374
},
{
"epoch": 2.9341894060995184,
"grad_norm": 0.20953372056682842,
"learning_rate": 2.6969045925971647e-06,
"loss": 0.4898274838924408,
"step": 915,
"token_acc": 0.8378350593275259
},
{
"epoch": 2.937399678972713,
"grad_norm": 0.18986984243575183,
"learning_rate": 2.689832831110976e-06,
"loss": 0.4823405146598816,
"step": 916,
"token_acc": 0.8401791302339995
},
{
"epoch": 2.940609951845907,
"grad_norm": 0.18376410703464807,
"learning_rate": 2.682764562606993e-06,
"loss": 0.4639892578125,
"step": 917,
"token_acc": 0.8443565602409577
},
{
"epoch": 2.943820224719101,
"grad_norm": 0.20423108351344285,
"learning_rate": 2.6756998175595865e-06,
"loss": 0.4396159052848816,
"step": 918,
"token_acc": 0.8543520731929763
},
{
"epoch": 2.947030497592295,
"grad_norm": 0.19283053616253365,
"learning_rate": 2.6686386264279417e-06,
"loss": 0.4716796875,
"step": 919,
"token_acc": 0.8447049600482436
},
{
"epoch": 2.9502407704654896,
"grad_norm": 0.17396974121526895,
"learning_rate": 2.6615810196559143e-06,
"loss": 0.419921875,
"step": 920,
"token_acc": 0.8606441665206895
},
{
"epoch": 2.953451043338684,
"grad_norm": 0.18122123451983121,
"learning_rate": 2.6545270276719115e-06,
"loss": 0.4344889521598816,
"step": 921,
"token_acc": 0.8557424324856561
},
{
"epoch": 2.956661316211878,
"grad_norm": 0.21164130487363986,
"learning_rate": 2.6474766808887508e-06,
"loss": 0.4502767026424408,
"step": 922,
"token_acc": 0.8507737208341514
},
{
"epoch": 2.959871589085072,
"grad_norm": 0.20693230256283227,
"learning_rate": 2.6404300097035397e-06,
"loss": 0.4093017578125,
"step": 923,
"token_acc": 0.8623301720111289
},
{
"epoch": 2.9630818619582664,
"grad_norm": 0.2128202281439714,
"learning_rate": 2.6333870444975333e-06,
"loss": 0.4647623896598816,
"step": 924,
"token_acc": 0.8452988006954382
},
{
"epoch": 2.966292134831461,
"grad_norm": 0.19432742708017048,
"learning_rate": 2.6263478156360117e-06,
"loss": 0.4721272885799408,
"step": 925,
"token_acc": 0.8423774782670488
},
{
"epoch": 2.969502407704655,
"grad_norm": 0.20592585126650714,
"learning_rate": 2.619312353468143e-06,
"loss": 0.4940185546875,
"step": 926,
"token_acc": 0.8377018770376724
},
{
"epoch": 2.972712680577849,
"grad_norm": 0.1833160802061042,
"learning_rate": 2.61228068832686e-06,
"loss": 0.4482015073299408,
"step": 927,
"token_acc": 0.8498228651561001
},
{
"epoch": 2.975922953451043,
"grad_norm": 0.21859060590959328,
"learning_rate": 2.605252850528721e-06,
"loss": 0.4634196162223816,
"step": 928,
"token_acc": 0.8474287448047855
},
{
"epoch": 2.9791332263242376,
"grad_norm": 0.32814155266427864,
"learning_rate": 2.5982288703737832e-06,
"loss": 0.4471842646598816,
"step": 929,
"token_acc": 0.8499125107252692
},
{
"epoch": 2.982343499197432,
"grad_norm": 0.18383434017442052,
"learning_rate": 2.5912087781454747e-06,
"loss": 0.4506022334098816,
"step": 930,
"token_acc": 0.8479707935620925
},
{
"epoch": 2.985553772070626,
"grad_norm": 0.21009008880803093,
"learning_rate": 2.584192604110458e-06,
"loss": 0.4718831479549408,
"step": 931,
"token_acc": 0.8435183994391784
},
{
"epoch": 2.98876404494382,
"grad_norm": 0.20869763364072635,
"learning_rate": 2.577180378518505e-06,
"loss": 0.4437255859375,
"step": 932,
"token_acc": 0.8529731354610436
},
{
"epoch": 2.9919743178170144,
"grad_norm": 0.1925184488565764,
"learning_rate": 2.5701721316023596e-06,
"loss": 0.4805094599723816,
"step": 933,
"token_acc": 0.8384617433930094
},
{
"epoch": 2.995184590690209,
"grad_norm": 0.19414097485510715,
"learning_rate": 2.56316789357762e-06,
"loss": 0.4686279296875,
"step": 934,
"token_acc": 0.8461598371896937
},
{
"epoch": 2.998394863563403,
"grad_norm": 0.20311846242856332,
"learning_rate": 2.556167694642592e-06,
"loss": 0.4471435546875,
"step": 935,
"token_acc": 0.8523729443628189
},
{
"epoch": 3.0,
"grad_norm": 0.27709428039989725,
"learning_rate": 2.5491715649781713e-06,
"loss": 0.4915364682674408,
"step": 936,
"token_acc": 0.8348490575489441
},
{
"epoch": 3.0032102728731944,
"grad_norm": 0.19512840843235554,
"learning_rate": 2.54217953474771e-06,
"loss": 0.4064127802848816,
"step": 937,
"token_acc": 0.8645122771380337
},
{
"epoch": 3.0064205457463884,
"grad_norm": 0.19152822679354511,
"learning_rate": 2.5351916340968834e-06,
"loss": 0.430419921875,
"step": 938,
"token_acc": 0.8565615697709739
},
{
"epoch": 3.009630818619583,
"grad_norm": 0.19480721878283314,
"learning_rate": 2.5282078931535636e-06,
"loss": 0.4052734375,
"step": 939,
"token_acc": 0.865073050011375
},
{
"epoch": 3.012841091492777,
"grad_norm": 0.20094176240337724,
"learning_rate": 2.5212283420276868e-06,
"loss": 0.4458821713924408,
"step": 940,
"token_acc": 0.8485933197149823
},
{
"epoch": 3.016051364365971,
"grad_norm": 0.20026529443568822,
"learning_rate": 2.5142530108111283e-06,
"loss": 0.4403890073299408,
"step": 941,
"token_acc": 0.8525967790426874
},
{
"epoch": 3.019261637239165,
"grad_norm": 0.21284407108413642,
"learning_rate": 2.507281929577567e-06,
"loss": 0.4329833984375,
"step": 942,
"token_acc": 0.8550810846812996
},
{
"epoch": 3.0224719101123596,
"grad_norm": 0.18205057534794428,
"learning_rate": 2.5003151283823577e-06,
"loss": 0.4101766049861908,
"step": 943,
"token_acc": 0.8625629537300611
},
{
"epoch": 3.0256821829855536,
"grad_norm": 0.2183224419846251,
"learning_rate": 2.493352637262405e-06,
"loss": 0.4402262568473816,
"step": 944,
"token_acc": 0.8541160334907777
},
{
"epoch": 3.028892455858748,
"grad_norm": 0.19560530960918013,
"learning_rate": 2.48639448623603e-06,
"loss": 0.4363200068473816,
"step": 945,
"token_acc": 0.8543690262716656
},
{
"epoch": 3.0321027287319424,
"grad_norm": 0.20229017181299092,
"learning_rate": 2.4794407053028385e-06,
"loss": 0.4148763120174408,
"step": 946,
"token_acc": 0.8634134775874861
},
{
"epoch": 3.0353130016051364,
"grad_norm": 0.1807469477199333,
"learning_rate": 2.4724913244435983e-06,
"loss": 0.3597819209098816,
"step": 947,
"token_acc": 0.8800449186083527
},
{
"epoch": 3.038523274478331,
"grad_norm": 0.19730264955019094,
"learning_rate": 2.465546373620106e-06,
"loss": 0.4295145869255066,
"step": 948,
"token_acc": 0.8556339526610096
},
{
"epoch": 3.041733547351525,
"grad_norm": 0.203882798491726,
"learning_rate": 2.458605882775059e-06,
"loss": 0.4242350459098816,
"step": 949,
"token_acc": 0.8572224411495525
},
{
"epoch": 3.044943820224719,
"grad_norm": 0.20160987589550436,
"learning_rate": 2.4516698818319232e-06,
"loss": 0.4075114130973816,
"step": 950,
"token_acc": 0.8645283352488727
},
{
"epoch": 3.048154093097913,
"grad_norm": 0.1762688523749818,
"learning_rate": 2.444738400694808e-06,
"loss": 0.4170125424861908,
"step": 951,
"token_acc": 0.8620549639827875
},
{
"epoch": 3.0513643659711076,
"grad_norm": 0.2054457353905813,
"learning_rate": 2.4378114692483384e-06,
"loss": 0.4302571713924408,
"step": 952,
"token_acc": 0.8568869894067945
},
{
"epoch": 3.0545746388443016,
"grad_norm": 0.19065678215481155,
"learning_rate": 2.43088911735752e-06,
"loss": 0.4544677734375,
"step": 953,
"token_acc": 0.848384410373228
},
{
"epoch": 3.057784911717496,
"grad_norm": 0.1911349829751549,
"learning_rate": 2.4239713748676156e-06,
"loss": 0.4250895380973816,
"step": 954,
"token_acc": 0.8578417229285682
},
{
"epoch": 3.0609951845906904,
"grad_norm": 0.17086755774006357,
"learning_rate": 2.4170582716040163e-06,
"loss": 0.4202474057674408,
"step": 955,
"token_acc": 0.8604329880984048
},
{
"epoch": 3.0642054574638844,
"grad_norm": 0.19718572745924365,
"learning_rate": 2.4101498373721078e-06,
"loss": 0.4644775390625,
"step": 956,
"token_acc": 0.8432624399789125
},
{
"epoch": 3.067415730337079,
"grad_norm": 0.18969121035642866,
"learning_rate": 2.403246101957149e-06,
"loss": 0.4046630859375,
"step": 957,
"token_acc": 0.8642841551259974
},
{
"epoch": 3.070626003210273,
"grad_norm": 0.33889632940191633,
"learning_rate": 2.3963470951241374e-06,
"loss": 0.3937581479549408,
"step": 958,
"token_acc": 0.8685400647452663
},
{
"epoch": 3.073836276083467,
"grad_norm": 0.18441537435055547,
"learning_rate": 2.389452846617687e-06,
"loss": 0.4249267578125,
"step": 959,
"token_acc": 0.8557664988332394
},
{
"epoch": 3.077046548956661,
"grad_norm": 0.2157696044564956,
"learning_rate": 2.382563386161894e-06,
"loss": 0.4385172724723816,
"step": 960,
"token_acc": 0.8540134551100046
},
{
"epoch": 3.0802568218298556,
"grad_norm": 0.20472017309447713,
"learning_rate": 2.3756787434602096e-06,
"loss": 0.4079183042049408,
"step": 961,
"token_acc": 0.8627625265476284
},
{
"epoch": 3.0834670947030496,
"grad_norm": 0.22485005465461746,
"learning_rate": 2.3687989481953195e-06,
"loss": 0.4535319209098816,
"step": 962,
"token_acc": 0.8453375149611685
},
{
"epoch": 3.086677367576244,
"grad_norm": 0.1902880806662592,
"learning_rate": 2.3619240300290044e-06,
"loss": 0.4248250424861908,
"step": 963,
"token_acc": 0.8567758712111124
},
{
"epoch": 3.0898876404494384,
"grad_norm": 0.2626089363437338,
"learning_rate": 2.35505401860202e-06,
"loss": 0.4044189453125,
"step": 964,
"token_acc": 0.8657033443677186
},
{
"epoch": 3.0930979133226324,
"grad_norm": 0.19604226172297987,
"learning_rate": 2.348188943533965e-06,
"loss": 0.4424947202205658,
"step": 965,
"token_acc": 0.8500906036946045
},
{
"epoch": 3.096308186195827,
"grad_norm": 0.18951032806705173,
"learning_rate": 2.3413288344231596e-06,
"loss": 0.4405517578125,
"step": 966,
"token_acc": 0.8498719155442217
},
{
"epoch": 3.099518459069021,
"grad_norm": 0.26910487145254985,
"learning_rate": 2.334473720846509e-06,
"loss": 0.4261881709098816,
"step": 967,
"token_acc": 0.855647533546374
},
{
"epoch": 3.102728731942215,
"grad_norm": 0.2019726031413763,
"learning_rate": 2.3276236323593822e-06,
"loss": 0.4241536557674408,
"step": 968,
"token_acc": 0.8591222331565688
},
{
"epoch": 3.105939004815409,
"grad_norm": 0.20559129697543618,
"learning_rate": 2.3207785984954833e-06,
"loss": 0.4336751401424408,
"step": 969,
"token_acc": 0.8538620095006778
},
{
"epoch": 3.1091492776886036,
"grad_norm": 0.16203566424131216,
"learning_rate": 2.3139386487667245e-06,
"loss": 0.3148600459098816,
"step": 970,
"token_acc": 0.8947029334998015
},
{
"epoch": 3.1123595505617976,
"grad_norm": 0.19421525075904014,
"learning_rate": 2.307103812663096e-06,
"loss": 0.4341227412223816,
"step": 971,
"token_acc": 0.8553111018864443
},
{
"epoch": 3.115569823434992,
"grad_norm": 0.20517033907826865,
"learning_rate": 2.300274119652542e-06,
"loss": 0.4022013545036316,
"step": 972,
"token_acc": 0.8658842524777902
},
{
"epoch": 3.1187800963081864,
"grad_norm": 0.2040197534762892,
"learning_rate": 2.293449599180832e-06,
"loss": 0.464599609375,
"step": 973,
"token_acc": 0.8440537666160114
},
{
"epoch": 3.1219903691813804,
"grad_norm": 0.22646536031991613,
"learning_rate": 2.286630280671437e-06,
"loss": 0.42828369140625,
"step": 974,
"token_acc": 0.8590015726546195
},
{
"epoch": 3.125200642054575,
"grad_norm": 0.25340772779513127,
"learning_rate": 2.2798161935253967e-06,
"loss": 0.4293212890625,
"step": 975,
"token_acc": 0.8541718416925375
},
{
"epoch": 3.128410914927769,
"grad_norm": 0.2115180488720503,
"learning_rate": 2.2730073671211954e-06,
"loss": 0.4131673276424408,
"step": 976,
"token_acc": 0.8628617261205171
},
{
"epoch": 3.131621187800963,
"grad_norm": 0.19415733984357036,
"learning_rate": 2.2662038308146425e-06,
"loss": 0.4080810546875,
"step": 977,
"token_acc": 0.8640001032273604
},
{
"epoch": 3.134831460674157,
"grad_norm": 0.19776123553300654,
"learning_rate": 2.2594056139387326e-06,
"loss": 0.4088541865348816,
"step": 978,
"token_acc": 0.8627189376828179
},
{
"epoch": 3.1380417335473516,
"grad_norm": 0.19643692765873402,
"learning_rate": 2.2526127458035274e-06,
"loss": 0.4188232421875,
"step": 979,
"token_acc": 0.8607515414116894
},
{
"epoch": 3.1412520064205456,
"grad_norm": 0.2835352812737188,
"learning_rate": 2.245825255696032e-06,
"loss": 0.4149169921875,
"step": 980,
"token_acc": 0.8628155780149843
},
{
"epoch": 3.14446227929374,
"grad_norm": 0.19422023326342555,
"learning_rate": 2.2390431728800596e-06,
"loss": 0.4036458432674408,
"step": 981,
"token_acc": 0.8621452357882656
},
{
"epoch": 3.1476725521669344,
"grad_norm": 0.20035159839745434,
"learning_rate": 2.232266526596112e-06,
"loss": 0.4271647334098816,
"step": 982,
"token_acc": 0.8575271868100122
},
{
"epoch": 3.1508828250401284,
"grad_norm": 0.21219608624863834,
"learning_rate": 2.225495346061251e-06,
"loss": 0.4122314453125,
"step": 983,
"token_acc": 0.861788195878113
},
{
"epoch": 3.154093097913323,
"grad_norm": 0.19015710407463216,
"learning_rate": 2.218729660468976e-06,
"loss": 0.4386393427848816,
"step": 984,
"token_acc": 0.8533309039609425
},
{
"epoch": 3.157303370786517,
"grad_norm": 0.19189470990403354,
"learning_rate": 2.2119694989890917e-06,
"loss": 0.38525390625,
"step": 985,
"token_acc": 0.8706470303086263
},
{
"epoch": 3.160513643659711,
"grad_norm": 0.2180881731814661,
"learning_rate": 2.205214890767588e-06,
"loss": 0.4072062373161316,
"step": 986,
"token_acc": 0.863917869592352
},
{
"epoch": 3.163723916532905,
"grad_norm": 0.18266606436803548,
"learning_rate": 2.1984658649265122e-06,
"loss": 0.4138997495174408,
"step": 987,
"token_acc": 0.8617649696808686
},
{
"epoch": 3.1669341894060996,
"grad_norm": 0.19465676121423098,
"learning_rate": 2.1917224505638445e-06,
"loss": 0.4266764521598816,
"step": 988,
"token_acc": 0.8576549359584379
},
{
"epoch": 3.1701444622792936,
"grad_norm": 0.18927644434754856,
"learning_rate": 2.184984676753367e-06,
"loss": 0.4324951171875,
"step": 989,
"token_acc": 0.8562302060894557
},
{
"epoch": 3.173354735152488,
"grad_norm": 0.17236404705380387,
"learning_rate": 2.178252572544548e-06,
"loss": 0.41937255859375,
"step": 990,
"token_acc": 0.8578650493179596
},
{
"epoch": 3.176565008025682,
"grad_norm": 0.21801763116849704,
"learning_rate": 2.17152616696241e-06,
"loss": 0.4388834834098816,
"step": 991,
"token_acc": 0.8528569371329824
},
{
"epoch": 3.1797752808988764,
"grad_norm": 0.21130005789340922,
"learning_rate": 2.164805489007407e-06,
"loss": 0.4567057490348816,
"step": 992,
"token_acc": 0.8481650790407845
},
{
"epoch": 3.182985553772071,
"grad_norm": 0.19356572500109256,
"learning_rate": 2.1580905676552955e-06,
"loss": 0.4306844174861908,
"step": 993,
"token_acc": 0.8551083071577856
},
{
"epoch": 3.186195826645265,
"grad_norm": 0.1917648485899303,
"learning_rate": 2.151381431857016e-06,
"loss": 0.3867594599723816,
"step": 994,
"token_acc": 0.8688500205724047
},
{
"epoch": 3.189406099518459,
"grad_norm": 0.20898748869275882,
"learning_rate": 2.144678110538565e-06,
"loss": 0.4288737177848816,
"step": 995,
"token_acc": 0.8575603299412129
},
{
"epoch": 3.192616372391653,
"grad_norm": 0.1927262947095155,
"learning_rate": 2.137980632600869e-06,
"loss": 0.44091796875,
"step": 996,
"token_acc": 0.8518801725086996
},
{
"epoch": 3.1958266452648476,
"grad_norm": 0.20628044257936604,
"learning_rate": 2.1312890269196606e-06,
"loss": 0.4136962890625,
"step": 997,
"token_acc": 0.8620663241011797
},
{
"epoch": 3.1990369181380416,
"grad_norm": 0.18145359781735665,
"learning_rate": 2.1246033223453577e-06,
"loss": 0.4010009765625,
"step": 998,
"token_acc": 0.8663384327083898
},
{
"epoch": 3.202247191011236,
"grad_norm": 0.19250739862363292,
"learning_rate": 2.117923547702931e-06,
"loss": 0.4561360776424408,
"step": 999,
"token_acc": 0.8479883342024478
},
{
"epoch": 3.20545746388443,
"grad_norm": 0.21105422746513886,
"learning_rate": 2.111249731791789e-06,
"loss": 0.402587890625,
"step": 1000,
"token_acc": 0.865575555443972
},
{
"epoch": 3.2086677367576244,
"grad_norm": 0.19229547194042054,
"learning_rate": 2.1045819033856467e-06,
"loss": 0.4468587338924408,
"step": 1001,
"token_acc": 0.8522915598447114
},
{
"epoch": 3.211878009630819,
"grad_norm": 0.2212323711144648,
"learning_rate": 2.097920091232407e-06,
"loss": 0.4637858271598816,
"step": 1002,
"token_acc": 0.8459636819096726
},
{
"epoch": 3.215088282504013,
"grad_norm": 0.19460807788678874,
"learning_rate": 2.0912643240540335e-06,
"loss": 0.3863932490348816,
"step": 1003,
"token_acc": 0.8696600839923991
},
{
"epoch": 3.218298555377207,
"grad_norm": 0.1870363025497072,
"learning_rate": 2.0846146305464225e-06,
"loss": 0.3617960810661316,
"step": 1004,
"token_acc": 0.8792915105125078
},
{
"epoch": 3.221508828250401,
"grad_norm": 0.20513714397130442,
"learning_rate": 2.0779710393792932e-06,
"loss": 0.3919270932674408,
"step": 1005,
"token_acc": 0.8705517097544863
},
{
"epoch": 3.2247191011235956,
"grad_norm": 0.19359366136276182,
"learning_rate": 2.0713335791960465e-06,
"loss": 0.429443359375,
"step": 1006,
"token_acc": 0.857015650051013
},
{
"epoch": 3.2279293739967896,
"grad_norm": 0.19892443344289307,
"learning_rate": 2.0647022786136554e-06,
"loss": 0.3619384765625,
"step": 1007,
"token_acc": 0.8794119270801904
},
{
"epoch": 3.231139646869984,
"grad_norm": 0.22539113984089676,
"learning_rate": 2.0580771662225306e-06,
"loss": 0.4684651792049408,
"step": 1008,
"token_acc": 0.8445699764674204
},
{
"epoch": 3.234349919743178,
"grad_norm": 0.18846405767145527,
"learning_rate": 2.0514582705864104e-06,
"loss": 0.3821614682674408,
"step": 1009,
"token_acc": 0.8711276613950546
},
{
"epoch": 3.2375601926163724,
"grad_norm": 0.1917418511098612,
"learning_rate": 2.0448456202422237e-06,
"loss": 0.4560750424861908,
"step": 1010,
"token_acc": 0.8468040474533667
},
{
"epoch": 3.240770465489567,
"grad_norm": 0.1817089652687439,
"learning_rate": 2.038239243699975e-06,
"loss": 0.4158528745174408,
"step": 1011,
"token_acc": 0.8605414302163478
},
{
"epoch": 3.243980738362761,
"grad_norm": 0.18348380772382225,
"learning_rate": 2.0316391694426233e-06,
"loss": 0.3862508237361908,
"step": 1012,
"token_acc": 0.8690134339812778
},
{
"epoch": 3.247191011235955,
"grad_norm": 0.17828657081582217,
"learning_rate": 2.025045425925949e-06,
"loss": 0.4139607846736908,
"step": 1013,
"token_acc": 0.860817646908989
},
{
"epoch": 3.250401284109149,
"grad_norm": 0.1998903770169443,
"learning_rate": 2.0184580415784434e-06,
"loss": 0.4397786557674408,
"step": 1014,
"token_acc": 0.8526518065840059
},
{
"epoch": 3.2536115569823436,
"grad_norm": 0.20429705875890047,
"learning_rate": 2.011877044801176e-06,
"loss": 0.448974609375,
"step": 1015,
"token_acc": 0.8497649731919912
},
{
"epoch": 3.2568218298555376,
"grad_norm": 0.23203221634672286,
"learning_rate": 2.0053024639676837e-06,
"loss": 0.4231770932674408,
"step": 1016,
"token_acc": 0.858826049953659
},
{
"epoch": 3.260032102728732,
"grad_norm": 0.21003263011654322,
"learning_rate": 1.9987343274238364e-06,
"loss": 0.4134928584098816,
"step": 1017,
"token_acc": 0.8616680586797567
},
{
"epoch": 3.263242375601926,
"grad_norm": 0.18406406356467608,
"learning_rate": 1.9921726634877184e-06,
"loss": 0.4248860776424408,
"step": 1018,
"token_acc": 0.8571214968287904
},
{
"epoch": 3.2664526484751204,
"grad_norm": 0.17697575754028663,
"learning_rate": 1.9856175004495094e-06,
"loss": 0.4256998896598816,
"step": 1019,
"token_acc": 0.8561404893189583
},
{
"epoch": 3.2696629213483144,
"grad_norm": 0.2605367137069997,
"learning_rate": 1.9790688665713654e-06,
"loss": 0.4170735776424408,
"step": 1020,
"token_acc": 0.8610197515302906
},
{
"epoch": 3.272873194221509,
"grad_norm": 0.20227045376577477,
"learning_rate": 1.9725267900872873e-06,
"loss": 0.4197591245174408,
"step": 1021,
"token_acc": 0.8594213814186346
},
{
"epoch": 3.276083467094703,
"grad_norm": 0.21082038704435982,
"learning_rate": 1.965991299203003e-06,
"loss": 0.4347737729549408,
"step": 1022,
"token_acc": 0.8566319448464662
},
{
"epoch": 3.279293739967897,
"grad_norm": 0.1971525837293343,
"learning_rate": 1.9594624220958527e-06,
"loss": 0.4326985776424408,
"step": 1023,
"token_acc": 0.8559198751263897
},
{
"epoch": 3.2825040128410916,
"grad_norm": 0.22558923803886857,
"learning_rate": 1.952940186914657e-06,
"loss": 0.4354248046875,
"step": 1024,
"token_acc": 0.8549900900806426
},
{
"epoch": 3.2857142857142856,
"grad_norm": 0.20270403082450897,
"learning_rate": 1.946424621779602e-06,
"loss": 0.3877767026424408,
"step": 1025,
"token_acc": 0.8710687777985294
},
{
"epoch": 3.28892455858748,
"grad_norm": 0.18679692952044738,
"learning_rate": 1.9399157547821164e-06,
"loss": 0.4424235224723816,
"step": 1026,
"token_acc": 0.8517893098255529
},
{
"epoch": 3.292134831460674,
"grad_norm": 0.1810744270024889,
"learning_rate": 1.9334136139847496e-06,
"loss": 0.4027913510799408,
"step": 1027,
"token_acc": 0.8643299823981214
},
{
"epoch": 3.2953451043338684,
"grad_norm": 0.20370073172200107,
"learning_rate": 1.9269182274210527e-06,
"loss": 0.4059651792049408,
"step": 1028,
"token_acc": 0.8649818280909212
},
{
"epoch": 3.2985553772070624,
"grad_norm": 0.20792285403348992,
"learning_rate": 1.9204296230954554e-06,
"loss": 0.4032389521598816,
"step": 1029,
"token_acc": 0.865385416259151
},
{
"epoch": 3.301765650080257,
"grad_norm": 0.205865819325021,
"learning_rate": 1.913947828983146e-06,
"loss": 0.3989054560661316,
"step": 1030,
"token_acc": 0.8651247377965826
},
{
"epoch": 3.304975922953451,
"grad_norm": 0.18321717295258097,
"learning_rate": 1.907472873029951e-06,
"loss": 0.425048828125,
"step": 1031,
"token_acc": 0.8577393883418105
},
{
"epoch": 3.308186195826645,
"grad_norm": 0.20031557766613411,
"learning_rate": 1.9010047831522165e-06,
"loss": 0.4626871943473816,
"step": 1032,
"token_acc": 0.8463195523818052
},
{
"epoch": 3.3113964686998396,
"grad_norm": 0.18526980410321806,
"learning_rate": 1.8945435872366825e-06,
"loss": 0.3961588740348816,
"step": 1033,
"token_acc": 0.8664393756668253
},
{
"epoch": 3.3146067415730336,
"grad_norm": 0.18682164125358777,
"learning_rate": 1.8880893131403718e-06,
"loss": 0.454345703125,
"step": 1034,
"token_acc": 0.8475725916635419
},
{
"epoch": 3.317817014446228,
"grad_norm": 0.17088822277097934,
"learning_rate": 1.881641988690457e-06,
"loss": 0.3936360776424408,
"step": 1035,
"token_acc": 0.8675179845668575
},
{
"epoch": 3.321027287319422,
"grad_norm": 0.2650766352186625,
"learning_rate": 1.8752016416841512e-06,
"loss": 0.4297282099723816,
"step": 1036,
"token_acc": 0.856829096564697
},
{
"epoch": 3.3242375601926164,
"grad_norm": 0.19811218982637763,
"learning_rate": 1.8687682998885876e-06,
"loss": 0.3743693232536316,
"step": 1037,
"token_acc": 0.8732669791645157
},
{
"epoch": 3.3274478330658104,
"grad_norm": 0.1881599599845014,
"learning_rate": 1.8623419910406943e-06,
"loss": 0.4101969599723816,
"step": 1038,
"token_acc": 0.8626658730357066
},
{
"epoch": 3.330658105939005,
"grad_norm": 0.17212595163092956,
"learning_rate": 1.8559227428470747e-06,
"loss": 0.3761800229549408,
"step": 1039,
"token_acc": 0.8748992295470729
},
{
"epoch": 3.333868378812199,
"grad_norm": 0.18688649843976768,
"learning_rate": 1.8495105829838924e-06,
"loss": 0.431640625,
"step": 1040,
"token_acc": 0.8571577847439916
},
{
"epoch": 3.337078651685393,
"grad_norm": 0.1860904717139982,
"learning_rate": 1.8431055390967545e-06,
"loss": 0.4176839292049408,
"step": 1041,
"token_acc": 0.8614076828902081
},
{
"epoch": 3.3402889245585876,
"grad_norm": 0.17756907190261467,
"learning_rate": 1.8367076388005824e-06,
"loss": 0.3920491635799408,
"step": 1042,
"token_acc": 0.867667418755237
},
{
"epoch": 3.3434991974317816,
"grad_norm": 0.213774360292066,
"learning_rate": 1.8303169096795024e-06,
"loss": 0.4037882685661316,
"step": 1043,
"token_acc": 0.8641746530816108
},
{
"epoch": 3.346709470304976,
"grad_norm": 0.20945621378126955,
"learning_rate": 1.8239333792867157e-06,
"loss": 0.4090169370174408,
"step": 1044,
"token_acc": 0.8622145082550352
},
{
"epoch": 3.34991974317817,
"grad_norm": 0.22957551762719874,
"learning_rate": 1.8175570751443967e-06,
"loss": 0.4228108823299408,
"step": 1045,
"token_acc": 0.8606316225992386
},
{
"epoch": 3.3531300160513644,
"grad_norm": 0.2130972617406167,
"learning_rate": 1.8111880247435576e-06,
"loss": 0.441650390625,
"step": 1046,
"token_acc": 0.8529377749003533
},
{
"epoch": 3.3563402889245584,
"grad_norm": 0.21202748343191813,
"learning_rate": 1.8048262555439376e-06,
"loss": 0.42041015625,
"step": 1047,
"token_acc": 0.8588775990968166
},
{
"epoch": 3.359550561797753,
"grad_norm": 0.19406791196728218,
"learning_rate": 1.7984717949738856e-06,
"loss": 0.447998046875,
"step": 1048,
"token_acc": 0.8512821019043048
},
{
"epoch": 3.362760834670947,
"grad_norm": 0.20578464294018312,
"learning_rate": 1.7921246704302371e-06,
"loss": 0.420135498046875,
"step": 1049,
"token_acc": 0.8593331574316282
},
{
"epoch": 3.365971107544141,
"grad_norm": 0.1932854579791998,
"learning_rate": 1.785784909278201e-06,
"loss": 0.4093017578125,
"step": 1050,
"token_acc": 0.86227689958784
},
{
"epoch": 3.3691813804173356,
"grad_norm": 0.25070884029858714,
"learning_rate": 1.779452538851238e-06,
"loss": 0.4414876401424408,
"step": 1051,
"token_acc": 0.852587075660224
},
{
"epoch": 3.3723916532905296,
"grad_norm": 0.18868857617721213,
"learning_rate": 1.7731275864509448e-06,
"loss": 0.4228922724723816,
"step": 1052,
"token_acc": 0.8585962533972208
},
{
"epoch": 3.375601926163724,
"grad_norm": 0.18853519073810057,
"learning_rate": 1.7668100793469358e-06,
"loss": 0.4308268427848816,
"step": 1053,
"token_acc": 0.8560997968551486
},
{
"epoch": 3.378812199036918,
"grad_norm": 0.19465336749692538,
"learning_rate": 1.7605000447767236e-06,
"loss": 0.3777669370174408,
"step": 1054,
"token_acc": 0.8727723193027433
},
{
"epoch": 3.3820224719101124,
"grad_norm": 0.2399168093589425,
"learning_rate": 1.75419750994561e-06,
"loss": 0.4230143427848816,
"step": 1055,
"token_acc": 0.8576663638855595
},
{
"epoch": 3.3852327447833064,
"grad_norm": 0.19297225432494655,
"learning_rate": 1.7479025020265528e-06,
"loss": 0.4257405698299408,
"step": 1056,
"token_acc": 0.8584355198978895
},
{
"epoch": 3.388443017656501,
"grad_norm": 0.19770981928921164,
"learning_rate": 1.7416150481600637e-06,
"loss": 0.427001953125,
"step": 1057,
"token_acc": 0.8572976882953657
},
{
"epoch": 3.391653290529695,
"grad_norm": 0.1967714452508458,
"learning_rate": 1.7353351754540841e-06,
"loss": 0.4223226010799408,
"step": 1058,
"token_acc": 0.8588774508651131
},
{
"epoch": 3.394863563402889,
"grad_norm": 0.20156157565544705,
"learning_rate": 1.7290629109838722e-06,
"loss": 0.44744873046875,
"step": 1059,
"token_acc": 0.8510266465895987
},
{
"epoch": 3.3980738362760836,
"grad_norm": 0.29294148874106124,
"learning_rate": 1.7227982817918816e-06,
"loss": 0.4371337890625,
"step": 1060,
"token_acc": 0.8539906878706369
},
{
"epoch": 3.4012841091492776,
"grad_norm": 0.17675164494761733,
"learning_rate": 1.7165413148876447e-06,
"loss": 0.4316813349723816,
"step": 1061,
"token_acc": 0.8541013910639639
},
{
"epoch": 3.404494382022472,
"grad_norm": 0.1913527448471836,
"learning_rate": 1.7102920372476608e-06,
"loss": 0.4197184443473816,
"step": 1062,
"token_acc": 0.8606378569957566
},
{
"epoch": 3.407704654895666,
"grad_norm": 0.19051572266218328,
"learning_rate": 1.70405047581528e-06,
"loss": 0.3822021484375,
"step": 1063,
"token_acc": 0.8714002443917528
},
{
"epoch": 3.4109149277688604,
"grad_norm": 0.19422030281559116,
"learning_rate": 1.697816657500582e-06,
"loss": 0.4290771484375,
"step": 1064,
"token_acc": 0.8554621600932835
},
{
"epoch": 3.4141252006420544,
"grad_norm": 0.17356650516464564,
"learning_rate": 1.6915906091802583e-06,
"loss": 0.3907877802848816,
"step": 1065,
"token_acc": 0.869664393294144
},
{
"epoch": 3.417335473515249,
"grad_norm": 0.18506265465323435,
"learning_rate": 1.6853723576975085e-06,
"loss": 0.4239095151424408,
"step": 1066,
"token_acc": 0.8579896319427388
},
{
"epoch": 3.420545746388443,
"grad_norm": 0.18809580147202737,
"learning_rate": 1.6791619298619126e-06,
"loss": 0.4306233823299408,
"step": 1067,
"token_acc": 0.8561063060656751
},
{
"epoch": 3.423756019261637,
"grad_norm": 0.21513901135562274,
"learning_rate": 1.6729593524493186e-06,
"loss": 0.4053751826286316,
"step": 1068,
"token_acc": 0.8634594704413222
},
{
"epoch": 3.4269662921348316,
"grad_norm": 0.20250597705842183,
"learning_rate": 1.6667646522017295e-06,
"loss": 0.444091796875,
"step": 1069,
"token_acc": 0.8529030188498646
},
{
"epoch": 3.4301765650080256,
"grad_norm": 0.17578672679283108,
"learning_rate": 1.6605778558271862e-06,
"loss": 0.4008992612361908,
"step": 1070,
"token_acc": 0.8645487521749183
},
{
"epoch": 3.43338683788122,
"grad_norm": 0.183147867321015,
"learning_rate": 1.6543989899996526e-06,
"loss": 0.467529296875,
"step": 1071,
"token_acc": 0.8420635991977785
},
{
"epoch": 3.436597110754414,
"grad_norm": 0.18479237213228097,
"learning_rate": 1.6482280813588998e-06,
"loss": 0.4294026792049408,
"step": 1072,
"token_acc": 0.8548516566839212
},
{
"epoch": 3.4398073836276084,
"grad_norm": 0.200020323621944,
"learning_rate": 1.642065156510393e-06,
"loss": 0.4534912109375,
"step": 1073,
"token_acc": 0.849654429128104
},
{
"epoch": 3.4430176565008024,
"grad_norm": 0.2045486438267927,
"learning_rate": 1.6359102420251753e-06,
"loss": 0.4272868037223816,
"step": 1074,
"token_acc": 0.8583280572521158
},
{
"epoch": 3.446227929373997,
"grad_norm": 0.20198169472431834,
"learning_rate": 1.6297633644397536e-06,
"loss": 0.4178263545036316,
"step": 1075,
"token_acc": 0.860547363917803
},
{
"epoch": 3.449438202247191,
"grad_norm": 0.20642737825944937,
"learning_rate": 1.6236245502559828e-06,
"loss": 0.4259033203125,
"step": 1076,
"token_acc": 0.8581598032324395
},
{
"epoch": 3.452648475120385,
"grad_norm": 0.2186125005753993,
"learning_rate": 1.6174938259409593e-06,
"loss": 0.440673828125,
"step": 1077,
"token_acc": 0.8533212996389892
},
{
"epoch": 3.4558587479935796,
"grad_norm": 0.1996792453333026,
"learning_rate": 1.611371217926891e-06,
"loss": 0.4647623896598816,
"step": 1078,
"token_acc": 0.8443061839272337
},
{
"epoch": 3.4590690208667736,
"grad_norm": 0.18869326440153297,
"learning_rate": 1.6052567526109985e-06,
"loss": 0.4488525390625,
"step": 1079,
"token_acc": 0.8479692213321363
},
{
"epoch": 3.462279293739968,
"grad_norm": 0.17097828495515807,
"learning_rate": 1.5991504563553965e-06,
"loss": 0.3851521909236908,
"step": 1080,
"token_acc": 0.8694733632443263
},
{
"epoch": 3.465489566613162,
"grad_norm": 0.18433664031908198,
"learning_rate": 1.5930523554869788e-06,
"loss": 0.4507243037223816,
"step": 1081,
"token_acc": 0.8492495641690979
},
{
"epoch": 3.4686998394863564,
"grad_norm": 0.18108294483264578,
"learning_rate": 1.5869624762973012e-06,
"loss": 0.4585368037223816,
"step": 1082,
"token_acc": 0.8456435862523524
},
{
"epoch": 3.4719101123595504,
"grad_norm": 0.1862200238454039,
"learning_rate": 1.5808808450424756e-06,
"loss": 0.4508056640625,
"step": 1083,
"token_acc": 0.8489838434912815
},
{
"epoch": 3.475120385232745,
"grad_norm": 0.19007392226631,
"learning_rate": 1.5748074879430552e-06,
"loss": 0.4535319209098816,
"step": 1084,
"token_acc": 0.8487748765942239
},
{
"epoch": 3.478330658105939,
"grad_norm": 0.20476245510406388,
"learning_rate": 1.5687424311839173e-06,
"loss": 0.4156494140625,
"step": 1085,
"token_acc": 0.8628871056168218
},
{
"epoch": 3.481540930979133,
"grad_norm": 0.1815932832460928,
"learning_rate": 1.5626857009141536e-06,
"loss": 0.3939208984375,
"step": 1086,
"token_acc": 0.8679247922855052
},
{
"epoch": 3.4847512038523276,
"grad_norm": 0.1767455152787183,
"learning_rate": 1.5566373232469535e-06,
"loss": 0.4333903193473816,
"step": 1087,
"token_acc": 0.8548684500362472
},
{
"epoch": 3.4879614767255216,
"grad_norm": 0.22172730233556032,
"learning_rate": 1.5505973242595009e-06,
"loss": 0.400390625,
"step": 1088,
"token_acc": 0.8676109572690723
},
{
"epoch": 3.491171749598716,
"grad_norm": 0.19981305365613308,
"learning_rate": 1.5445657299928508e-06,
"loss": 0.4515787959098816,
"step": 1089,
"token_acc": 0.8486542162122311
},
{
"epoch": 3.49438202247191,
"grad_norm": 0.18951995793081028,
"learning_rate": 1.538542566451824e-06,
"loss": 0.4265950620174408,
"step": 1090,
"token_acc": 0.8581693972059831
},
{
"epoch": 3.4975922953451044,
"grad_norm": 0.20528684366503747,
"learning_rate": 1.5325278596048915e-06,
"loss": 0.4165852963924408,
"step": 1091,
"token_acc": 0.8622296703966629
},
{
"epoch": 3.5008025682182984,
"grad_norm": 0.18774317434356294,
"learning_rate": 1.5265216353840644e-06,
"loss": 0.4366455078125,
"step": 1092,
"token_acc": 0.8520290575983089
},
{
"epoch": 3.504012841091493,
"grad_norm": 0.18366926525043567,
"learning_rate": 1.5205239196847812e-06,
"loss": 0.4434000849723816,
"step": 1093,
"token_acc": 0.8506869031300036
},
{
"epoch": 3.5072231139646872,
"grad_norm": 0.19422266190957502,
"learning_rate": 1.5145347383657976e-06,
"loss": 0.4432576596736908,
"step": 1094,
"token_acc": 0.8515391064798179
},
{
"epoch": 3.510433386837881,
"grad_norm": 0.19262752744132836,
"learning_rate": 1.508554117249072e-06,
"loss": 0.4134928584098816,
"step": 1095,
"token_acc": 0.8611599095351802
},
{
"epoch": 3.513643659711075,
"grad_norm": 0.17738962252261778,
"learning_rate": 1.5025820821196583e-06,
"loss": 0.4323323667049408,
"step": 1096,
"token_acc": 0.8561772785419778
},
{
"epoch": 3.5168539325842696,
"grad_norm": 0.21144620803921835,
"learning_rate": 1.4966186587255889e-06,
"loss": 0.4349772334098816,
"step": 1097,
"token_acc": 0.8560637856538078
},
{
"epoch": 3.520064205457464,
"grad_norm": 0.18107969840908128,
"learning_rate": 1.4906638727777738e-06,
"loss": 0.4111124873161316,
"step": 1098,
"token_acc": 0.8620681322045094
},
{
"epoch": 3.523274478330658,
"grad_norm": 0.1930633849319264,
"learning_rate": 1.4847177499498753e-06,
"loss": 0.41156005859375,
"step": 1099,
"token_acc": 0.8612365749114891
},
{
"epoch": 3.5264847512038524,
"grad_norm": 0.20760349114225327,
"learning_rate": 1.4787803158782105e-06,
"loss": 0.4454752802848816,
"step": 1100,
"token_acc": 0.8515730061840415
},
{
"epoch": 3.5296950240770464,
"grad_norm": 0.20209657459754093,
"learning_rate": 1.4728515961616324e-06,
"loss": 0.4329020380973816,
"step": 1101,
"token_acc": 0.8555188046740311
},
{
"epoch": 3.532905296950241,
"grad_norm": 0.20337891262559785,
"learning_rate": 1.4669316163614273e-06,
"loss": 0.399169921875,
"step": 1102,
"token_acc": 0.8667258058203838
},
{
"epoch": 3.5361155698234352,
"grad_norm": 0.1858405661724601,
"learning_rate": 1.461020402001196e-06,
"loss": 0.4512125849723816,
"step": 1103,
"token_acc": 0.8481120263363716
},
{
"epoch": 3.539325842696629,
"grad_norm": 0.21039533480772285,
"learning_rate": 1.4551179785667453e-06,
"loss": 0.4616292417049408,
"step": 1104,
"token_acc": 0.8470263196161116
},
{
"epoch": 3.542536115569823,
"grad_norm": 0.21814246301992302,
"learning_rate": 1.449224371505988e-06,
"loss": 0.3966064453125,
"step": 1105,
"token_acc": 0.8673032108437989
},
{
"epoch": 3.5457463884430176,
"grad_norm": 0.1799367306449473,
"learning_rate": 1.443339606228819e-06,
"loss": 0.416748046875,
"step": 1106,
"token_acc": 0.861918932987615
},
{
"epoch": 3.548956661316212,
"grad_norm": 0.1978903281145319,
"learning_rate": 1.4374637081070172e-06,
"loss": 0.447998046875,
"step": 1107,
"token_acc": 0.8488240043187931
},
{
"epoch": 3.552166934189406,
"grad_norm": 0.20214133587068753,
"learning_rate": 1.4315967024741249e-06,
"loss": 0.4388427734375,
"step": 1108,
"token_acc": 0.8534245399592104
},
{
"epoch": 3.5553772070626004,
"grad_norm": 0.1712702137477399,
"learning_rate": 1.4257386146253524e-06,
"loss": 0.4484456479549408,
"step": 1109,
"token_acc": 0.8492724436033093
},
{
"epoch": 3.5585874799357944,
"grad_norm": 0.2248722413536391,
"learning_rate": 1.419889469817458e-06,
"loss": 0.4341227412223816,
"step": 1110,
"token_acc": 0.8557268722466961
},
{
"epoch": 3.561797752808989,
"grad_norm": 0.23382033976015443,
"learning_rate": 1.4140492932686423e-06,
"loss": 0.4353434443473816,
"step": 1111,
"token_acc": 0.8557752170517189
},
{
"epoch": 3.5650080256821832,
"grad_norm": 0.20087595501719435,
"learning_rate": 1.4082181101584404e-06,
"loss": 0.3586222529411316,
"step": 1112,
"token_acc": 0.8811013304106066
},
{
"epoch": 3.568218298555377,
"grad_norm": 0.17604647276989846,
"learning_rate": 1.4023959456276134e-06,
"loss": 0.4018758237361908,
"step": 1113,
"token_acc": 0.8666440619959501
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.19157315090032623,
"learning_rate": 1.396582824778039e-06,
"loss": 0.4198405146598816,
"step": 1114,
"token_acc": 0.8591179259464835
},
{
"epoch": 3.5746388443017656,
"grad_norm": 0.20802221020070402,
"learning_rate": 1.390778772672603e-06,
"loss": 0.4651286005973816,
"step": 1115,
"token_acc": 0.8433386152010968
},
{
"epoch": 3.57784911717496,
"grad_norm": 0.18483138886919764,
"learning_rate": 1.3849838143350928e-06,
"loss": 0.42041015625,
"step": 1116,
"token_acc": 0.8592253803464793
},
{
"epoch": 3.581059390048154,
"grad_norm": 0.3119281222048741,
"learning_rate": 1.379197974750088e-06,
"loss": 0.4130859375,
"step": 1117,
"token_acc": 0.8610746129086024
},
{
"epoch": 3.5842696629213484,
"grad_norm": 0.2250280863661116,
"learning_rate": 1.3734212788628542e-06,
"loss": 0.4375407099723816,
"step": 1118,
"token_acc": 0.852986013309395
},
{
"epoch": 3.5874799357945424,
"grad_norm": 0.2319937697956126,
"learning_rate": 1.367653751579232e-06,
"loss": 0.4305623471736908,
"step": 1119,
"token_acc": 0.8549000532980083
},
{
"epoch": 3.590690208667737,
"grad_norm": 0.20874108633816135,
"learning_rate": 1.3618954177655385e-06,
"loss": 0.439453125,
"step": 1120,
"token_acc": 0.8523911679448383
},
{
"epoch": 3.5939004815409312,
"grad_norm": 0.18549311244906191,
"learning_rate": 1.3561463022484448e-06,
"loss": 0.4788004755973816,
"step": 1121,
"token_acc": 0.8400730047060446
},
{
"epoch": 3.597110754414125,
"grad_norm": 0.21571752979548245,
"learning_rate": 1.3504064298148833e-06,
"loss": 0.3993733823299408,
"step": 1122,
"token_acc": 0.8675419990525366
},
{
"epoch": 3.600321027287319,
"grad_norm": 0.20805103036182104,
"learning_rate": 1.3446758252119366e-06,
"loss": 0.4430135190486908,
"step": 1123,
"token_acc": 0.8532176419899573
},
{
"epoch": 3.6035313001605136,
"grad_norm": 0.18063209287797274,
"learning_rate": 1.3389545131467282e-06,
"loss": 0.3862711787223816,
"step": 1124,
"token_acc": 0.8700313564313901
},
{
"epoch": 3.606741573033708,
"grad_norm": 0.20079075250668693,
"learning_rate": 1.3332425182863144e-06,
"loss": 0.4552001953125,
"step": 1125,
"token_acc": 0.8473574577338565
},
{
"epoch": 3.609951845906902,
"grad_norm": 0.24814098522889355,
"learning_rate": 1.3275398652575832e-06,
"loss": 0.429931640625,
"step": 1126,
"token_acc": 0.8559153243433212
},
{
"epoch": 3.6131621187800964,
"grad_norm": 0.20186596614224114,
"learning_rate": 1.321846578647149e-06,
"loss": 0.4421793818473816,
"step": 1127,
"token_acc": 0.8515140875865225
},
{
"epoch": 3.6163723916532904,
"grad_norm": 0.1972261927521971,
"learning_rate": 1.3161626830012393e-06,
"loss": 0.3323161005973816,
"step": 1128,
"token_acc": 0.8880017935239064
},
{
"epoch": 3.619582664526485,
"grad_norm": 0.21275115071244285,
"learning_rate": 1.3104882028255943e-06,
"loss": 0.413818359375,
"step": 1129,
"token_acc": 0.8623194980291856
},
{
"epoch": 3.6227929373996792,
"grad_norm": 0.2484032770033825,
"learning_rate": 1.3048231625853613e-06,
"loss": 0.4005330502986908,
"step": 1130,
"token_acc": 0.865271191260239
},
{
"epoch": 3.626003210272873,
"grad_norm": 0.19869890039868315,
"learning_rate": 1.2991675867049857e-06,
"loss": 0.4834798276424408,
"step": 1131,
"token_acc": 0.8374210033176216
},
{
"epoch": 3.629213483146067,
"grad_norm": 0.1847331440731312,
"learning_rate": 1.29352149956811e-06,
"loss": 0.4325968623161316,
"step": 1132,
"token_acc": 0.8542738246013079
},
{
"epoch": 3.6324237560192616,
"grad_norm": 0.2072766407741738,
"learning_rate": 1.2878849255174652e-06,
"loss": 0.3939208984375,
"step": 1133,
"token_acc": 0.8672659842381567
},
{
"epoch": 3.635634028892456,
"grad_norm": 0.20737047250930485,
"learning_rate": 1.282257888854768e-06,
"loss": 0.4196370542049408,
"step": 1134,
"token_acc": 0.8583164933418487
},
{
"epoch": 3.63884430176565,
"grad_norm": 0.20254598573908703,
"learning_rate": 1.2766404138406151e-06,
"loss": 0.4584147334098816,
"step": 1135,
"token_acc": 0.845699420024143
},
{
"epoch": 3.6420545746388444,
"grad_norm": 0.18676784506103364,
"learning_rate": 1.271032524694379e-06,
"loss": 0.4593912959098816,
"step": 1136,
"token_acc": 0.8467912156242051
},
{
"epoch": 3.6452648475120384,
"grad_norm": 0.18404519693391058,
"learning_rate": 1.2654342455941026e-06,
"loss": 0.3817545771598816,
"step": 1137,
"token_acc": 0.8718617498601229
},
{
"epoch": 3.648475120385233,
"grad_norm": 0.19740877602255696,
"learning_rate": 1.2598456006763967e-06,
"loss": 0.4697265625,
"step": 1138,
"token_acc": 0.8445315531024508
},
{
"epoch": 3.6516853932584272,
"grad_norm": 0.18986912441857076,
"learning_rate": 1.2542666140363343e-06,
"loss": 0.4017333984375,
"step": 1139,
"token_acc": 0.863951752722064
},
{
"epoch": 3.654895666131621,
"grad_norm": 0.1789450935530543,
"learning_rate": 1.2486973097273469e-06,
"loss": 0.4638671875,
"step": 1140,
"token_acc": 0.8456421328521097
},
{
"epoch": 3.658105939004815,
"grad_norm": 0.22878218933401717,
"learning_rate": 1.2431377117611247e-06,
"loss": 0.4156901240348816,
"step": 1141,
"token_acc": 0.8583248996535044
},
{
"epoch": 3.6613162118780096,
"grad_norm": 0.19936716843844962,
"learning_rate": 1.2375878441075035e-06,
"loss": 0.4248860776424408,
"step": 1142,
"token_acc": 0.8554407218253489
},
{
"epoch": 3.664526484751204,
"grad_norm": 0.2154430937518813,
"learning_rate": 1.2320477306943728e-06,
"loss": 0.450439453125,
"step": 1143,
"token_acc": 0.8500447622106792
},
{
"epoch": 3.667736757624398,
"grad_norm": 0.19034491907180862,
"learning_rate": 1.2265173954075636e-06,
"loss": 0.4224446713924408,
"step": 1144,
"token_acc": 0.8556876330592085
},
{
"epoch": 3.6709470304975924,
"grad_norm": 0.17777679326467932,
"learning_rate": 1.2209968620907537e-06,
"loss": 0.3982747495174408,
"step": 1145,
"token_acc": 0.8657402270980548
},
{
"epoch": 3.6741573033707864,
"grad_norm": 0.18537100453971045,
"learning_rate": 1.2154861545453573e-06,
"loss": 0.4394938349723816,
"step": 1146,
"token_acc": 0.8517562914294342
},
{
"epoch": 3.677367576243981,
"grad_norm": 0.1815554961453709,
"learning_rate": 1.2099852965304223e-06,
"loss": 0.4091390073299408,
"step": 1147,
"token_acc": 0.8609376121760356
},
{
"epoch": 3.6805778491171752,
"grad_norm": 0.19774937138330145,
"learning_rate": 1.2044943117625385e-06,
"loss": 0.4618733823299408,
"step": 1148,
"token_acc": 0.844277955355182
},
{
"epoch": 3.683788121990369,
"grad_norm": 0.1776439903869948,
"learning_rate": 1.1990132239157223e-06,
"loss": 0.4086507260799408,
"step": 1149,
"token_acc": 0.8623496479124965
},
{
"epoch": 3.686998394863563,
"grad_norm": 0.18174880879649488,
"learning_rate": 1.193542056621323e-06,
"loss": 0.4283040463924408,
"step": 1150,
"token_acc": 0.8550191434654225
},
{
"epoch": 3.6902086677367576,
"grad_norm": 0.2481070438423436,
"learning_rate": 1.1880808334679128e-06,
"loss": 0.4292806088924408,
"step": 1151,
"token_acc": 0.8554810096320871
},
{
"epoch": 3.693418940609952,
"grad_norm": 0.20829243678595485,
"learning_rate": 1.1826295780011986e-06,
"loss": 0.4122721552848816,
"step": 1152,
"token_acc": 0.8618100241611001
},
{
"epoch": 3.696629213483146,
"grad_norm": 0.20267863009891,
"learning_rate": 1.1771883137239067e-06,
"loss": 0.4258219599723816,
"step": 1153,
"token_acc": 0.8578250897204026
},
{
"epoch": 3.6998394863563404,
"grad_norm": 0.1732365798885481,
"learning_rate": 1.171757064095688e-06,
"loss": 0.4292806088924408,
"step": 1154,
"token_acc": 0.8550376709009048
},
{
"epoch": 3.7030497592295344,
"grad_norm": 0.20397059894965575,
"learning_rate": 1.1663358525330169e-06,
"loss": 0.4227294921875,
"step": 1155,
"token_acc": 0.8581558816694186
},
{
"epoch": 3.706260032102729,
"grad_norm": 0.19318253491872506,
"learning_rate": 1.1609247024090888e-06,
"loss": 0.4373779296875,
"step": 1156,
"token_acc": 0.8549174782922965
},
{
"epoch": 3.7094703049759232,
"grad_norm": 0.2202135539105138,
"learning_rate": 1.1555236370537193e-06,
"loss": 0.3937581479549408,
"step": 1157,
"token_acc": 0.8677404738211112
},
{
"epoch": 3.712680577849117,
"grad_norm": 0.19454901193411642,
"learning_rate": 1.150132679753245e-06,
"loss": 0.4218343198299408,
"step": 1158,
"token_acc": 0.8581630168320942
},
{
"epoch": 3.715890850722311,
"grad_norm": 0.19007603665918127,
"learning_rate": 1.1447518537504223e-06,
"loss": 0.4286295771598816,
"step": 1159,
"token_acc": 0.8576809818327997
},
{
"epoch": 3.7191011235955056,
"grad_norm": 0.1864005885740874,
"learning_rate": 1.1393811822443264e-06,
"loss": 0.42724609375,
"step": 1160,
"token_acc": 0.8559439804490412
},
{
"epoch": 3.7223113964687,
"grad_norm": 0.2071776233361781,
"learning_rate": 1.134020688390253e-06,
"loss": 0.4482015073299408,
"step": 1161,
"token_acc": 0.8493717095311946
},
{
"epoch": 3.725521669341894,
"grad_norm": 0.18174949422424447,
"learning_rate": 1.1286703952996156e-06,
"loss": 0.4434000849723816,
"step": 1162,
"token_acc": 0.8515254746286025
},
{
"epoch": 3.7287319422150884,
"grad_norm": 0.21834567026723553,
"learning_rate": 1.1233303260398527e-06,
"loss": 0.4306437373161316,
"step": 1163,
"token_acc": 0.8558070632128901
},
{
"epoch": 3.7319422150882824,
"grad_norm": 0.19693327775941002,
"learning_rate": 1.1180005036343169e-06,
"loss": 0.4174397885799408,
"step": 1164,
"token_acc": 0.8600014018228531
},
{
"epoch": 3.735152487961477,
"grad_norm": 0.21766650753027825,
"learning_rate": 1.112680951062185e-06,
"loss": 0.480712890625,
"step": 1165,
"token_acc": 0.8373244659218679
},
{
"epoch": 3.738362760834671,
"grad_norm": 0.45324960611379955,
"learning_rate": 1.1073716912583585e-06,
"loss": 0.42547607421875,
"step": 1166,
"token_acc": 0.8576685244412483
},
{
"epoch": 3.741573033707865,
"grad_norm": 0.17298076771061435,
"learning_rate": 1.1020727471133605e-06,
"loss": 0.415771484375,
"step": 1167,
"token_acc": 0.8597448800837256
},
{
"epoch": 3.744783306581059,
"grad_norm": 0.23965119219664902,
"learning_rate": 1.0967841414732362e-06,
"loss": 0.4188639521598816,
"step": 1168,
"token_acc": 0.8588890963340573
},
{
"epoch": 3.7479935794542536,
"grad_norm": 0.23365722674624714,
"learning_rate": 1.0915058971394593e-06,
"loss": 0.4141032099723816,
"step": 1169,
"token_acc": 0.8631264796385842
},
{
"epoch": 3.751203852327448,
"grad_norm": 0.19910767380766714,
"learning_rate": 1.086238036868833e-06,
"loss": 0.4235026240348816,
"step": 1170,
"token_acc": 0.8583032988349921
},
{
"epoch": 3.754414125200642,
"grad_norm": 0.2076959812992001,
"learning_rate": 1.0809805833733883e-06,
"loss": 0.4021809995174408,
"step": 1171,
"token_acc": 0.864621498039401
},
{
"epoch": 3.7576243980738364,
"grad_norm": 0.2152311019271978,
"learning_rate": 1.0757335593202886e-06,
"loss": 0.4444173276424408,
"step": 1172,
"token_acc": 0.8518791374036445
},
{
"epoch": 3.7608346709470304,
"grad_norm": 0.18972984221535008,
"learning_rate": 1.0704969873317306e-06,
"loss": 0.4087321162223816,
"step": 1173,
"token_acc": 0.8632072158845779
},
{
"epoch": 3.764044943820225,
"grad_norm": 0.18348596001885115,
"learning_rate": 1.0652708899848494e-06,
"loss": 0.39996337890625,
"step": 1174,
"token_acc": 0.867081444511051
},
{
"epoch": 3.767255216693419,
"grad_norm": 0.18413587804123854,
"learning_rate": 1.0600552898116172e-06,
"loss": 0.4168701171875,
"step": 1175,
"token_acc": 0.8600618621035828
},
{
"epoch": 3.770465489566613,
"grad_norm": 0.18783556488467354,
"learning_rate": 1.05485020929875e-06,
"loss": 0.430908203125,
"step": 1176,
"token_acc": 0.8561088357397764
},
{
"epoch": 3.773675762439807,
"grad_norm": 0.19351334790591254,
"learning_rate": 1.0496556708876086e-06,
"loss": 0.4070841670036316,
"step": 1177,
"token_acc": 0.862757148366175
},
{
"epoch": 3.7768860353130016,
"grad_norm": 0.20338648261683012,
"learning_rate": 1.0444716969741018e-06,
"loss": 0.3898722529411316,
"step": 1178,
"token_acc": 0.8694972769663388
},
{
"epoch": 3.780096308186196,
"grad_norm": 0.2198683114658492,
"learning_rate": 1.0392983099085907e-06,
"loss": 0.4216715693473816,
"step": 1179,
"token_acc": 0.860980687212047
},
{
"epoch": 3.78330658105939,
"grad_norm": 0.197249833176062,
"learning_rate": 1.0341355319957916e-06,
"loss": 0.3811849057674408,
"step": 1180,
"token_acc": 0.8726937908278484
},
{
"epoch": 3.7865168539325844,
"grad_norm": 0.18020529721358267,
"learning_rate": 1.0289833854946801e-06,
"loss": 0.4131673276424408,
"step": 1181,
"token_acc": 0.8598117137770124
},
{
"epoch": 3.7897271268057784,
"grad_norm": 0.1751467317582751,
"learning_rate": 1.0238418926183956e-06,
"loss": 0.3791097104549408,
"step": 1182,
"token_acc": 0.8740450591380495
},
{
"epoch": 3.792937399678973,
"grad_norm": 0.19982818717924114,
"learning_rate": 1.0187110755341436e-06,
"loss": 0.4158935546875,
"step": 1183,
"token_acc": 0.8595560648576157
},
{
"epoch": 3.796147672552167,
"grad_norm": 0.1702957983130496,
"learning_rate": 1.0135909563631064e-06,
"loss": 0.3960774838924408,
"step": 1184,
"token_acc": 0.8663887873513991
},
{
"epoch": 3.799357945425361,
"grad_norm": 0.19815997110927433,
"learning_rate": 1.0084815571803357e-06,
"loss": 0.4346415400505066,
"step": 1185,
"token_acc": 0.8551757443018521
},
{
"epoch": 3.802568218298555,
"grad_norm": 0.19117230772053223,
"learning_rate": 1.0033829000146702e-06,
"loss": 0.4427490234375,
"step": 1186,
"token_acc": 0.8522398677275012
},
{
"epoch": 3.8057784911717496,
"grad_norm": 0.16156607508241005,
"learning_rate": 9.982950068486312e-07,
"loss": 0.3183797299861908,
"step": 1187,
"token_acc": 0.892503984314602
},
{
"epoch": 3.808988764044944,
"grad_norm": 0.19397666298086996,
"learning_rate": 9.93217899618337e-07,
"loss": 0.3892822265625,
"step": 1188,
"token_acc": 0.8689912031120274
},
{
"epoch": 3.812199036918138,
"grad_norm": 0.18867901164404668,
"learning_rate": 9.881516002133995e-07,
"loss": 0.4711507260799408,
"step": 1189,
"token_acc": 0.8411834094055081
},
{
"epoch": 3.8154093097913324,
"grad_norm": 0.19601761144473115,
"learning_rate": 9.8309613047683e-07,
"loss": 0.4140625,
"step": 1190,
"token_acc": 0.8603962006782715
},
{
"epoch": 3.8186195826645264,
"grad_norm": 0.18588347051992335,
"learning_rate": 9.780515122049564e-07,
"loss": 0.4267781674861908,
"step": 1191,
"token_acc": 0.8573543108369303
},
{
"epoch": 3.821829855537721,
"grad_norm": 0.184640472732058,
"learning_rate": 9.730177671473151e-07,
"loss": 0.4150594174861908,
"step": 1192,
"token_acc": 0.8593442234726605
},
{
"epoch": 3.825040128410915,
"grad_norm": 0.19597044202919392,
"learning_rate": 9.679949170065668e-07,
"loss": 0.4122721552848816,
"step": 1193,
"token_acc": 0.860177508284942
},
{
"epoch": 3.828250401284109,
"grad_norm": 0.16703914573084216,
"learning_rate": 9.629829834383947e-07,
"loss": 0.4218343198299408,
"step": 1194,
"token_acc": 0.8574244150518524
},
{
"epoch": 3.831460674157303,
"grad_norm": 0.1748554615572873,
"learning_rate": 9.579819880514217e-07,
"loss": 0.3974812924861908,
"step": 1195,
"token_acc": 0.8660061383591329
},
{
"epoch": 3.8346709470304976,
"grad_norm": 0.20295828023991147,
"learning_rate": 9.529919524071083e-07,
"loss": 0.3984782099723816,
"step": 1196,
"token_acc": 0.8674985290933711
},
{
"epoch": 3.837881219903692,
"grad_norm": 0.3432526102252035,
"learning_rate": 9.480128980196639e-07,
"loss": 0.4263916015625,
"step": 1197,
"token_acc": 0.8576995283116747
},
{
"epoch": 3.841091492776886,
"grad_norm": 0.20788825979486256,
"learning_rate": 9.430448463559517e-07,
"loss": 0.4392293393611908,
"step": 1198,
"token_acc": 0.8542843486838969
},
{
"epoch": 3.8443017656500804,
"grad_norm": 0.17981879755399513,
"learning_rate": 9.380878188353982e-07,
"loss": 0.4596761167049408,
"step": 1199,
"token_acc": 0.8479297106788458
},
{
"epoch": 3.8475120385232744,
"grad_norm": 0.20592159337948013,
"learning_rate": 9.331418368299001e-07,
"loss": 0.3804931640625,
"step": 1200,
"token_acc": 0.8733441746298368
},
{
"epoch": 3.850722311396469,
"grad_norm": 0.1836405567831981,
"learning_rate": 9.282069216637321e-07,
"loss": 0.4243571162223816,
"step": 1201,
"token_acc": 0.8580087289236278
},
{
"epoch": 3.853932584269663,
"grad_norm": 0.1912407427340455,
"learning_rate": 9.232830946134545e-07,
"loss": 0.4245198667049408,
"step": 1202,
"token_acc": 0.8562418907682475
},
{
"epoch": 3.857142857142857,
"grad_norm": 0.21147971576987462,
"learning_rate": 9.183703769078224e-07,
"loss": 0.4187825620174408,
"step": 1203,
"token_acc": 0.8594776104662264
},
{
"epoch": 3.860353130016051,
"grad_norm": 0.19363916582208013,
"learning_rate": 9.134687897276935e-07,
"loss": 0.4059651792049408,
"step": 1204,
"token_acc": 0.8651164117356721
},
{
"epoch": 3.8635634028892456,
"grad_norm": 0.1979326843110549,
"learning_rate": 9.085783542059362e-07,
"loss": 0.4379476010799408,
"step": 1205,
"token_acc": 0.8531043384119547
},
{
"epoch": 3.86677367576244,
"grad_norm": 0.20822780148569695,
"learning_rate": 9.036990914273424e-07,
"loss": 0.4237467646598816,
"step": 1206,
"token_acc": 0.8593451623169955
},
{
"epoch": 3.869983948635634,
"grad_norm": 0.216570254338795,
"learning_rate": 8.988310224285286e-07,
"loss": 0.4561360776424408,
"step": 1207,
"token_acc": 0.8473712875328652
},
{
"epoch": 3.8731942215088284,
"grad_norm": 0.24795318793900514,
"learning_rate": 8.939741681978527e-07,
"loss": 0.4324544370174408,
"step": 1208,
"token_acc": 0.8556788135891982
},
{
"epoch": 3.8764044943820224,
"grad_norm": 0.21213726196244703,
"learning_rate": 8.891285496753224e-07,
"loss": 0.42626953125,
"step": 1209,
"token_acc": 0.8567778242243094
},
{
"epoch": 3.879614767255217,
"grad_norm": 0.18405467114149734,
"learning_rate": 8.842941877525016e-07,
"loss": 0.3857015073299408,
"step": 1210,
"token_acc": 0.8689668297330032
},
{
"epoch": 3.882825040128411,
"grad_norm": 0.192615100166161,
"learning_rate": 8.794711032724204e-07,
"loss": 0.4140218198299408,
"step": 1211,
"token_acc": 0.8614851736081778
},
{
"epoch": 3.886035313001605,
"grad_norm": 0.19011937935493017,
"learning_rate": 8.746593170294891e-07,
"loss": 0.4407958984375,
"step": 1212,
"token_acc": 0.8529518626081146
},
{
"epoch": 3.889245585874799,
"grad_norm": 0.20036411640046875,
"learning_rate": 8.69858849769408e-07,
"loss": 0.4180094599723816,
"step": 1213,
"token_acc": 0.8602316017291542
},
{
"epoch": 3.8924558587479936,
"grad_norm": 0.43419652542817555,
"learning_rate": 8.650697221890728e-07,
"loss": 0.41845703125,
"step": 1214,
"token_acc": 0.8598270119786144
},
{
"epoch": 3.895666131621188,
"grad_norm": 0.18461668284934152,
"learning_rate": 8.602919549364914e-07,
"loss": 0.4590250849723816,
"step": 1215,
"token_acc": 0.8466133712323088
},
{
"epoch": 3.898876404494382,
"grad_norm": 0.19081795301141688,
"learning_rate": 8.55525568610691e-07,
"loss": 0.42529296875,
"step": 1216,
"token_acc": 0.8577144479993796
},
{
"epoch": 3.902086677367576,
"grad_norm": 0.19483729991308552,
"learning_rate": 8.507705837616316e-07,
"loss": 0.4462077021598816,
"step": 1217,
"token_acc": 0.8520786007770702
},
{
"epoch": 3.9052969502407704,
"grad_norm": 0.17692550192830483,
"learning_rate": 8.460270208901157e-07,
"loss": 0.4711100459098816,
"step": 1218,
"token_acc": 0.8430626746772136
},
{
"epoch": 3.908507223113965,
"grad_norm": 0.425992909344649,
"learning_rate": 8.412949004477013e-07,
"loss": 0.4374593198299408,
"step": 1219,
"token_acc": 0.8545313912557241
},
{
"epoch": 3.911717495987159,
"grad_norm": 0.18135473863521517,
"learning_rate": 8.36574242836613e-07,
"loss": 0.4267578125,
"step": 1220,
"token_acc": 0.857118550899144
},
{
"epoch": 3.914927768860353,
"grad_norm": 0.20475846877488454,
"learning_rate": 8.318650684096542e-07,
"loss": 0.3864339292049408,
"step": 1221,
"token_acc": 0.8697558260738323
},
{
"epoch": 3.918138041733547,
"grad_norm": 0.19525904130356977,
"learning_rate": 8.271673974701181e-07,
"loss": 0.3937174677848816,
"step": 1222,
"token_acc": 0.8689606579284498
},
{
"epoch": 3.9213483146067416,
"grad_norm": 0.18789724020524992,
"learning_rate": 8.224812502717055e-07,
"loss": 0.4049275815486908,
"step": 1223,
"token_acc": 0.8645857224509708
},
{
"epoch": 3.924558587479936,
"grad_norm": 0.1885665996962879,
"learning_rate": 8.178066470184274e-07,
"loss": 0.3941243588924408,
"step": 1224,
"token_acc": 0.8686516652726115
},
{
"epoch": 3.92776886035313,
"grad_norm": 0.1811780429895728,
"learning_rate": 8.13143607864528e-07,
"loss": 0.4338786005973816,
"step": 1225,
"token_acc": 0.8549170275370654
},
{
"epoch": 3.930979133226324,
"grad_norm": 0.19625335214307585,
"learning_rate": 8.084921529143908e-07,
"loss": 0.3824259638786316,
"step": 1226,
"token_acc": 0.8723617876476475
},
{
"epoch": 3.9341894060995184,
"grad_norm": 0.18965753363437576,
"learning_rate": 8.0385230222246e-07,
"loss": 0.4338786005973816,
"step": 1227,
"token_acc": 0.8552555443194048
},
{
"epoch": 3.937399678972713,
"grad_norm": 0.18550838689006327,
"learning_rate": 7.99224075793142e-07,
"loss": 0.4319661557674408,
"step": 1228,
"token_acc": 0.8545585791754989
},
{
"epoch": 3.940609951845907,
"grad_norm": 0.19022732749052057,
"learning_rate": 7.946074935807302e-07,
"loss": 0.4304606318473816,
"step": 1229,
"token_acc": 0.8549475072272658
},
{
"epoch": 3.943820224719101,
"grad_norm": 0.19115968352145013,
"learning_rate": 7.900025754893128e-07,
"loss": 0.4150797724723816,
"step": 1230,
"token_acc": 0.8610937896452516
},
{
"epoch": 3.947030497592295,
"grad_norm": 0.18621797333773915,
"learning_rate": 7.854093413726916e-07,
"loss": 0.4222005307674408,
"step": 1231,
"token_acc": 0.8602832744218125
},
{
"epoch": 3.9502407704654896,
"grad_norm": 0.17754732315506336,
"learning_rate": 7.808278110342917e-07,
"loss": 0.3959554135799408,
"step": 1232,
"token_acc": 0.8670500443540994
},
{
"epoch": 3.953451043338684,
"grad_norm": 0.19867554700988296,
"learning_rate": 7.76258004227076e-07,
"loss": 0.4458821713924408,
"step": 1233,
"token_acc": 0.8505125893374252
},
{
"epoch": 3.956661316211878,
"grad_norm": 0.21808630417891475,
"learning_rate": 7.716999406534674e-07,
"loss": 0.4442545771598816,
"step": 1234,
"token_acc": 0.8525282271934536
},
{
"epoch": 3.959871589085072,
"grad_norm": 0.1916423004733177,
"learning_rate": 7.671536399652543e-07,
"loss": 0.4322306513786316,
"step": 1235,
"token_acc": 0.8557067931446188
},
{
"epoch": 3.9630818619582664,
"grad_norm": 0.17649795985908107,
"learning_rate": 7.626191217635132e-07,
"loss": 0.4193522334098816,
"step": 1236,
"token_acc": 0.858990975699193
},
{
"epoch": 3.966292134831461,
"grad_norm": 0.1896890062451,
"learning_rate": 7.580964055985161e-07,
"loss": 0.4132080078125,
"step": 1237,
"token_acc": 0.8614007097683751
},
{
"epoch": 3.969502407704655,
"grad_norm": 0.20149925006058916,
"learning_rate": 7.535855109696586e-07,
"loss": 0.4415283203125,
"step": 1238,
"token_acc": 0.8525706179844579
},
{
"epoch": 3.972712680577849,
"grad_norm": 0.20316594264377988,
"learning_rate": 7.49086457325363e-07,
"loss": 0.4009602963924408,
"step": 1239,
"token_acc": 0.8664792315556717
},
{
"epoch": 3.975922953451043,
"grad_norm": 0.20270725393588832,
"learning_rate": 7.44599264063002e-07,
"loss": 0.4393310546875,
"step": 1240,
"token_acc": 0.8533326569378449
},
{
"epoch": 3.9791332263242376,
"grad_norm": 0.17646688963320326,
"learning_rate": 7.401239505288131e-07,
"loss": 0.4202474057674408,
"step": 1241,
"token_acc": 0.8593622787548933
},
{
"epoch": 3.982343499197432,
"grad_norm": 0.18633841996323106,
"learning_rate": 7.356605360178147e-07,
"loss": 0.4261067807674408,
"step": 1242,
"token_acc": 0.8577198930194864
},
{
"epoch": 3.985553772070626,
"grad_norm": 0.18117890155614608,
"learning_rate": 7.312090397737231e-07,
"loss": 0.4084879755973816,
"step": 1243,
"token_acc": 0.8637038960977068
},
{
"epoch": 3.98876404494382,
"grad_norm": 0.19441876582248407,
"learning_rate": 7.267694809888707e-07,
"loss": 0.4258219599723816,
"step": 1244,
"token_acc": 0.8575173994011962
},
{
"epoch": 3.9919743178170144,
"grad_norm": 0.21274611864180318,
"learning_rate": 7.223418788041214e-07,
"loss": 0.4119059443473816,
"step": 1245,
"token_acc": 0.8628398705941747
},
{
"epoch": 3.995184590690209,
"grad_norm": 0.20675182669422074,
"learning_rate": 7.179262523087899e-07,
"loss": 0.46435546875,
"step": 1246,
"token_acc": 0.8461486321611582
},
{
"epoch": 3.998394863563403,
"grad_norm": 0.18832618517471383,
"learning_rate": 7.135226205405573e-07,
"loss": 0.4163411557674408,
"step": 1247,
"token_acc": 0.8615401559705481
},
{
"epoch": 4.0,
"grad_norm": 0.27131761596680815,
"learning_rate": 7.091310024853904e-07,
"loss": 0.4319661557674408,
"step": 1248,
"token_acc": 0.8548751265570262
},
{
"epoch": 4.003210272873194,
"grad_norm": 0.211277310082995,
"learning_rate": 7.04751417077463e-07,
"loss": 0.3932088315486908,
"step": 1249,
"token_acc": 0.8676984176763316
},
{
"epoch": 4.006420545746389,
"grad_norm": 0.19452299161645747,
"learning_rate": 7.003838831990654e-07,
"loss": 0.3834025263786316,
"step": 1250,
"token_acc": 0.8716185002637179
},
{
"epoch": 4.009630818619582,
"grad_norm": 0.18002877066507167,
"learning_rate": 6.960284196805311e-07,
"loss": 0.4623616635799408,
"step": 1251,
"token_acc": 0.8435193660275556
},
{
"epoch": 4.012841091492777,
"grad_norm": 0.18782300122295928,
"learning_rate": 6.916850453001553e-07,
"loss": 0.4035237729549408,
"step": 1252,
"token_acc": 0.8641802933682129
},
{
"epoch": 4.016051364365971,
"grad_norm": 0.1864350065242437,
"learning_rate": 6.873537787841092e-07,
"loss": 0.4337565302848816,
"step": 1253,
"token_acc": 0.8550117239586191
},
{
"epoch": 4.019261637239166,
"grad_norm": 0.18476754648253124,
"learning_rate": 6.830346388063606e-07,
"loss": 0.44610595703125,
"step": 1254,
"token_acc": 0.8506658750185543
},
{
"epoch": 4.022471910112359,
"grad_norm": 0.18612140337980987,
"learning_rate": 6.787276439885962e-07,
"loss": 0.4018147885799408,
"step": 1255,
"token_acc": 0.8653541351935264
},
{
"epoch": 4.025682182985554,
"grad_norm": 0.2266902701351228,
"learning_rate": 6.744328129001411e-07,
"loss": 0.4254353940486908,
"step": 1256,
"token_acc": 0.8567928136612766
},
{
"epoch": 4.028892455858748,
"grad_norm": 0.19151032858380448,
"learning_rate": 6.701501640578749e-07,
"loss": 0.4202067255973816,
"step": 1257,
"token_acc": 0.8581763091032826
},
{
"epoch": 4.032102728731942,
"grad_norm": 0.19253982319972482,
"learning_rate": 6.65879715926155e-07,
"loss": 0.427001953125,
"step": 1258,
"token_acc": 0.8558299365735071
},
{
"epoch": 4.035313001605137,
"grad_norm": 0.20714121082879816,
"learning_rate": 6.616214869167364e-07,
"loss": 0.4225260615348816,
"step": 1259,
"token_acc": 0.857597815037414
},
{
"epoch": 4.03852327447833,
"grad_norm": 0.188984662915425,
"learning_rate": 6.573754953886914e-07,
"loss": 0.4147135615348816,
"step": 1260,
"token_acc": 0.861747953533917
},
{
"epoch": 4.041733547351525,
"grad_norm": 0.1809464007645627,
"learning_rate": 6.531417596483331e-07,
"loss": 0.4534098505973816,
"step": 1261,
"token_acc": 0.8476468626880285
},
{
"epoch": 4.044943820224719,
"grad_norm": 0.22937913994865552,
"learning_rate": 6.489202979491323e-07,
"loss": 0.4196370542049408,
"step": 1262,
"token_acc": 0.8600287993230477
},
{
"epoch": 4.048154093097914,
"grad_norm": 0.20729747118528538,
"learning_rate": 6.447111284916422e-07,
"loss": 0.3974609375,
"step": 1263,
"token_acc": 0.8688817547202342
},
{
"epoch": 4.051364365971107,
"grad_norm": 0.18473772975637504,
"learning_rate": 6.405142694234194e-07,
"loss": 0.376220703125,
"step": 1264,
"token_acc": 0.8740485875668322
},
{
"epoch": 4.054574638844302,
"grad_norm": 0.18264189060046002,
"learning_rate": 6.363297388389433e-07,
"loss": 0.4558512568473816,
"step": 1265,
"token_acc": 0.8466898374159838
},
{
"epoch": 4.057784911717496,
"grad_norm": 0.1736263742905703,
"learning_rate": 6.321575547795431e-07,
"loss": 0.4064534604549408,
"step": 1266,
"token_acc": 0.8641185236155824
},
{
"epoch": 4.06099518459069,
"grad_norm": 0.20439738580390066,
"learning_rate": 6.279977352333124e-07,
"loss": 0.385498046875,
"step": 1267,
"token_acc": 0.8698766189625405
},
{
"epoch": 4.064205457463885,
"grad_norm": 0.16700348386684835,
"learning_rate": 6.238502981350388e-07,
"loss": 0.4293619990348816,
"step": 1268,
"token_acc": 0.854447191015641
},
{
"epoch": 4.067415730337078,
"grad_norm": 0.19162768232809374,
"learning_rate": 6.197152613661231e-07,
"loss": 0.4134928584098816,
"step": 1269,
"token_acc": 0.8621758337761007
},
{
"epoch": 4.070626003210273,
"grad_norm": 0.17159317224373907,
"learning_rate": 6.155926427545048e-07,
"loss": 0.4090779721736908,
"step": 1270,
"token_acc": 0.8615923279862137
},
{
"epoch": 4.073836276083467,
"grad_norm": 0.18400403175015234,
"learning_rate": 6.114824600745797e-07,
"loss": 0.41064453125,
"step": 1271,
"token_acc": 0.8617532043587804
},
{
"epoch": 4.077046548956662,
"grad_norm": 0.19234784803990349,
"learning_rate": 6.07384731047129e-07,
"loss": 0.4007975459098816,
"step": 1272,
"token_acc": 0.8649410469740091
},
{
"epoch": 4.080256821829855,
"grad_norm": 0.18526287274208925,
"learning_rate": 6.032994733392405e-07,
"loss": 0.3879598081111908,
"step": 1273,
"token_acc": 0.869797958265236
},
{
"epoch": 4.08346709470305,
"grad_norm": 0.20014484746202704,
"learning_rate": 5.99226704564234e-07,
"loss": 0.4639485776424408,
"step": 1274,
"token_acc": 0.8447181048034629
},
{
"epoch": 4.086677367576244,
"grad_norm": 0.1863915286208777,
"learning_rate": 5.951664422815826e-07,
"loss": 0.409912109375,
"step": 1275,
"token_acc": 0.8622634658441072
},
{
"epoch": 4.089887640449438,
"grad_norm": 0.18608349962296986,
"learning_rate": 5.911187039968373e-07,
"loss": 0.4313151240348816,
"step": 1276,
"token_acc": 0.8556820057995549
},
{
"epoch": 4.093097913322633,
"grad_norm": 0.19141297490867865,
"learning_rate": 5.870835071615557e-07,
"loss": 0.388214111328125,
"step": 1277,
"token_acc": 0.869686242075975
},
{
"epoch": 4.096308186195826,
"grad_norm": 0.1899764844237124,
"learning_rate": 5.83060869173222e-07,
"loss": 0.4187418818473816,
"step": 1278,
"token_acc": 0.8578907762758216
},
{
"epoch": 4.099518459069021,
"grad_norm": 0.1850950058073246,
"learning_rate": 5.790508073751745e-07,
"loss": 0.4100341796875,
"step": 1279,
"token_acc": 0.8624161154309232
},
{
"epoch": 4.102728731942215,
"grad_norm": 0.2287332501505277,
"learning_rate": 5.750533390565272e-07,
"loss": 0.3760172724723816,
"step": 1280,
"token_acc": 0.8740483573664101
},
{
"epoch": 4.10593900481541,
"grad_norm": 0.19641243226746088,
"learning_rate": 5.710684814521035e-07,
"loss": 0.4447428584098816,
"step": 1281,
"token_acc": 0.849857485304379
},
{
"epoch": 4.109149277688603,
"grad_norm": 0.189976985915476,
"learning_rate": 5.670962517423525e-07,
"loss": 0.4637451171875,
"step": 1282,
"token_acc": 0.8427576539156075
},
{
"epoch": 4.112359550561798,
"grad_norm": 0.1906013835451214,
"learning_rate": 5.631366670532798e-07,
"loss": 0.3881022334098816,
"step": 1283,
"token_acc": 0.8686223344216135
},
{
"epoch": 4.115569823434992,
"grad_norm": 0.19301371581474472,
"learning_rate": 5.591897444563736e-07,
"loss": 0.4277750849723816,
"step": 1284,
"token_acc": 0.8561654968599927
},
{
"epoch": 4.118780096308186,
"grad_norm": 0.18155881845410735,
"learning_rate": 5.552555009685293e-07,
"loss": 0.4273885190486908,
"step": 1285,
"token_acc": 0.8550006421686092
},
{
"epoch": 4.121990369181381,
"grad_norm": 0.19461825352865922,
"learning_rate": 5.513339535519781e-07,
"loss": 0.37548828125,
"step": 1286,
"token_acc": 0.8735851601449866
},
{
"epoch": 4.125200642054574,
"grad_norm": 0.1850682483956793,
"learning_rate": 5.474251191142121e-07,
"loss": 0.3612467646598816,
"step": 1287,
"token_acc": 0.879464152848493
},
{
"epoch": 4.128410914927769,
"grad_norm": 0.18187017146954892,
"learning_rate": 5.435290145079132e-07,
"loss": 0.3720296323299408,
"step": 1288,
"token_acc": 0.8751402982064638
},
{
"epoch": 4.131621187800963,
"grad_norm": 0.18969987814168832,
"learning_rate": 5.396456565308787e-07,
"loss": 0.397216796875,
"step": 1289,
"token_acc": 0.8655026671998424
},
{
"epoch": 4.134831460674158,
"grad_norm": 0.19480607739692923,
"learning_rate": 5.35775061925949e-07,
"loss": 0.3852946162223816,
"step": 1290,
"token_acc": 0.8710818106339356
},
{
"epoch": 4.138041733547351,
"grad_norm": 0.19043422055610776,
"learning_rate": 5.319172473809382e-07,
"loss": 0.404541015625,
"step": 1291,
"token_acc": 0.8630114619051036
},
{
"epoch": 4.141252006420546,
"grad_norm": 0.18711730373563912,
"learning_rate": 5.280722295285595e-07,
"loss": 0.4005126953125,
"step": 1292,
"token_acc": 0.8656115364475748
},
{
"epoch": 4.14446227929374,
"grad_norm": 0.18535051535270514,
"learning_rate": 5.24240024946351e-07,
"loss": 0.4049886167049408,
"step": 1293,
"token_acc": 0.8628473494046731
},
{
"epoch": 4.147672552166934,
"grad_norm": 0.21974202967212095,
"learning_rate": 5.204206501566099e-07,
"loss": 0.42431640625,
"step": 1294,
"token_acc": 0.8570635265956126
},
{
"epoch": 4.150882825040128,
"grad_norm": 0.18477324438232298,
"learning_rate": 5.166141216263194e-07,
"loss": 0.4139811396598816,
"step": 1295,
"token_acc": 0.8617092496576585
},
{
"epoch": 4.154093097913322,
"grad_norm": 0.1648615941744021,
"learning_rate": 5.128204557670763e-07,
"loss": 0.4064534604549408,
"step": 1296,
"token_acc": 0.8613643514903288
},
{
"epoch": 4.157303370786517,
"grad_norm": 0.1951050217780076,
"learning_rate": 5.090396689350181e-07,
"loss": 0.4375813901424408,
"step": 1297,
"token_acc": 0.8542707776890804
},
{
"epoch": 4.160513643659711,
"grad_norm": 0.1986719886463106,
"learning_rate": 5.052717774307574e-07,
"loss": 0.4270426630973816,
"step": 1298,
"token_acc": 0.8555437054366243
},
{
"epoch": 4.163723916532906,
"grad_norm": 0.17380420268148986,
"learning_rate": 5.015167974993112e-07,
"loss": 0.337158203125,
"step": 1299,
"token_acc": 0.8873299549549549
},
{
"epoch": 4.166934189406099,
"grad_norm": 0.1902791869258638,
"learning_rate": 4.977747453300264e-07,
"loss": 0.39013671875,
"step": 1300,
"token_acc": 0.8680480170890513
},
{
"epoch": 4.170144462279294,
"grad_norm": 0.19729112488192352,
"learning_rate": 4.940456370565138e-07,
"loss": 0.3991902768611908,
"step": 1301,
"token_acc": 0.8665999383150703
},
{
"epoch": 4.173354735152488,
"grad_norm": 0.18428870178583523,
"learning_rate": 4.903294887565769e-07,
"loss": 0.4064534604549408,
"step": 1302,
"token_acc": 0.8632890489370607
},
{
"epoch": 4.176565008025682,
"grad_norm": 0.18397492412651675,
"learning_rate": 4.86626316452144e-07,
"loss": 0.4265543818473816,
"step": 1303,
"token_acc": 0.8573058678742547
},
{
"epoch": 4.179775280898877,
"grad_norm": 0.26311590341567126,
"learning_rate": 4.829361361091972e-07,
"loss": 0.3391927182674408,
"step": 1304,
"token_acc": 0.8873305688832924
},
{
"epoch": 4.18298555377207,
"grad_norm": 0.17443450600993915,
"learning_rate": 4.792589636377056e-07,
"loss": 0.4116618037223816,
"step": 1305,
"token_acc": 0.8612885944354927
},
{
"epoch": 4.186195826645265,
"grad_norm": 0.20438381798059105,
"learning_rate": 4.755948148915554e-07,
"loss": 0.398681640625,
"step": 1306,
"token_acc": 0.8670303756225186
},
{
"epoch": 4.189406099518459,
"grad_norm": 0.18502705799301722,
"learning_rate": 4.7194370566848097e-07,
"loss": 0.4068196713924408,
"step": 1307,
"token_acc": 0.8630395206028574
},
{
"epoch": 4.192616372391654,
"grad_norm": 0.18958605233060205,
"learning_rate": 4.683056517099986e-07,
"loss": 0.4548746943473816,
"step": 1308,
"token_acc": 0.8456803796560372
},
{
"epoch": 4.195826645264847,
"grad_norm": 0.21726464831762202,
"learning_rate": 4.6468066870133904e-07,
"loss": 0.3997802734375,
"step": 1309,
"token_acc": 0.8656721717315794
},
{
"epoch": 4.199036918138042,
"grad_norm": 0.19156875723094768,
"learning_rate": 4.610687722713753e-07,
"loss": 0.4374593198299408,
"step": 1310,
"token_acc": 0.8513498633256577
},
{
"epoch": 4.202247191011236,
"grad_norm": 0.18441342108706746,
"learning_rate": 4.574699779925604e-07,
"loss": 0.4659017026424408,
"step": 1311,
"token_acc": 0.8423244482044691
},
{
"epoch": 4.20545746388443,
"grad_norm": 0.19162538923812916,
"learning_rate": 4.538843013808577e-07,
"loss": 0.4342448115348816,
"step": 1312,
"token_acc": 0.8543519985629906
},
{
"epoch": 4.208667736757624,
"grad_norm": 0.16005559784512735,
"learning_rate": 4.503117578956767e-07,
"loss": 0.4018147885799408,
"step": 1313,
"token_acc": 0.8647396556289849
},
{
"epoch": 4.211878009630818,
"grad_norm": 0.18564687246334158,
"learning_rate": 4.467523629398009e-07,
"loss": 0.451904296875,
"step": 1314,
"token_acc": 0.8479268243087531
},
{
"epoch": 4.215088282504013,
"grad_norm": 0.19836548645783045,
"learning_rate": 4.432061318593257e-07,
"loss": 0.3826090693473816,
"step": 1315,
"token_acc": 0.8713426809813936
},
{
"epoch": 4.218298555377207,
"grad_norm": 0.17610950259048228,
"learning_rate": 4.3967307994359414e-07,
"loss": 0.4306640625,
"step": 1316,
"token_acc": 0.8540626922541564
},
{
"epoch": 4.221508828250402,
"grad_norm": 0.30184321987677176,
"learning_rate": 4.361532224251251e-07,
"loss": 0.3817138671875,
"step": 1317,
"token_acc": 0.8715654018295902
},
{
"epoch": 4.224719101123595,
"grad_norm": 0.1992537562440866,
"learning_rate": 4.3264657447955243e-07,
"loss": 0.40087890625,
"step": 1318,
"token_acc": 0.866381413593607
},
{
"epoch": 4.22792937399679,
"grad_norm": 0.1819451245164796,
"learning_rate": 4.2915315122555434e-07,
"loss": 0.4207763671875,
"step": 1319,
"token_acc": 0.8575846667951931
},
{
"epoch": 4.231139646869984,
"grad_norm": 0.19547967012422543,
"learning_rate": 4.256729677247972e-07,
"loss": 0.40771484375,
"step": 1320,
"token_acc": 0.8633353803433637
},
{
"epoch": 4.234349919743178,
"grad_norm": 0.199687286811994,
"learning_rate": 4.2220603898186126e-07,
"loss": 0.4063720703125,
"step": 1321,
"token_acc": 0.8648657558814221
},
{
"epoch": 4.237560192616373,
"grad_norm": 0.2227566496132186,
"learning_rate": 4.1875237994418113e-07,
"loss": 0.4142252802848816,
"step": 1322,
"token_acc": 0.8610649940844372
},
{
"epoch": 4.240770465489566,
"grad_norm": 0.217719196604295,
"learning_rate": 4.1531200550197745e-07,
"loss": 0.4201151728630066,
"step": 1323,
"token_acc": 0.8603017061548934
},
{
"epoch": 4.243980738362761,
"grad_norm": 0.19340551330810762,
"learning_rate": 4.118849304881995e-07,
"loss": 0.4350382685661316,
"step": 1324,
"token_acc": 0.8524961221250393
},
{
"epoch": 4.247191011235955,
"grad_norm": 0.18788995064724226,
"learning_rate": 4.084711696784538e-07,
"loss": 0.4066569209098816,
"step": 1325,
"token_acc": 0.864263793463344
},
{
"epoch": 4.25040128410915,
"grad_norm": 0.17846806815698746,
"learning_rate": 4.0507073779094485e-07,
"loss": 0.369384765625,
"step": 1326,
"token_acc": 0.8754548796945046
},
{
"epoch": 4.253611556982343,
"grad_norm": 0.22897066984574038,
"learning_rate": 4.0168364948640966e-07,
"loss": 0.4083251953125,
"step": 1327,
"token_acc": 0.8632289416846652
},
{
"epoch": 4.256821829855538,
"grad_norm": 0.19357435650404023,
"learning_rate": 3.9830991936805577e-07,
"loss": 0.3915202021598816,
"step": 1328,
"token_acc": 0.8685719589159757
},
{
"epoch": 4.260032102728732,
"grad_norm": 0.20243731055282949,
"learning_rate": 3.949495619814973e-07,
"loss": 0.4164225459098816,
"step": 1329,
"token_acc": 0.8601415282856744
},
{
"epoch": 4.263242375601926,
"grad_norm": 0.17523671564463278,
"learning_rate": 3.916025918146934e-07,
"loss": 0.3895263671875,
"step": 1330,
"token_acc": 0.8680372892236324
},
{
"epoch": 4.26645264847512,
"grad_norm": 0.18796892300648876,
"learning_rate": 3.8826902329788484e-07,
"loss": 0.4241740107536316,
"step": 1331,
"token_acc": 0.8579220936572364
},
{
"epoch": 4.269662921348314,
"grad_norm": 0.2064109356932018,
"learning_rate": 3.8494887080353166e-07,
"loss": 0.4365031123161316,
"step": 1332,
"token_acc": 0.8546197429602375
},
{
"epoch": 4.272873194221509,
"grad_norm": 0.18203455865795215,
"learning_rate": 3.816421486462513e-07,
"loss": 0.3876953125,
"step": 1333,
"token_acc": 0.8693844281300428
},
{
"epoch": 4.276083467094703,
"grad_norm": 0.213574817895332,
"learning_rate": 3.783488710827593e-07,
"loss": 0.3750813901424408,
"step": 1334,
"token_acc": 0.873373327290794
},
{
"epoch": 4.279293739967898,
"grad_norm": 0.2140942980497027,
"learning_rate": 3.75069052311804e-07,
"loss": 0.4498087763786316,
"step": 1335,
"token_acc": 0.847457992476908
},
{
"epoch": 4.282504012841091,
"grad_norm": 0.19578800561907392,
"learning_rate": 3.718027064741062e-07,
"loss": 0.39044189453125,
"step": 1336,
"token_acc": 0.869082210694379
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.22104067865914637,
"learning_rate": 3.6854984765229984e-07,
"loss": 0.408935546875,
"step": 1337,
"token_acc": 0.8625050650133584
},
{
"epoch": 4.28892455858748,
"grad_norm": 0.2028154694754138,
"learning_rate": 3.6531048987087264e-07,
"loss": 0.4321696162223816,
"step": 1338,
"token_acc": 0.8558052507913844
},
{
"epoch": 4.292134831460674,
"grad_norm": 0.16476898952580152,
"learning_rate": 3.620846470961007e-07,
"loss": 0.4442138671875,
"step": 1339,
"token_acc": 0.8504904439728512
},
{
"epoch": 4.295345104333869,
"grad_norm": 0.20213718293160277,
"learning_rate": 3.5887233323599124e-07,
"loss": 0.3688761591911316,
"step": 1340,
"token_acc": 0.8757740050097411
},
{
"epoch": 4.298555377207062,
"grad_norm": 0.1890008889185189,
"learning_rate": 3.556735621402229e-07,
"loss": 0.4384969174861908,
"step": 1341,
"token_acc": 0.8527271306309512
},
{
"epoch": 4.301765650080257,
"grad_norm": 0.17718855075211482,
"learning_rate": 3.5248834760008757e-07,
"loss": 0.42041015625,
"step": 1342,
"token_acc": 0.8577206231880415
},
{
"epoch": 4.304975922953451,
"grad_norm": 0.21645292762759322,
"learning_rate": 3.493167033484262e-07,
"loss": 0.39642333984375,
"step": 1343,
"token_acc": 0.8662993750318829
},
{
"epoch": 4.308186195826646,
"grad_norm": 0.19869971515422113,
"learning_rate": 3.4615864305957355e-07,
"loss": 0.421630859375,
"step": 1344,
"token_acc": 0.8597636831455622
},
{
"epoch": 4.311396468699839,
"grad_norm": 0.6629400351496374,
"learning_rate": 3.430141803492977e-07,
"loss": 0.4062907099723816,
"step": 1345,
"token_acc": 0.8629788139687634
},
{
"epoch": 4.314606741573034,
"grad_norm": 0.17736584722766163,
"learning_rate": 3.398833287747417e-07,
"loss": 0.4195963740348816,
"step": 1346,
"token_acc": 0.859551744597776
},
{
"epoch": 4.317817014446228,
"grad_norm": 0.20646363440762722,
"learning_rate": 3.367661018343655e-07,
"loss": 0.389892578125,
"step": 1347,
"token_acc": 0.8695244463289735
},
{
"epoch": 4.321027287319422,
"grad_norm": 0.19356832119573253,
"learning_rate": 3.3366251296788696e-07,
"loss": 0.4179280698299408,
"step": 1348,
"token_acc": 0.859493141500395
},
{
"epoch": 4.324237560192616,
"grad_norm": 0.17699602920724225,
"learning_rate": 3.3057257555622425e-07,
"loss": 0.3675944209098816,
"step": 1349,
"token_acc": 0.8763813748365427
},
{
"epoch": 4.32744783306581,
"grad_norm": 0.1814342308311311,
"learning_rate": 3.274963029214385e-07,
"loss": 0.3459879755973816,
"step": 1350,
"token_acc": 0.8825021382326544
},
{
"epoch": 4.330658105939005,
"grad_norm": 0.2027288765185631,
"learning_rate": 3.2443370832667525e-07,
"loss": 0.3924560546875,
"step": 1351,
"token_acc": 0.8671193230773763
},
{
"epoch": 4.333868378812199,
"grad_norm": 0.19985007877208522,
"learning_rate": 3.2138480497611007e-07,
"loss": 0.3663737177848816,
"step": 1352,
"token_acc": 0.8762456313156786
},
{
"epoch": 4.337078651685394,
"grad_norm": 0.20362923197268062,
"learning_rate": 3.1834960601488696e-07,
"loss": 0.3803914487361908,
"step": 1353,
"token_acc": 0.871517642182498
},
{
"epoch": 4.340288924558587,
"grad_norm": 0.19443326332011304,
"learning_rate": 3.1532812452906623e-07,
"loss": 0.4407552182674408,
"step": 1354,
"token_acc": 0.853539740537423
},
{
"epoch": 4.343499197431782,
"grad_norm": 0.21761872939676885,
"learning_rate": 3.123203735455647e-07,
"loss": 0.4045817255973816,
"step": 1355,
"token_acc": 0.8657207443102927
},
{
"epoch": 4.346709470304976,
"grad_norm": 0.20338509759310694,
"learning_rate": 3.0932636603210396e-07,
"loss": 0.3558756709098816,
"step": 1356,
"token_acc": 0.8807724789605788
},
{
"epoch": 4.34991974317817,
"grad_norm": 0.1818871067962439,
"learning_rate": 3.0634611489714747e-07,
"loss": 0.4090169370174408,
"step": 1357,
"token_acc": 0.8621424332609016
},
{
"epoch": 4.353130016051364,
"grad_norm": 0.19834895457950263,
"learning_rate": 3.0337963298985143e-07,
"loss": 0.3870036005973816,
"step": 1358,
"token_acc": 0.8697542308742986
},
{
"epoch": 4.356340288924558,
"grad_norm": 0.19239094391841294,
"learning_rate": 3.0042693310000774e-07,
"loss": 0.4254557490348816,
"step": 1359,
"token_acc": 0.8584187137021286
},
{
"epoch": 4.359550561797753,
"grad_norm": 0.19741820606832636,
"learning_rate": 2.9748802795798573e-07,
"loss": 0.4197591245174408,
"step": 1360,
"token_acc": 0.8586211261004959
},
{
"epoch": 4.362760834670947,
"grad_norm": 0.17549385904831483,
"learning_rate": 2.9456293023468175e-07,
"loss": 0.3958333432674408,
"step": 1361,
"token_acc": 0.8666089202823778
},
{
"epoch": 4.365971107544142,
"grad_norm": 0.18418482505957448,
"learning_rate": 2.916516525414597e-07,
"loss": 0.3685709834098816,
"step": 1362,
"token_acc": 0.8757923270088352
},
{
"epoch": 4.369181380417335,
"grad_norm": 0.17301330514887706,
"learning_rate": 2.887542074301019e-07,
"loss": 0.4075927734375,
"step": 1363,
"token_acc": 0.8605788261905724
},
{
"epoch": 4.37239165329053,
"grad_norm": 0.19604758020776158,
"learning_rate": 2.8587060739275174e-07,
"loss": 0.408935546875,
"step": 1364,
"token_acc": 0.8637168977001795
},
{
"epoch": 4.375601926163724,
"grad_norm": 0.18596677995931085,
"learning_rate": 2.830008648618606e-07,
"loss": 0.384521484375,
"step": 1365,
"token_acc": 0.8701020161208599
},
{
"epoch": 4.378812199036918,
"grad_norm": 0.18061133688689154,
"learning_rate": 2.801449922101314e-07,
"loss": 0.3860066831111908,
"step": 1366,
"token_acc": 0.8702969224355789
},
{
"epoch": 4.382022471910112,
"grad_norm": 0.16828215003829966,
"learning_rate": 2.7730300175047263e-07,
"loss": 0.3342081904411316,
"step": 1367,
"token_acc": 0.8873373657943231
},
{
"epoch": 4.385232744783306,
"grad_norm": 0.35555500278869245,
"learning_rate": 2.744749057359378e-07,
"loss": 0.4285888671875,
"step": 1368,
"token_acc": 0.8555756341438024
},
{
"epoch": 4.388443017656501,
"grad_norm": 0.18514465832240257,
"learning_rate": 2.716607163596759e-07,
"loss": 0.387451171875,
"step": 1369,
"token_acc": 0.8703410803035158
},
{
"epoch": 4.391653290529695,
"grad_norm": 0.1937482638050718,
"learning_rate": 2.688604457548783e-07,
"loss": 0.4518229365348816,
"step": 1370,
"token_acc": 0.8489506475979381
},
{
"epoch": 4.39486356340289,
"grad_norm": 0.22216734997918833,
"learning_rate": 2.660741059947267e-07,
"loss": 0.4341634213924408,
"step": 1371,
"token_acc": 0.8556407722211241
},
{
"epoch": 4.398073836276083,
"grad_norm": 0.17854562511660027,
"learning_rate": 2.6330170909234055e-07,
"loss": 0.3616740107536316,
"step": 1372,
"token_acc": 0.878413306698194
},
{
"epoch": 4.401284109149278,
"grad_norm": 0.15704979914554196,
"learning_rate": 2.605432670007265e-07,
"loss": 0.360595703125,
"step": 1373,
"token_acc": 0.8790725725260599
},
{
"epoch": 4.404494382022472,
"grad_norm": 0.18433319937167036,
"learning_rate": 2.5779879161272474e-07,
"loss": 0.44140625,
"step": 1374,
"token_acc": 0.8535901582776324
},
{
"epoch": 4.407704654895666,
"grad_norm": 0.25076313067208356,
"learning_rate": 2.550682947609599e-07,
"loss": 0.4233601987361908,
"step": 1375,
"token_acc": 0.8588385113791485
},
{
"epoch": 4.41091492776886,
"grad_norm": 0.22379555574253152,
"learning_rate": 2.5235178821778793e-07,
"loss": 0.44354248046875,
"step": 1376,
"token_acc": 0.8516841003543864
},
{
"epoch": 4.414125200642054,
"grad_norm": 0.1936509284000272,
"learning_rate": 2.496492836952486e-07,
"loss": 0.4043172299861908,
"step": 1377,
"token_acc": 0.8650482422621322
},
{
"epoch": 4.417335473515249,
"grad_norm": 0.18093605782753522,
"learning_rate": 2.469607928450114e-07,
"loss": 0.3992106318473816,
"step": 1378,
"token_acc": 0.8653276993296248
},
{
"epoch": 4.420545746388443,
"grad_norm": 0.17557936684703238,
"learning_rate": 2.442863272583258e-07,
"loss": 0.4185791015625,
"step": 1379,
"token_acc": 0.8599879260834397
},
{
"epoch": 4.423756019261638,
"grad_norm": 0.1928260334885516,
"learning_rate": 2.4162589846597307e-07,
"loss": 0.4438883662223816,
"step": 1380,
"token_acc": 0.8516120924151115
},
{
"epoch": 4.426966292134831,
"grad_norm": 0.19447542899409975,
"learning_rate": 2.389795179382183e-07,
"loss": 0.4063720703125,
"step": 1381,
"token_acc": 0.8649005377133958
},
{
"epoch": 4.430176565008026,
"grad_norm": 0.1961732042045321,
"learning_rate": 2.3634719708475504e-07,
"loss": 0.4538167417049408,
"step": 1382,
"token_acc": 0.8484574145451315
},
{
"epoch": 4.43338683788122,
"grad_norm": 0.2123227934041795,
"learning_rate": 2.3372894725465985e-07,
"loss": 0.4500325620174408,
"step": 1383,
"token_acc": 0.8502840199626592
},
{
"epoch": 4.436597110754414,
"grad_norm": 0.18582801640426513,
"learning_rate": 2.3112477973634532e-07,
"loss": 0.3983357846736908,
"step": 1384,
"token_acc": 0.865070239292316
},
{
"epoch": 4.439807383627608,
"grad_norm": 0.19852931167320112,
"learning_rate": 2.2853470575750666e-07,
"loss": 0.4342448115348816,
"step": 1385,
"token_acc": 0.8552494168787108
},
{
"epoch": 4.443017656500802,
"grad_norm": 0.2054451058675366,
"learning_rate": 2.2595873648507686e-07,
"loss": 0.4345296323299408,
"step": 1386,
"token_acc": 0.854400060575443
},
{
"epoch": 4.446227929373997,
"grad_norm": 0.20872486334243198,
"learning_rate": 2.2339688302517752e-07,
"loss": 0.3849284052848816,
"step": 1387,
"token_acc": 0.8710310516021074
},
{
"epoch": 4.449438202247191,
"grad_norm": 0.1677447267888503,
"learning_rate": 2.208491564230704e-07,
"loss": 0.3639323115348816,
"step": 1388,
"token_acc": 0.8765660123119159
},
{
"epoch": 4.452648475120386,
"grad_norm": 0.20084403258571848,
"learning_rate": 2.1831556766310999e-07,
"loss": 0.4356689453125,
"step": 1389,
"token_acc": 0.8543227886379977
},
{
"epoch": 4.455858747993579,
"grad_norm": 0.19999448020787874,
"learning_rate": 2.1579612766869688e-07,
"loss": 0.3855387568473816,
"step": 1390,
"token_acc": 0.8703626579023295
},
{
"epoch": 4.459069020866774,
"grad_norm": 0.17428186503514684,
"learning_rate": 2.132908473022303e-07,
"loss": 0.3995158076286316,
"step": 1391,
"token_acc": 0.867400816167333
},
{
"epoch": 4.462279293739968,
"grad_norm": 0.19426057533907357,
"learning_rate": 2.1079973736506118e-07,
"loss": 0.3830973505973816,
"step": 1392,
"token_acc": 0.872240673210398
},
{
"epoch": 4.465489566613162,
"grad_norm": 0.18324896369450797,
"learning_rate": 2.0832280859744473e-07,
"loss": 0.4040934443473816,
"step": 1393,
"token_acc": 0.8639917176570526
},
{
"epoch": 4.468699839486356,
"grad_norm": 0.1958613880239227,
"learning_rate": 2.058600716784957e-07,
"loss": 0.4177653193473816,
"step": 1394,
"token_acc": 0.8590019981339825
},
{
"epoch": 4.47191011235955,
"grad_norm": 0.20201865980897998,
"learning_rate": 2.034115372261433e-07,
"loss": 0.4073486328125,
"step": 1395,
"token_acc": 0.8629729587765791
},
{
"epoch": 4.475120385232745,
"grad_norm": 0.219232387300634,
"learning_rate": 2.0097721579707965e-07,
"loss": 0.4217122495174408,
"step": 1396,
"token_acc": 0.8575014442075773
},
{
"epoch": 4.478330658105939,
"grad_norm": 0.2053024503342439,
"learning_rate": 1.985571178867216e-07,
"loss": 0.4247233271598816,
"step": 1397,
"token_acc": 0.8595181614995406
},
{
"epoch": 4.481540930979134,
"grad_norm": 0.197725254363412,
"learning_rate": 1.9615125392916088e-07,
"loss": 0.4213460385799408,
"step": 1398,
"token_acc": 0.8587392006416064
},
{
"epoch": 4.484751203852327,
"grad_norm": 0.18897462231141934,
"learning_rate": 1.9375963429712278e-07,
"loss": 0.4065144956111908,
"step": 1399,
"token_acc": 0.8645349504146544
},
{
"epoch": 4.487961476725522,
"grad_norm": 0.2470233814439173,
"learning_rate": 1.9138226930191543e-07,
"loss": 0.3607584834098816,
"step": 1400,
"token_acc": 0.8788034346501037
},
{
"epoch": 4.491171749598716,
"grad_norm": 0.21359818787913157,
"learning_rate": 1.8901916919339063e-07,
"loss": 0.4027913510799408,
"step": 1401,
"token_acc": 0.8638582562514848
},
{
"epoch": 4.49438202247191,
"grad_norm": 0.1851844159684862,
"learning_rate": 1.866703441598999e-07,
"loss": 0.4230143427848816,
"step": 1402,
"token_acc": 0.8582203402573039
},
{
"epoch": 4.497592295345104,
"grad_norm": 0.2157643414774233,
"learning_rate": 1.8433580432824604e-07,
"loss": 0.443603515625,
"step": 1403,
"token_acc": 0.8509721741439272
},
{
"epoch": 4.500802568218298,
"grad_norm": 0.18862363130606546,
"learning_rate": 1.8201555976364443e-07,
"loss": 0.4197998046875,
"step": 1404,
"token_acc": 0.8586452996070447
},
{
"epoch": 4.504012841091493,
"grad_norm": 0.21602876440113533,
"learning_rate": 1.7970962046967388e-07,
"loss": 0.4269612729549408,
"step": 1405,
"token_acc": 0.8580807203352152
},
{
"epoch": 4.507223113964687,
"grad_norm": 0.19575645391056692,
"learning_rate": 1.7741799638824157e-07,
"loss": 0.4242960810661316,
"step": 1406,
"token_acc": 0.8554766200266856
},
{
"epoch": 4.510433386837882,
"grad_norm": 0.18945812539306195,
"learning_rate": 1.7514069739953219e-07,
"loss": 0.373779296875,
"step": 1407,
"token_acc": 0.8747324228171732
},
{
"epoch": 4.513643659711075,
"grad_norm": 0.19167885218390776,
"learning_rate": 1.728777333219698e-07,
"loss": 0.447265625,
"step": 1408,
"token_acc": 0.8509208269492929
},
{
"epoch": 4.51685393258427,
"grad_norm": 0.18022233494326212,
"learning_rate": 1.7062911391217515e-07,
"loss": 0.4439290463924408,
"step": 1409,
"token_acc": 0.849044687255004
},
{
"epoch": 4.520064205457464,
"grad_norm": 0.23067829941613088,
"learning_rate": 1.6839484886492133e-07,
"loss": 0.3721517026424408,
"step": 1410,
"token_acc": 0.8760147029100293
},
{
"epoch": 4.523274478330658,
"grad_norm": 0.1756868996664974,
"learning_rate": 1.6617494781309534e-07,
"loss": 0.389892578125,
"step": 1411,
"token_acc": 0.8671128158562844
},
{
"epoch": 4.526484751203852,
"grad_norm": 0.20804654536137415,
"learning_rate": 1.6396942032765293e-07,
"loss": 0.4468587338924408,
"step": 1412,
"token_acc": 0.850228639546784
},
{
"epoch": 4.529695024077046,
"grad_norm": 0.21308168454138018,
"learning_rate": 1.617782759175807e-07,
"loss": 0.4160970151424408,
"step": 1413,
"token_acc": 0.8622860673783908
},
{
"epoch": 4.532905296950241,
"grad_norm": 0.22297079035303088,
"learning_rate": 1.5960152402985277e-07,
"loss": 0.3673502802848816,
"step": 1414,
"token_acc": 0.8770901721910811
},
{
"epoch": 4.536115569823435,
"grad_norm": 0.18659098268766408,
"learning_rate": 1.574391740493913e-07,
"loss": 0.4529622495174408,
"step": 1415,
"token_acc": 0.8480984391282939
},
{
"epoch": 4.539325842696629,
"grad_norm": 0.197242571565292,
"learning_rate": 1.5529123529902472e-07,
"loss": 0.4353841245174408,
"step": 1416,
"token_acc": 0.8548159161410306
},
{
"epoch": 4.542536115569823,
"grad_norm": 0.20715809917824884,
"learning_rate": 1.5315771703944953e-07,
"loss": 0.43798828125,
"step": 1417,
"token_acc": 0.8528016169399727
},
{
"epoch": 4.545746388443018,
"grad_norm": 0.18474227890442782,
"learning_rate": 1.5103862846918847e-07,
"loss": 0.4179891049861908,
"step": 1418,
"token_acc": 0.8599016224268302
},
{
"epoch": 4.548956661316212,
"grad_norm": 0.20946972284990092,
"learning_rate": 1.4893397872455183e-07,
"loss": 0.3983154296875,
"step": 1419,
"token_acc": 0.8677837933098281
},
{
"epoch": 4.552166934189406,
"grad_norm": 0.19986064963725622,
"learning_rate": 1.468437768795981e-07,
"loss": 0.4313151240348816,
"step": 1420,
"token_acc": 0.8547023739216093
},
{
"epoch": 4.5553772070626,
"grad_norm": 0.18486103454766173,
"learning_rate": 1.4476803194609477e-07,
"loss": 0.38690185546875,
"step": 1421,
"token_acc": 0.869865230167784
},
{
"epoch": 4.558587479935794,
"grad_norm": 0.19289911775635488,
"learning_rate": 1.4270675287347833e-07,
"loss": 0.435546875,
"step": 1422,
"token_acc": 0.8535203141238655
},
{
"epoch": 4.561797752808989,
"grad_norm": 0.18956544459287866,
"learning_rate": 1.4065994854881654e-07,
"loss": 0.4341227412223816,
"step": 1423,
"token_acc": 0.8545348884673308
},
{
"epoch": 4.565008025682183,
"grad_norm": 0.17367270405131094,
"learning_rate": 1.3862762779677262e-07,
"loss": 0.4180094599723816,
"step": 1424,
"token_acc": 0.8567365363850598
},
{
"epoch": 4.568218298555378,
"grad_norm": 0.20824227581500948,
"learning_rate": 1.3660979937956268e-07,
"loss": 0.4293619990348816,
"step": 1425,
"token_acc": 0.8565126394525634
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.19559983755189972,
"learning_rate": 1.3460647199691945e-07,
"loss": 0.4120280146598816,
"step": 1426,
"token_acc": 0.8629014499737051
},
{
"epoch": 4.574638844301766,
"grad_norm": 0.20180565953660315,
"learning_rate": 1.3261765428605766e-07,
"loss": 0.3741862177848816,
"step": 1427,
"token_acc": 0.8753082725636212
},
{
"epoch": 4.57784911717496,
"grad_norm": 0.20104038941301108,
"learning_rate": 1.3064335482163337e-07,
"loss": 0.4160970151424408,
"step": 1428,
"token_acc": 0.8594041300507503
},
{
"epoch": 4.581059390048154,
"grad_norm": 0.1965890577139458,
"learning_rate": 1.2868358211570812e-07,
"loss": 0.4365234375,
"step": 1429,
"token_acc": 0.8519271262041217
},
{
"epoch": 4.584269662921348,
"grad_norm": 0.19053080611967452,
"learning_rate": 1.267383446177121e-07,
"loss": 0.361114501953125,
"step": 1430,
"token_acc": 0.878369035008151
},
{
"epoch": 4.587479935794542,
"grad_norm": 0.6296605966778805,
"learning_rate": 1.248076507144087e-07,
"loss": 0.4423421323299408,
"step": 1431,
"token_acc": 0.8508115231639366
},
{
"epoch": 4.590690208667737,
"grad_norm": 0.17643731780551508,
"learning_rate": 1.2289150872985642e-07,
"loss": 0.3570760190486908,
"step": 1432,
"token_acc": 0.8799221630383898
},
{
"epoch": 4.593900481540931,
"grad_norm": 0.1920351337084536,
"learning_rate": 1.2098992692537563e-07,
"loss": 0.3970947265625,
"step": 1433,
"token_acc": 0.8656649371161228
},
{
"epoch": 4.597110754414125,
"grad_norm": 0.24155602493913697,
"learning_rate": 1.1910291349951024e-07,
"loss": 0.401123046875,
"step": 1434,
"token_acc": 0.8663016423800989
},
{
"epoch": 4.600321027287319,
"grad_norm": 0.23493455092849216,
"learning_rate": 1.1723047658799368e-07,
"loss": 0.3884684443473816,
"step": 1435,
"token_acc": 0.8696389403108131
},
{
"epoch": 4.603531300160514,
"grad_norm": 0.20660259029932984,
"learning_rate": 1.1537262426371425e-07,
"loss": 0.3963419795036316,
"step": 1436,
"token_acc": 0.8671111703086193
},
{
"epoch": 4.606741573033708,
"grad_norm": 0.1878147356962549,
"learning_rate": 1.1352936453667861e-07,
"loss": 0.4197591245174408,
"step": 1437,
"token_acc": 0.8595295977028828
},
{
"epoch": 4.609951845906902,
"grad_norm": 0.19082198521134558,
"learning_rate": 1.1170070535398108e-07,
"loss": 0.4320882260799408,
"step": 1438,
"token_acc": 0.8556060993004294
},
{
"epoch": 4.613162118780096,
"grad_norm": 0.18879001901055564,
"learning_rate": 1.098866545997636e-07,
"loss": 0.4278564453125,
"step": 1439,
"token_acc": 0.8558024296675192
},
{
"epoch": 4.61637239165329,
"grad_norm": 0.18946714623578712,
"learning_rate": 1.0808722009518584e-07,
"loss": 0.3866373896598816,
"step": 1440,
"token_acc": 0.8710574365640688
},
{
"epoch": 4.619582664526485,
"grad_norm": 0.1939962029433925,
"learning_rate": 1.0630240959839137e-07,
"loss": 0.4243571162223816,
"step": 1441,
"token_acc": 0.8579994341180615
},
{
"epoch": 4.622792937399679,
"grad_norm": 0.21957470785072045,
"learning_rate": 1.0453223080447272e-07,
"loss": 0.4058024287223816,
"step": 1442,
"token_acc": 0.864173404097019
},
{
"epoch": 4.626003210272874,
"grad_norm": 0.19758763574672267,
"learning_rate": 1.0277669134543866e-07,
"loss": 0.3876953125,
"step": 1443,
"token_acc": 0.869858307509244
},
{
"epoch": 4.629213483146067,
"grad_norm": 0.20067056305259742,
"learning_rate": 1.0103579879018088e-07,
"loss": 0.3555908203125,
"step": 1444,
"token_acc": 0.8820659184604286
},
{
"epoch": 4.632423756019262,
"grad_norm": 0.23677955722305155,
"learning_rate": 9.930956064444363e-08,
"loss": 0.4253336787223816,
"step": 1445,
"token_acc": 0.857207984790406
},
{
"epoch": 4.635634028892456,
"grad_norm": 0.20852218454016413,
"learning_rate": 9.759798435078798e-08,
"loss": 0.3988037109375,
"step": 1446,
"token_acc": 0.8677262375703081
},
{
"epoch": 4.63884430176565,
"grad_norm": 0.20420025529095903,
"learning_rate": 9.590107728856268e-08,
"loss": 0.4260661005973816,
"step": 1447,
"token_acc": 0.8579336664926283
},
{
"epoch": 4.642054574638844,
"grad_norm": 0.23411856679995743,
"learning_rate": 9.421884677386915e-08,
"loss": 0.4188639521598816,
"step": 1448,
"token_acc": 0.859142508633147
},
{
"epoch": 4.645264847512038,
"grad_norm": 0.18200197390792874,
"learning_rate": 9.255130005953398e-08,
"loss": 0.3849690854549408,
"step": 1449,
"token_acc": 0.872107766818619
},
{
"epoch": 4.648475120385233,
"grad_norm": 0.2916389218672418,
"learning_rate": 9.089844433507426e-08,
"loss": 0.4009196162223816,
"step": 1450,
"token_acc": 0.8648391516003826
},
{
"epoch": 4.651685393258427,
"grad_norm": 0.17308443268439605,
"learning_rate": 8.926028672666886e-08,
"loss": 0.383544921875,
"step": 1451,
"token_acc": 0.871333844537722
},
{
"epoch": 4.654895666131621,
"grad_norm": 0.17301234315676522,
"learning_rate": 8.763683429712498e-08,
"loss": 0.3865559995174408,
"step": 1452,
"token_acc": 0.8685990988114892
},
{
"epoch": 4.658105939004815,
"grad_norm": 0.280043466935383,
"learning_rate": 8.602809404585143e-08,
"loss": 0.4070638120174408,
"step": 1453,
"token_acc": 0.8629206897298851
},
{
"epoch": 4.66131621187801,
"grad_norm": 0.21422537153162374,
"learning_rate": 8.44340729088251e-08,
"loss": 0.4311930537223816,
"step": 1454,
"token_acc": 0.8536791894362028
},
{
"epoch": 4.664526484751204,
"grad_norm": 0.17201229972776902,
"learning_rate": 8.285477775856264e-08,
"loss": 0.3789469599723816,
"step": 1455,
"token_acc": 0.8721962958676278
},
{
"epoch": 4.667736757624398,
"grad_norm": 0.20234057546368325,
"learning_rate": 8.129021540409099e-08,
"loss": 0.43701171875,
"step": 1456,
"token_acc": 0.8541992043359913
},
{
"epoch": 4.670947030497592,
"grad_norm": 0.22317333643394066,
"learning_rate": 7.974039259091692e-08,
"loss": 0.3818766474723816,
"step": 1457,
"token_acc": 0.8724333764763185
},
{
"epoch": 4.674157303370786,
"grad_norm": 0.19956761588146485,
"learning_rate": 7.820531600099962e-08,
"loss": 0.4542236328125,
"step": 1458,
"token_acc": 0.8496136963053602
},
{
"epoch": 4.677367576243981,
"grad_norm": 0.2720616063396832,
"learning_rate": 7.668499225272025e-08,
"loss": 0.4219563901424408,
"step": 1459,
"token_acc": 0.8568100054002388
},
{
"epoch": 4.680577849117175,
"grad_norm": 0.1929782285557914,
"learning_rate": 7.517942790085363e-08,
"loss": 0.4552815854549408,
"step": 1460,
"token_acc": 0.8469568147492945
},
{
"epoch": 4.68378812199037,
"grad_norm": 0.20352188963079326,
"learning_rate": 7.368862943654147e-08,
"loss": 0.384765625,
"step": 1461,
"token_acc": 0.8714652683423914
},
{
"epoch": 4.686998394863563,
"grad_norm": 0.19969466438102046,
"learning_rate": 7.221260328726276e-08,
"loss": 0.4564616084098816,
"step": 1462,
"token_acc": 0.8487703460161007
},
{
"epoch": 4.690208667736758,
"grad_norm": 0.17978501403580038,
"learning_rate": 7.075135581680658e-08,
"loss": 0.4076334834098816,
"step": 1463,
"token_acc": 0.8603003744043567
},
{
"epoch": 4.693418940609952,
"grad_norm": 0.20366337402097498,
"learning_rate": 6.930489332524536e-08,
"loss": 0.4206136167049408,
"step": 1464,
"token_acc": 0.8593547259114949
},
{
"epoch": 4.696629213483146,
"grad_norm": 0.19419110557592836,
"learning_rate": 6.787322204890527e-08,
"loss": 0.4336954951286316,
"step": 1465,
"token_acc": 0.8534309255558069
},
{
"epoch": 4.69983948635634,
"grad_norm": 0.18294404853503618,
"learning_rate": 6.645634816034335e-08,
"loss": 0.4189046323299408,
"step": 1466,
"token_acc": 0.8585958978222711
},
{
"epoch": 4.703049759229534,
"grad_norm": 0.17335992283179122,
"learning_rate": 6.50542777683179e-08,
"loss": 0.4037272334098816,
"step": 1467,
"token_acc": 0.8626201549093779
},
{
"epoch": 4.706260032102729,
"grad_norm": 0.1795614119830208,
"learning_rate": 6.366701691776256e-08,
"loss": 0.4220377802848816,
"step": 1468,
"token_acc": 0.8588724081963655
},
{
"epoch": 4.709470304975923,
"grad_norm": 0.20472933364890156,
"learning_rate": 6.229457158976014e-08,
"loss": 0.4786784052848816,
"step": 1469,
"token_acc": 0.8412095888868124
},
{
"epoch": 4.712680577849117,
"grad_norm": 0.18039371374750934,
"learning_rate": 6.09369477015187e-08,
"loss": 0.3718668818473816,
"step": 1470,
"token_acc": 0.8749293821269497
},
{
"epoch": 4.715890850722311,
"grad_norm": 0.32958483208060935,
"learning_rate": 5.959415110634375e-08,
"loss": 0.3703206479549408,
"step": 1471,
"token_acc": 0.8735466357782282
},
{
"epoch": 4.719101123595506,
"grad_norm": 0.19713307989540768,
"learning_rate": 5.826618759361396e-08,
"loss": 0.4169515073299408,
"step": 1472,
"token_acc": 0.8602492479587451
},
{
"epoch": 4.7223113964687,
"grad_norm": 0.19385974784241175,
"learning_rate": 5.6953062888756566e-08,
"loss": 0.4407552182674408,
"step": 1473,
"token_acc": 0.8521287513030279
},
{
"epoch": 4.725521669341894,
"grad_norm": 0.19774134819733538,
"learning_rate": 5.565478265322138e-08,
"loss": 0.3862508237361908,
"step": 1474,
"token_acc": 0.8715619692168759
},
{
"epoch": 4.728731942215088,
"grad_norm": 0.19218443693933007,
"learning_rate": 5.4371352484458235e-08,
"loss": 0.3851521909236908,
"step": 1475,
"token_acc": 0.8701733830356677
},
{
"epoch": 4.731942215088282,
"grad_norm": 0.19666432540868903,
"learning_rate": 5.310277791589174e-08,
"loss": 0.4151204526424408,
"step": 1476,
"token_acc": 0.8608209992893119
},
{
"epoch": 4.735152487961477,
"grad_norm": 0.18416564772277194,
"learning_rate": 5.1849064416896796e-08,
"loss": 0.4468994140625,
"step": 1477,
"token_acc": 0.8500327093928715
},
{
"epoch": 4.738362760834671,
"grad_norm": 0.19335434520437636,
"learning_rate": 5.061021739277605e-08,
"loss": 0.4487711787223816,
"step": 1478,
"token_acc": 0.848170593264962
},
{
"epoch": 4.741573033707866,
"grad_norm": 0.20958484707121441,
"learning_rate": 4.9386242184737364e-08,
"loss": 0.4549560546875,
"step": 1479,
"token_acc": 0.8482572892840267
},
{
"epoch": 4.744783306581059,
"grad_norm": 0.23289234156230862,
"learning_rate": 4.817714406986856e-08,
"loss": 0.4112345576286316,
"step": 1480,
"token_acc": 0.8634959645826338
},
{
"epoch": 4.747993579454254,
"grad_norm": 0.18402731735735614,
"learning_rate": 4.698292826111644e-08,
"loss": 0.3708903193473816,
"step": 1481,
"token_acc": 0.8732122154116143
},
{
"epoch": 4.751203852327448,
"grad_norm": 0.2018874354019744,
"learning_rate": 4.580359990726307e-08,
"loss": 0.4150390625,
"step": 1482,
"token_acc": 0.8606020526688325
},
{
"epoch": 4.754414125200642,
"grad_norm": 0.1861877841024952,
"learning_rate": 4.4639164092905194e-08,
"loss": 0.412841796875,
"step": 1483,
"token_acc": 0.861394849559571
},
{
"epoch": 4.757624398073836,
"grad_norm": 0.20083510848897834,
"learning_rate": 4.3489625838430524e-08,
"loss": 0.4234822690486908,
"step": 1484,
"token_acc": 0.8593256868624384
},
{
"epoch": 4.76083467094703,
"grad_norm": 0.18492708213928652,
"learning_rate": 4.235499009999794e-08,
"loss": 0.3972371518611908,
"step": 1485,
"token_acc": 0.8659506220432518
},
{
"epoch": 4.764044943820225,
"grad_norm": 0.24624452749715423,
"learning_rate": 4.1235261769513364e-08,
"loss": 0.4518229365348816,
"step": 1486,
"token_acc": 0.848350926099052
},
{
"epoch": 4.767255216693419,
"grad_norm": 0.20175500356945403,
"learning_rate": 4.0130445674612326e-08,
"loss": 0.4331461787223816,
"step": 1487,
"token_acc": 0.8546658464127549
},
{
"epoch": 4.770465489566613,
"grad_norm": 0.17764076834579673,
"learning_rate": 3.9040546578635814e-08,
"loss": 0.411865234375,
"step": 1488,
"token_acc": 0.8623808613308531
},
{
"epoch": 4.773675762439807,
"grad_norm": 0.1781652695765274,
"learning_rate": 3.796556918061245e-08,
"loss": 0.408203125,
"step": 1489,
"token_acc": 0.86270329295958
},
{
"epoch": 4.776886035313002,
"grad_norm": 0.28325044353823575,
"learning_rate": 3.69055181152359e-08,
"loss": 0.4093017578125,
"step": 1490,
"token_acc": 0.8637463671884823
},
{
"epoch": 4.780096308186196,
"grad_norm": 0.21238301054517855,
"learning_rate": 3.586039795284629e-08,
"loss": 0.3710530698299408,
"step": 1491,
"token_acc": 0.8759641921780346
},
{
"epoch": 4.78330658105939,
"grad_norm": 0.17753991981401762,
"learning_rate": 3.483021319940993e-08,
"loss": 0.3804118037223816,
"step": 1492,
"token_acc": 0.870852992790632
},
{
"epoch": 4.786516853932584,
"grad_norm": 0.17697812521163442,
"learning_rate": 3.381496829650032e-08,
"loss": 0.4010416865348816,
"step": 1493,
"token_acc": 0.8652478551294941
},
{
"epoch": 4.789727126805778,
"grad_norm": 0.17608131878830596,
"learning_rate": 3.28146676212791e-08,
"loss": 0.4404703974723816,
"step": 1494,
"token_acc": 0.8498514163405005
},
{
"epoch": 4.792937399678973,
"grad_norm": 0.20871930874623706,
"learning_rate": 3.182931548647622e-08,
"loss": 0.4266357421875,
"step": 1495,
"token_acc": 0.8558437590199414
},
{
"epoch": 4.796147672552167,
"grad_norm": 0.19722909141928582,
"learning_rate": 3.085891614037245e-08,
"loss": 0.4145914912223816,
"step": 1496,
"token_acc": 0.8616536435520983
},
{
"epoch": 4.799357945425362,
"grad_norm": 0.20580969760791265,
"learning_rate": 2.9903473766780376e-08,
"loss": 0.385986328125,
"step": 1497,
"token_acc": 0.8712499849602349
},
{
"epoch": 4.802568218298555,
"grad_norm": 0.17137359713107464,
"learning_rate": 2.896299248502687e-08,
"loss": 0.3893229365348816,
"step": 1498,
"token_acc": 0.8688513441804728
},
{
"epoch": 4.80577849117175,
"grad_norm": 0.18970792203275755,
"learning_rate": 2.8037476349934474e-08,
"loss": 0.4296875,
"step": 1499,
"token_acc": 0.8569213598383156
},
{
"epoch": 4.808988764044944,
"grad_norm": 0.19649667842047547,
"learning_rate": 2.7126929351804662e-08,
"loss": 0.4334309995174408,
"step": 1500,
"token_acc": 0.8548346521520377
},
{
"epoch": 4.8121990369181376,
"grad_norm": 0.17153631748781512,
"learning_rate": 2.6231355416401148e-08,
"loss": 0.4295857846736908,
"step": 1501,
"token_acc": 0.8533903553607919
},
{
"epoch": 4.815409309791332,
"grad_norm": 0.2067960920586377,
"learning_rate": 2.5350758404931617e-08,
"loss": 0.4170328974723816,
"step": 1502,
"token_acc": 0.8612176379149875
},
{
"epoch": 4.818619582664526,
"grad_norm": 0.1776631162311648,
"learning_rate": 2.4485142114032187e-08,
"loss": 0.3851725459098816,
"step": 1503,
"token_acc": 0.8703423965016557
},
{
"epoch": 4.821829855537721,
"grad_norm": 0.1793386927959324,
"learning_rate": 2.363451027574953e-08,
"loss": 0.3998616635799408,
"step": 1504,
"token_acc": 0.8650888541318452
},
{
"epoch": 4.825040128410915,
"grad_norm": 0.17069429441349365,
"learning_rate": 2.2798866557526888e-08,
"loss": 0.3791911005973816,
"step": 1505,
"token_acc": 0.8722738939382442
},
{
"epoch": 4.828250401284109,
"grad_norm": 0.17633776655076083,
"learning_rate": 2.197821456218696e-08,
"loss": 0.3572896420955658,
"step": 1506,
"token_acc": 0.8801751024701113
},
{
"epoch": 4.831460674157303,
"grad_norm": 0.22903235280353987,
"learning_rate": 2.117255782791716e-08,
"loss": 0.4180501401424408,
"step": 1507,
"token_acc": 0.8608980437218196
},
{
"epoch": 4.834670947030498,
"grad_norm": 0.17214558829297427,
"learning_rate": 2.0381899828252504e-08,
"loss": 0.4264323115348816,
"step": 1508,
"token_acc": 0.8573647249971219
},
{
"epoch": 4.837881219903692,
"grad_norm": 0.17332641221650316,
"learning_rate": 1.9606243972063175e-08,
"loss": 0.4503580927848816,
"step": 1509,
"token_acc": 0.8480119506732333
},
{
"epoch": 4.841091492776886,
"grad_norm": 0.20349974006143934,
"learning_rate": 1.8845593603537436e-08,
"loss": 0.4302571713924408,
"step": 1510,
"token_acc": 0.8569897104456586
},
{
"epoch": 4.84430176565008,
"grad_norm": 0.23142774813924935,
"learning_rate": 1.809995200217035e-08,
"loss": 0.4117431640625,
"step": 1511,
"token_acc": 0.8611640929603565
},
{
"epoch": 4.847512038523274,
"grad_norm": 0.2264181651137007,
"learning_rate": 1.7369322382744746e-08,
"loss": 0.4283447265625,
"step": 1512,
"token_acc": 0.8577302561930976
},
{
"epoch": 4.850722311396469,
"grad_norm": 0.1927470595102296,
"learning_rate": 1.6653707895323444e-08,
"loss": 0.4225260615348816,
"step": 1513,
"token_acc": 0.8582438450309673
},
{
"epoch": 4.853932584269663,
"grad_norm": 0.20210916477600854,
"learning_rate": 1.595311162523022e-08,
"loss": 0.3760783076286316,
"step": 1514,
"token_acc": 0.8735678850105595
},
{
"epoch": 4.857142857142857,
"grad_norm": 0.1975673822042727,
"learning_rate": 1.5267536593039698e-08,
"loss": 0.3906657099723816,
"step": 1515,
"token_acc": 0.870322058087935
},
{
"epoch": 4.860353130016051,
"grad_norm": 0.24991527614051293,
"learning_rate": 1.4596985754563363e-08,
"loss": 0.4143269956111908,
"step": 1516,
"token_acc": 0.862886973900068
},
{
"epoch": 4.863563402889246,
"grad_norm": 0.18917055000975244,
"learning_rate": 1.3941462000837124e-08,
"loss": 0.4033203125,
"step": 1517,
"token_acc": 0.8647954232777991
},
{
"epoch": 4.86677367576244,
"grad_norm": 0.18869019026368106,
"learning_rate": 1.3300968158107717e-08,
"loss": 0.4206136167049408,
"step": 1518,
"token_acc": 0.8589735573207866
},
{
"epoch": 4.8699839486356336,
"grad_norm": 0.1784602865105331,
"learning_rate": 1.2675506987822216e-08,
"loss": 0.4408366084098816,
"step": 1519,
"token_acc": 0.852177246209995
},
{
"epoch": 4.873194221508828,
"grad_norm": 0.18978019341405125,
"learning_rate": 1.206508118661559e-08,
"loss": 0.4346110224723816,
"step": 1520,
"token_acc": 0.8547129687397841
},
{
"epoch": 4.876404494382022,
"grad_norm": 0.18810412184264527,
"learning_rate": 1.1469693386297885e-08,
"loss": 0.398681640625,
"step": 1521,
"token_acc": 0.8660221011534742
},
{
"epoch": 4.879614767255217,
"grad_norm": 0.1950421054563736,
"learning_rate": 1.0889346153844515e-08,
"loss": 0.394287109375,
"step": 1522,
"token_acc": 0.8688735336654346
},
{
"epoch": 4.882825040128411,
"grad_norm": 0.18702049234084817,
"learning_rate": 1.0324041991383814e-08,
"loss": 0.406494140625,
"step": 1523,
"token_acc": 0.8623502341858371
},
{
"epoch": 4.886035313001605,
"grad_norm": 0.21819621396045558,
"learning_rate": 9.773783336188114e-09,
"loss": 0.40478515625,
"step": 1524,
"token_acc": 0.8655094446934141
},
{
"epoch": 4.889245585874799,
"grad_norm": 0.19955785487879174,
"learning_rate": 9.238572560660129e-09,
"loss": 0.4108479917049408,
"step": 1525,
"token_acc": 0.8620435119485541
},
{
"epoch": 4.892455858747994,
"grad_norm": 0.18647782080518002,
"learning_rate": 8.718411972326757e-09,
"loss": 0.4232991635799408,
"step": 1526,
"token_acc": 0.8586699978124964
},
{
"epoch": 4.895666131621188,
"grad_norm": 0.1800512728032429,
"learning_rate": 8.213303813825068e-09,
"loss": 0.3704020380973816,
"step": 1527,
"token_acc": 0.8743921676948102
},
{
"epoch": 4.898876404494382,
"grad_norm": 0.22287140390515817,
"learning_rate": 7.723250262896497e-09,
"loss": 0.3697102963924408,
"step": 1528,
"token_acc": 0.8767438855396187
},
{
"epoch": 4.902086677367576,
"grad_norm": 0.1748776167395626,
"learning_rate": 7.248253432374007e-09,
"loss": 0.4410807490348816,
"step": 1529,
"token_acc": 0.8521788539708903
},
{
"epoch": 4.90529695024077,
"grad_norm": 0.19350528949432078,
"learning_rate": 6.788315370174713e-09,
"loss": 0.4403076171875,
"step": 1530,
"token_acc": 0.8539682000022666
},
{
"epoch": 4.908507223113965,
"grad_norm": 0.1766161126773489,
"learning_rate": 6.343438059291717e-09,
"loss": 0.388671875,
"step": 1531,
"token_acc": 0.8677562657105857
},
{
"epoch": 4.911717495987159,
"grad_norm": 0.18770179884828989,
"learning_rate": 5.913623417784008e-09,
"loss": 0.3328043818473816,
"step": 1532,
"token_acc": 0.8878665562077332
},
{
"epoch": 4.914927768860353,
"grad_norm": 0.2087284223936544,
"learning_rate": 5.49887329876908e-09,
"loss": 0.445068359375,
"step": 1533,
"token_acc": 0.851431813180593
},
{
"epoch": 4.918138041733547,
"grad_norm": 0.29371025063288597,
"learning_rate": 5.0991894904143795e-09,
"loss": 0.3640950620174408,
"step": 1534,
"token_acc": 0.8780885164228094
},
{
"epoch": 4.921348314606742,
"grad_norm": 0.19461732027423637,
"learning_rate": 4.714573715930703e-09,
"loss": 0.4474284052848816,
"step": 1535,
"token_acc": 0.8507598477321591
},
{
"epoch": 4.924558587479936,
"grad_norm": 0.17865477768308327,
"learning_rate": 4.34502763356287e-09,
"loss": 0.4408772885799408,
"step": 1536,
"token_acc": 0.8527419721633549
},
{
"epoch": 4.9277688603531296,
"grad_norm": 0.18898947607595706,
"learning_rate": 3.990552836585059e-09,
"loss": 0.3997599482536316,
"step": 1537,
"token_acc": 0.8667962943740363
},
{
"epoch": 4.930979133226324,
"grad_norm": 0.18329780463853837,
"learning_rate": 3.651150853291485e-09,
"loss": 0.3968505859375,
"step": 1538,
"token_acc": 0.8657463388056985
},
{
"epoch": 4.934189406099518,
"grad_norm": 0.1884122370512164,
"learning_rate": 3.3268231469913423e-09,
"loss": 0.3992919921875,
"step": 1539,
"token_acc": 0.8665628475564735
},
{
"epoch": 4.937399678972713,
"grad_norm": 0.18874464868375143,
"learning_rate": 3.017571116002593e-09,
"loss": 0.3417561948299408,
"step": 1540,
"token_acc": 0.8858301255573328
},
{
"epoch": 4.940609951845907,
"grad_norm": 0.17451525373232735,
"learning_rate": 2.723396093644581e-09,
"loss": 0.357666015625,
"step": 1541,
"token_acc": 0.8792415830543884
},
{
"epoch": 4.943820224719101,
"grad_norm": 0.19102676574494215,
"learning_rate": 2.44429934823337e-09,
"loss": 0.4149983823299408,
"step": 1542,
"token_acc": 0.8621060995517145
},
{
"epoch": 4.947030497592295,
"grad_norm": 0.24015820065262727,
"learning_rate": 2.1802820830763012e-09,
"loss": 0.4070638120174408,
"step": 1543,
"token_acc": 0.8646235807125959
},
{
"epoch": 4.95024077046549,
"grad_norm": 0.1838007188654105,
"learning_rate": 1.9313454364661698e-09,
"loss": 0.4093424677848816,
"step": 1544,
"token_acc": 0.8628462854637304
},
{
"epoch": 4.953451043338684,
"grad_norm": 0.19924252505425089,
"learning_rate": 1.6974904816773328e-09,
"loss": 0.40667724609375,
"step": 1545,
"token_acc": 0.8624877265573008
},
{
"epoch": 4.956661316211878,
"grad_norm": 0.2092848808914314,
"learning_rate": 1.4787182269594967e-09,
"loss": 0.4183756709098816,
"step": 1546,
"token_acc": 0.8607182153873112
},
{
"epoch": 4.959871589085072,
"grad_norm": 0.19043936673199025,
"learning_rate": 1.275029615534995e-09,
"loss": 0.3769124448299408,
"step": 1547,
"token_acc": 0.8732620497102043
},
{
"epoch": 4.963081861958266,
"grad_norm": 0.19103057854328268,
"learning_rate": 1.0864255255941257e-09,
"loss": 0.39697265625,
"step": 1548,
"token_acc": 0.8661367649301114
},
{
"epoch": 4.966292134831461,
"grad_norm": 0.17564224410576695,
"learning_rate": 9.129067702901006e-10,
"loss": 0.386474609375,
"step": 1549,
"token_acc": 0.8696541885592229
},
{
"epoch": 4.969502407704655,
"grad_norm": 0.1817798033003913,
"learning_rate": 7.544740977382669e-10,
"loss": 0.4063313901424408,
"step": 1550,
"token_acc": 0.8644004938785294
},
{
"epoch": 4.972712680577849,
"grad_norm": 0.18250215981034876,
"learning_rate": 6.11128191010668e-10,
"loss": 0.3870442807674408,
"step": 1551,
"token_acc": 0.8708193163813337
},
{
"epoch": 4.975922953451043,
"grad_norm": 0.18730655534724885,
"learning_rate": 4.828696681333233e-10,
"loss": 0.3853759765625,
"step": 1552,
"token_acc": 0.8718526354510426
},
{
"epoch": 4.979133226324238,
"grad_norm": 0.1679151678172249,
"learning_rate": 3.696990820842849e-10,
"loss": 0.3643595576286316,
"step": 1553,
"token_acc": 0.8772456253978552
},
{
"epoch": 4.982343499197432,
"grad_norm": 0.20075363139655922,
"learning_rate": 2.716169207916952e-10,
"loss": 0.4112142026424408,
"step": 1554,
"token_acc": 0.8639921075248038
},
{
"epoch": 4.9855537720706256,
"grad_norm": 0.19697059102789963,
"learning_rate": 1.886236071295122e-10,
"loss": 0.3818359375,
"step": 1555,
"token_acc": 0.8715292497469914
},
{
"epoch": 4.98876404494382,
"grad_norm": 0.21720741450497374,
"learning_rate": 1.207194989186755e-10,
"loss": 0.377685546875,
"step": 1556,
"token_acc": 0.8723909723400645
},
{
"epoch": 4.991974317817014,
"grad_norm": 0.1837173297906277,
"learning_rate": 6.790488892283176e-11,
"loss": 0.3875732421875,
"step": 1557,
"token_acc": 0.8704183097200379
},
{
"epoch": 4.995184590690209,
"grad_norm": 0.21153622842687272,
"learning_rate": 3.01800048487233e-11,
"loss": 0.4175618588924408,
"step": 1558,
"token_acc": 0.8587473362060063
},
{
"epoch": 4.998394863563403,
"grad_norm": 0.19093467860237212,
"learning_rate": 7.545009344633868e-12,
"loss": 0.392578125,
"step": 1559,
"token_acc": 0.8685876987099844
},
{
"epoch": 5.0,
"grad_norm": 0.3700371401403032,
"learning_rate": 0.0,
"loss": 0.412109375,
"step": 1560,
"token_acc": 0.8622697000631958
}
],
"logging_steps": 1,
"max_steps": 1560,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5902748949715354e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}