TensorBoard
Safetensors
English
long_speech_qwen2audio
FastLongSpeech / LoRA_parameters /trainer_state.json
guoshoutao's picture
Upload folder using huggingface_hub
567160a verified
{
"best_metric": 0.8637903928756714,
"best_model_checkpoint": "CTCLLMs_backboneTrain/checkpoints/LongSpeech_CTC-Shrink_augment_data_self_tokenizer_addMLS_projector_restore_2epoch_OpenASQA_LibriSQA_CommonVoice_random/checkpoint-44000",
"epoch": 2.0,
"eval_steps": 2000,
"global_step": 48676,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008217602103706139,
"grad_norm": 2.583192825317383,
"learning_rate": 2.737850787132101e-06,
"loss": 1.7034,
"step": 20
},
{
"epoch": 0.0016435204207412278,
"grad_norm": 1.2284363508224487,
"learning_rate": 5.475701574264202e-06,
"loss": 1.3595,
"step": 40
},
{
"epoch": 0.0024652806311118414,
"grad_norm": 0.7835968732833862,
"learning_rate": 8.213552361396305e-06,
"loss": 1.0712,
"step": 60
},
{
"epoch": 0.0032870408414824555,
"grad_norm": 0.7891668081283569,
"learning_rate": 1.0951403148528404e-05,
"loss": 0.9601,
"step": 80
},
{
"epoch": 0.004108801051853069,
"grad_norm": 0.7853180170059204,
"learning_rate": 1.3689253935660506e-05,
"loss": 0.9205,
"step": 100
},
{
"epoch": 0.004930561262223683,
"grad_norm": 0.8246210217475891,
"learning_rate": 1.642710472279261e-05,
"loss": 0.8811,
"step": 120
},
{
"epoch": 0.005752321472594297,
"grad_norm": 0.8358054757118225,
"learning_rate": 1.916495550992471e-05,
"loss": 0.8942,
"step": 140
},
{
"epoch": 0.006574081682964911,
"grad_norm": 0.8320625424385071,
"learning_rate": 2.190280629705681e-05,
"loss": 0.8571,
"step": 160
},
{
"epoch": 0.007395841893335525,
"grad_norm": 0.915210485458374,
"learning_rate": 2.464065708418891e-05,
"loss": 0.8753,
"step": 180
},
{
"epoch": 0.008217602103706138,
"grad_norm": 0.819446861743927,
"learning_rate": 2.7378507871321012e-05,
"loss": 0.8164,
"step": 200
},
{
"epoch": 0.009039362314076753,
"grad_norm": 0.8161728978157043,
"learning_rate": 3.0116358658453113e-05,
"loss": 0.8124,
"step": 220
},
{
"epoch": 0.009861122524447366,
"grad_norm": 0.8150123953819275,
"learning_rate": 3.285420944558522e-05,
"loss": 0.8046,
"step": 240
},
{
"epoch": 0.01068288273481798,
"grad_norm": 0.7666240334510803,
"learning_rate": 3.559206023271732e-05,
"loss": 0.804,
"step": 260
},
{
"epoch": 0.011504642945188595,
"grad_norm": 0.7685117721557617,
"learning_rate": 3.832991101984942e-05,
"loss": 0.7959,
"step": 280
},
{
"epoch": 0.012326403155559208,
"grad_norm": 0.7646723985671997,
"learning_rate": 4.1067761806981516e-05,
"loss": 0.7919,
"step": 300
},
{
"epoch": 0.013148163365929822,
"grad_norm": 0.6962915062904358,
"learning_rate": 4.380561259411362e-05,
"loss": 0.8057,
"step": 320
},
{
"epoch": 0.013969923576300435,
"grad_norm": 0.7180787324905396,
"learning_rate": 4.654346338124572e-05,
"loss": 0.7921,
"step": 340
},
{
"epoch": 0.01479168378667105,
"grad_norm": 0.6863545179367065,
"learning_rate": 4.928131416837782e-05,
"loss": 0.7752,
"step": 360
},
{
"epoch": 0.015613443997041664,
"grad_norm": 0.7281647324562073,
"learning_rate": 5.201916495550992e-05,
"loss": 0.8258,
"step": 380
},
{
"epoch": 0.016435204207412277,
"grad_norm": 0.6535085439682007,
"learning_rate": 5.4757015742642024e-05,
"loss": 0.8181,
"step": 400
},
{
"epoch": 0.01725696441778289,
"grad_norm": 0.722362220287323,
"learning_rate": 5.7494866529774125e-05,
"loss": 0.7875,
"step": 420
},
{
"epoch": 0.018078724628153506,
"grad_norm": 0.6223776340484619,
"learning_rate": 6.023271731690623e-05,
"loss": 0.7995,
"step": 440
},
{
"epoch": 0.01890048483852412,
"grad_norm": 0.6497051119804382,
"learning_rate": 6.297056810403833e-05,
"loss": 0.8061,
"step": 460
},
{
"epoch": 0.01972224504889473,
"grad_norm": 0.6080052256584167,
"learning_rate": 6.570841889117044e-05,
"loss": 0.7808,
"step": 480
},
{
"epoch": 0.020544005259265346,
"grad_norm": 0.6562979221343994,
"learning_rate": 6.844626967830253e-05,
"loss": 0.8084,
"step": 500
},
{
"epoch": 0.02136576546963596,
"grad_norm": 0.5958985686302185,
"learning_rate": 7.118412046543464e-05,
"loss": 0.7635,
"step": 520
},
{
"epoch": 0.022187525680006575,
"grad_norm": 0.6067186594009399,
"learning_rate": 7.392197125256673e-05,
"loss": 0.8001,
"step": 540
},
{
"epoch": 0.02300928589037719,
"grad_norm": 0.5998467803001404,
"learning_rate": 7.665982203969884e-05,
"loss": 0.7907,
"step": 560
},
{
"epoch": 0.0238310461007478,
"grad_norm": 0.6743142604827881,
"learning_rate": 7.939767282683094e-05,
"loss": 0.7716,
"step": 580
},
{
"epoch": 0.024652806311118415,
"grad_norm": 0.5700744390487671,
"learning_rate": 8.213552361396303e-05,
"loss": 0.775,
"step": 600
},
{
"epoch": 0.02547456652148903,
"grad_norm": 0.5921510457992554,
"learning_rate": 8.487337440109514e-05,
"loss": 0.7874,
"step": 620
},
{
"epoch": 0.026296326731859644,
"grad_norm": 0.6034173965454102,
"learning_rate": 8.761122518822724e-05,
"loss": 0.7815,
"step": 640
},
{
"epoch": 0.02711808694223026,
"grad_norm": 0.5742852091789246,
"learning_rate": 9.034907597535934e-05,
"loss": 0.7881,
"step": 660
},
{
"epoch": 0.02793984715260087,
"grad_norm": 0.5726728439331055,
"learning_rate": 9.308692676249144e-05,
"loss": 0.7905,
"step": 680
},
{
"epoch": 0.028761607362971484,
"grad_norm": 0.5717937350273132,
"learning_rate": 9.582477754962355e-05,
"loss": 0.7674,
"step": 700
},
{
"epoch": 0.0295833675733421,
"grad_norm": 0.5768669247627258,
"learning_rate": 9.856262833675564e-05,
"loss": 0.7687,
"step": 720
},
{
"epoch": 0.030405127783712713,
"grad_norm": 0.577836275100708,
"learning_rate": 0.00010130047912388776,
"loss": 0.775,
"step": 740
},
{
"epoch": 0.031226887994083328,
"grad_norm": 0.5653334259986877,
"learning_rate": 0.00010403832991101984,
"loss": 0.7531,
"step": 760
},
{
"epoch": 0.03204864820445394,
"grad_norm": 0.6732868552207947,
"learning_rate": 0.00010677618069815197,
"loss": 0.7961,
"step": 780
},
{
"epoch": 0.03287040841482455,
"grad_norm": 0.5599177479743958,
"learning_rate": 0.00010951403148528405,
"loss": 0.7617,
"step": 800
},
{
"epoch": 0.03369216862519517,
"grad_norm": 0.6064387559890747,
"learning_rate": 0.00011225188227241617,
"loss": 0.7279,
"step": 820
},
{
"epoch": 0.03451392883556578,
"grad_norm": 0.5485804080963135,
"learning_rate": 0.00011498973305954825,
"loss": 0.7499,
"step": 840
},
{
"epoch": 0.0353356890459364,
"grad_norm": 0.5458228588104248,
"learning_rate": 0.00011772758384668037,
"loss": 0.7726,
"step": 860
},
{
"epoch": 0.03615744925630701,
"grad_norm": 0.5380986928939819,
"learning_rate": 0.00012046543463381245,
"loss": 0.8033,
"step": 880
},
{
"epoch": 0.036979209466677626,
"grad_norm": 0.6237996220588684,
"learning_rate": 0.00012320328542094456,
"loss": 0.7961,
"step": 900
},
{
"epoch": 0.03780096967704824,
"grad_norm": 0.561638593673706,
"learning_rate": 0.00012594113620807666,
"loss": 0.7799,
"step": 920
},
{
"epoch": 0.03862272988741885,
"grad_norm": 0.5861026644706726,
"learning_rate": 0.00012867898699520878,
"loss": 0.7801,
"step": 940
},
{
"epoch": 0.03944449009778946,
"grad_norm": 0.5767973065376282,
"learning_rate": 0.00013141683778234087,
"loss": 0.7743,
"step": 960
},
{
"epoch": 0.04026625030816008,
"grad_norm": 0.5575984120368958,
"learning_rate": 0.00013415468856947297,
"loss": 0.7781,
"step": 980
},
{
"epoch": 0.04108801051853069,
"grad_norm": 0.6257224678993225,
"learning_rate": 0.00013689253935660506,
"loss": 0.7942,
"step": 1000
},
{
"epoch": 0.041909770728901306,
"grad_norm": 0.5779247283935547,
"learning_rate": 0.00013963039014373718,
"loss": 0.7781,
"step": 1020
},
{
"epoch": 0.04273153093927192,
"grad_norm": 0.580823540687561,
"learning_rate": 0.00014236824093086928,
"loss": 0.7821,
"step": 1040
},
{
"epoch": 0.043553291149642535,
"grad_norm": 0.6481038331985474,
"learning_rate": 0.00014510609171800137,
"loss": 0.7743,
"step": 1060
},
{
"epoch": 0.04437505136001315,
"grad_norm": 0.5689835548400879,
"learning_rate": 0.00014784394250513347,
"loss": 0.7839,
"step": 1080
},
{
"epoch": 0.045196811570383764,
"grad_norm": 0.6186492443084717,
"learning_rate": 0.0001505817932922656,
"loss": 0.8148,
"step": 1100
},
{
"epoch": 0.04601857178075438,
"grad_norm": 0.6207014322280884,
"learning_rate": 0.00015331964407939769,
"loss": 0.7854,
"step": 1120
},
{
"epoch": 0.04684033199112499,
"grad_norm": 0.649277925491333,
"learning_rate": 0.00015605749486652978,
"loss": 0.792,
"step": 1140
},
{
"epoch": 0.0476620922014956,
"grad_norm": 0.6359161734580994,
"learning_rate": 0.00015879534565366188,
"loss": 0.7871,
"step": 1160
},
{
"epoch": 0.048483852411866216,
"grad_norm": 0.6189965009689331,
"learning_rate": 0.000161533196440794,
"loss": 0.8014,
"step": 1180
},
{
"epoch": 0.04930561262223683,
"grad_norm": 0.6509179472923279,
"learning_rate": 0.00016427104722792606,
"loss": 0.7993,
"step": 1200
},
{
"epoch": 0.050127372832607445,
"grad_norm": 0.70870041847229,
"learning_rate": 0.0001670088980150582,
"loss": 0.7892,
"step": 1220
},
{
"epoch": 0.05094913304297806,
"grad_norm": 0.6406404376029968,
"learning_rate": 0.00016974674880219028,
"loss": 0.7823,
"step": 1240
},
{
"epoch": 0.051770893253348674,
"grad_norm": 0.65333092212677,
"learning_rate": 0.0001724845995893224,
"loss": 0.788,
"step": 1260
},
{
"epoch": 0.05259265346371929,
"grad_norm": 0.6580297946929932,
"learning_rate": 0.00017522245037645447,
"loss": 0.7818,
"step": 1280
},
{
"epoch": 0.0534144136740899,
"grad_norm": 0.6168191432952881,
"learning_rate": 0.0001779603011635866,
"loss": 0.8192,
"step": 1300
},
{
"epoch": 0.05423617388446052,
"grad_norm": 0.6828853487968445,
"learning_rate": 0.0001806981519507187,
"loss": 0.7965,
"step": 1320
},
{
"epoch": 0.055057934094831125,
"grad_norm": 0.6340402960777283,
"learning_rate": 0.0001834360027378508,
"loss": 0.7994,
"step": 1340
},
{
"epoch": 0.05587969430520174,
"grad_norm": 0.6091774106025696,
"learning_rate": 0.00018617385352498288,
"loss": 0.8142,
"step": 1360
},
{
"epoch": 0.056701454515572354,
"grad_norm": 0.6250841617584229,
"learning_rate": 0.000188911704312115,
"loss": 0.7998,
"step": 1380
},
{
"epoch": 0.05752321472594297,
"grad_norm": 0.6069123148918152,
"learning_rate": 0.0001916495550992471,
"loss": 0.8138,
"step": 1400
},
{
"epoch": 0.05834497493631358,
"grad_norm": 0.6658041477203369,
"learning_rate": 0.00019438740588637922,
"loss": 0.8082,
"step": 1420
},
{
"epoch": 0.0591667351466842,
"grad_norm": 0.6972244381904602,
"learning_rate": 0.00019712525667351128,
"loss": 0.8203,
"step": 1440
},
{
"epoch": 0.05998849535705481,
"grad_norm": 0.6969318985939026,
"learning_rate": 0.0001998631074606434,
"loss": 0.7889,
"step": 1460
},
{
"epoch": 0.06081025556742543,
"grad_norm": 0.6873449087142944,
"learning_rate": 0.00019999992008709735,
"loss": 0.8483,
"step": 1480
},
{
"epoch": 0.06163201577779604,
"grad_norm": 0.6407928466796875,
"learning_rate": 0.0001999996633033991,
"loss": 0.8578,
"step": 1500
},
{
"epoch": 0.062453775988166656,
"grad_norm": 0.6195780634880066,
"learning_rate": 0.0001999992294279946,
"loss": 0.8053,
"step": 1520
},
{
"epoch": 0.06327553619853726,
"grad_norm": 0.7080503702163696,
"learning_rate": 0.00019999861846165223,
"loss": 0.7953,
"step": 1540
},
{
"epoch": 0.06409729640890788,
"grad_norm": 0.8221389055252075,
"learning_rate": 0.0001999978304054539,
"loss": 0.831,
"step": 1560
},
{
"epoch": 0.06491905661927849,
"grad_norm": 0.7362856268882751,
"learning_rate": 0.00019999686526079525,
"loss": 0.8333,
"step": 1580
},
{
"epoch": 0.0657408168296491,
"grad_norm": 0.7318360805511475,
"learning_rate": 0.0001999957230293855,
"loss": 0.8352,
"step": 1600
},
{
"epoch": 0.06656257704001972,
"grad_norm": 0.7396681904792786,
"learning_rate": 0.0001999944037132474,
"loss": 0.8108,
"step": 1620
},
{
"epoch": 0.06738433725039034,
"grad_norm": 0.624099850654602,
"learning_rate": 0.00019999290731471738,
"loss": 0.8049,
"step": 1640
},
{
"epoch": 0.06820609746076095,
"grad_norm": 0.8170691728591919,
"learning_rate": 0.00019999123383644544,
"loss": 0.8198,
"step": 1660
},
{
"epoch": 0.06902785767113157,
"grad_norm": 0.7529473304748535,
"learning_rate": 0.00019998938328139517,
"loss": 0.8276,
"step": 1680
},
{
"epoch": 0.06984961788150218,
"grad_norm": 0.6491063833236694,
"learning_rate": 0.0001999873556528438,
"loss": 0.8363,
"step": 1700
},
{
"epoch": 0.0706713780918728,
"grad_norm": 0.6839701533317566,
"learning_rate": 0.00019998515095438207,
"loss": 0.8183,
"step": 1720
},
{
"epoch": 0.07149313830224341,
"grad_norm": 0.7682334184646606,
"learning_rate": 0.00019998276918991437,
"loss": 0.8124,
"step": 1740
},
{
"epoch": 0.07231489851261402,
"grad_norm": 0.7527047395706177,
"learning_rate": 0.00019998021036365856,
"loss": 0.8009,
"step": 1760
},
{
"epoch": 0.07313665872298464,
"grad_norm": 0.7900727391242981,
"learning_rate": 0.00019997747448014615,
"loss": 0.8252,
"step": 1780
},
{
"epoch": 0.07395841893335525,
"grad_norm": 0.799541175365448,
"learning_rate": 0.0001999745615442222,
"loss": 0.8285,
"step": 1800
},
{
"epoch": 0.07478017914372587,
"grad_norm": 0.7471742033958435,
"learning_rate": 0.00019997147156104527,
"loss": 0.8078,
"step": 1820
},
{
"epoch": 0.07560193935409648,
"grad_norm": 0.8705668449401855,
"learning_rate": 0.00019996820453608752,
"loss": 0.847,
"step": 1840
},
{
"epoch": 0.07642369956446708,
"grad_norm": 0.6803821921348572,
"learning_rate": 0.00019996476047513454,
"loss": 0.8152,
"step": 1860
},
{
"epoch": 0.0772454597748377,
"grad_norm": 0.7618655562400818,
"learning_rate": 0.00019996113938428555,
"loss": 0.8178,
"step": 1880
},
{
"epoch": 0.07806721998520831,
"grad_norm": 0.7413930296897888,
"learning_rate": 0.0001999573412699532,
"loss": 0.8538,
"step": 1900
},
{
"epoch": 0.07888898019557893,
"grad_norm": 0.7185872197151184,
"learning_rate": 0.0001999533661388637,
"loss": 0.8205,
"step": 1920
},
{
"epoch": 0.07971074040594954,
"grad_norm": 0.7739173769950867,
"learning_rate": 0.0001999492139980566,
"loss": 0.8305,
"step": 1940
},
{
"epoch": 0.08053250061632015,
"grad_norm": 0.7355234026908875,
"learning_rate": 0.0001999448848548851,
"loss": 0.8434,
"step": 1960
},
{
"epoch": 0.08135426082669077,
"grad_norm": 0.6984680891036987,
"learning_rate": 0.00019994037871701577,
"loss": 0.8307,
"step": 1980
},
{
"epoch": 0.08217602103706138,
"grad_norm": 0.8437952995300293,
"learning_rate": 0.00019993569559242864,
"loss": 0.8156,
"step": 2000
},
{
"epoch": 0.08217602103706138,
"eval_loss": 1.0192168951034546,
"eval_runtime": 16.6004,
"eval_samples_per_second": 157.827,
"eval_steps_per_second": 4.94,
"step": 2000
},
{
"epoch": 0.082997781247432,
"grad_norm": 0.7162949442863464,
"learning_rate": 0.00019993132946350455,
"loss": 0.8269,
"step": 2020
},
{
"epoch": 0.08381954145780261,
"grad_norm": 0.8335065841674805,
"learning_rate": 0.00019992631008726108,
"loss": 0.8296,
"step": 2040
},
{
"epoch": 0.08464130166817323,
"grad_norm": 0.7615776658058167,
"learning_rate": 0.00019992111374921422,
"loss": 0.8325,
"step": 2060
},
{
"epoch": 0.08546306187854384,
"grad_norm": 0.8376593589782715,
"learning_rate": 0.00019991574045856637,
"loss": 0.8299,
"step": 2080
},
{
"epoch": 0.08628482208891446,
"grad_norm": 0.7417710423469543,
"learning_rate": 0.00019991019022483312,
"loss": 0.8419,
"step": 2100
},
{
"epoch": 0.08710658229928507,
"grad_norm": 0.781035840511322,
"learning_rate": 0.00019990446305784358,
"loss": 0.8179,
"step": 2120
},
{
"epoch": 0.08792834250965569,
"grad_norm": 0.7956770062446594,
"learning_rate": 0.0001998985589677401,
"loss": 0.8365,
"step": 2140
},
{
"epoch": 0.0887501027200263,
"grad_norm": 0.8096093535423279,
"learning_rate": 0.00019989247796497838,
"loss": 0.8122,
"step": 2160
},
{
"epoch": 0.08957186293039691,
"grad_norm": 0.9604154825210571,
"learning_rate": 0.00019988622006032736,
"loss": 0.8284,
"step": 2180
},
{
"epoch": 0.09039362314076753,
"grad_norm": 0.7302993535995483,
"learning_rate": 0.00019988011120562424,
"loss": 0.8275,
"step": 2200
},
{
"epoch": 0.09121538335113814,
"grad_norm": 0.7435317635536194,
"learning_rate": 0.00019987350837444987,
"loss": 0.8482,
"step": 2220
},
{
"epoch": 0.09203714356150876,
"grad_norm": 0.7641948461532593,
"learning_rate": 0.00019986672867497988,
"loss": 0.8277,
"step": 2240
},
{
"epoch": 0.09285890377187937,
"grad_norm": 0.8367336392402649,
"learning_rate": 0.00019985977211922068,
"loss": 0.8148,
"step": 2260
},
{
"epoch": 0.09368066398224997,
"grad_norm": 0.8000660538673401,
"learning_rate": 0.0001998526387194917,
"loss": 0.8525,
"step": 2280
},
{
"epoch": 0.09450242419262059,
"grad_norm": 0.7250078320503235,
"learning_rate": 0.0001998453284884257,
"loss": 0.8394,
"step": 2300
},
{
"epoch": 0.0953241844029912,
"grad_norm": 0.8181660771369934,
"learning_rate": 0.00019983784143896854,
"loss": 0.8396,
"step": 2320
},
{
"epoch": 0.09614594461336182,
"grad_norm": 0.8068580031394958,
"learning_rate": 0.00019983017758437916,
"loss": 0.8431,
"step": 2340
},
{
"epoch": 0.09696770482373243,
"grad_norm": 0.8218814134597778,
"learning_rate": 0.0001998223369382297,
"loss": 0.8335,
"step": 2360
},
{
"epoch": 0.09778946503410305,
"grad_norm": 0.7909825444221497,
"learning_rate": 0.00019981431951440537,
"loss": 0.8337,
"step": 2380
},
{
"epoch": 0.09861122524447366,
"grad_norm": 0.7333732843399048,
"learning_rate": 0.00019980612532710434,
"loss": 0.8423,
"step": 2400
},
{
"epoch": 0.09943298545484427,
"grad_norm": 0.8515979647636414,
"learning_rate": 0.00019979775439083795,
"loss": 0.8397,
"step": 2420
},
{
"epoch": 0.10025474566521489,
"grad_norm": 0.7907799482345581,
"learning_rate": 0.0001997892067204304,
"loss": 0.8332,
"step": 2440
},
{
"epoch": 0.1010765058755855,
"grad_norm": 0.862369179725647,
"learning_rate": 0.00019978048233101903,
"loss": 0.8295,
"step": 2460
},
{
"epoch": 0.10189826608595612,
"grad_norm": 0.733180046081543,
"learning_rate": 0.00019977158123805403,
"loss": 0.8182,
"step": 2480
},
{
"epoch": 0.10272002629632673,
"grad_norm": 0.8313851952552795,
"learning_rate": 0.00019976250345729856,
"loss": 0.8501,
"step": 2500
},
{
"epoch": 0.10354178650669735,
"grad_norm": 0.7268729209899902,
"learning_rate": 0.0001997532490048287,
"loss": 0.8188,
"step": 2520
},
{
"epoch": 0.10436354671706796,
"grad_norm": 0.8857830166816711,
"learning_rate": 0.0001997438178970333,
"loss": 0.8125,
"step": 2540
},
{
"epoch": 0.10518530692743858,
"grad_norm": 0.8165369033813477,
"learning_rate": 0.0001997342101506142,
"loss": 0.8283,
"step": 2560
},
{
"epoch": 0.10600706713780919,
"grad_norm": 0.8085136413574219,
"learning_rate": 0.00019972442578258597,
"loss": 0.8499,
"step": 2580
},
{
"epoch": 0.1068288273481798,
"grad_norm": 0.914569079875946,
"learning_rate": 0.00019971446481027591,
"loss": 0.8235,
"step": 2600
},
{
"epoch": 0.10765058755855042,
"grad_norm": 0.8516772389411926,
"learning_rate": 0.0001997043272513242,
"loss": 0.8541,
"step": 2620
},
{
"epoch": 0.10847234776892103,
"grad_norm": 0.8908547163009644,
"learning_rate": 0.0001996940131236836,
"loss": 0.8696,
"step": 2640
},
{
"epoch": 0.10929410797929165,
"grad_norm": 0.7930579781532288,
"learning_rate": 0.00019968352244561976,
"loss": 0.8295,
"step": 2660
},
{
"epoch": 0.11011586818966225,
"grad_norm": 0.8197824358940125,
"learning_rate": 0.00019967285523571075,
"loss": 0.8251,
"step": 2680
},
{
"epoch": 0.11093762840003286,
"grad_norm": 0.8889797925949097,
"learning_rate": 0.00019966201151284745,
"loss": 0.8482,
"step": 2700
},
{
"epoch": 0.11175938861040348,
"grad_norm": 0.8669871091842651,
"learning_rate": 0.0001996509912962332,
"loss": 0.848,
"step": 2720
},
{
"epoch": 0.1125811488207741,
"grad_norm": 0.8625435829162598,
"learning_rate": 0.00019963979460538398,
"loss": 0.853,
"step": 2740
},
{
"epoch": 0.11340290903114471,
"grad_norm": 0.9078089594841003,
"learning_rate": 0.00019962842146012828,
"loss": 0.8474,
"step": 2760
},
{
"epoch": 0.11422466924151532,
"grad_norm": 0.8258838057518005,
"learning_rate": 0.00019961687188060708,
"loss": 0.8446,
"step": 2780
},
{
"epoch": 0.11504642945188594,
"grad_norm": 0.8453534841537476,
"learning_rate": 0.00019960573637644915,
"loss": 0.8416,
"step": 2800
},
{
"epoch": 0.11586818966225655,
"grad_norm": 0.8726224899291992,
"learning_rate": 0.00019959384280922383,
"loss": 0.8436,
"step": 2820
},
{
"epoch": 0.11668994987262717,
"grad_norm": 0.7921317219734192,
"learning_rate": 0.00019958177286896915,
"loss": 0.8331,
"step": 2840
},
{
"epoch": 0.11751171008299778,
"grad_norm": 1.0417413711547852,
"learning_rate": 0.00019956952657706,
"loss": 0.8612,
"step": 2860
},
{
"epoch": 0.1183334702933684,
"grad_norm": 0.8408219218254089,
"learning_rate": 0.00019955710395518363,
"loss": 0.8326,
"step": 2880
},
{
"epoch": 0.11915523050373901,
"grad_norm": 0.9015172123908997,
"learning_rate": 0.00019954450502533954,
"loss": 0.831,
"step": 2900
},
{
"epoch": 0.11997699071410962,
"grad_norm": 0.8820521831512451,
"learning_rate": 0.00019953172980983949,
"loss": 0.8383,
"step": 2920
},
{
"epoch": 0.12079875092448024,
"grad_norm": 0.899238646030426,
"learning_rate": 0.00019951877833130737,
"loss": 0.8387,
"step": 2940
},
{
"epoch": 0.12162051113485085,
"grad_norm": 0.827013373374939,
"learning_rate": 0.00019950565061267929,
"loss": 0.8421,
"step": 2960
},
{
"epoch": 0.12244227134522147,
"grad_norm": 0.8747543692588806,
"learning_rate": 0.00019949234667720336,
"loss": 0.819,
"step": 2980
},
{
"epoch": 0.12326403155559208,
"grad_norm": 0.7559501528739929,
"learning_rate": 0.00019947886654843991,
"loss": 0.8384,
"step": 3000
},
{
"epoch": 0.1240857917659627,
"grad_norm": 0.8533095121383667,
"learning_rate": 0.00019946521025026117,
"loss": 0.8291,
"step": 3020
},
{
"epoch": 0.12490755197633331,
"grad_norm": 0.9044725894927979,
"learning_rate": 0.00019945137780685138,
"loss": 0.8311,
"step": 3040
},
{
"epoch": 0.1257293121867039,
"grad_norm": 1.075642466545105,
"learning_rate": 0.00019943736924270679,
"loss": 0.8177,
"step": 3060
},
{
"epoch": 0.12655107239707453,
"grad_norm": 0.8421041369438171,
"learning_rate": 0.0001994231845826354,
"loss": 0.8205,
"step": 3080
},
{
"epoch": 0.12737283260744514,
"grad_norm": 0.9026604890823364,
"learning_rate": 0.0001994088238517572,
"loss": 0.8431,
"step": 3100
},
{
"epoch": 0.12819459281781576,
"grad_norm": 0.882366955280304,
"learning_rate": 0.00019939428707550395,
"loss": 0.836,
"step": 3120
},
{
"epoch": 0.12901635302818637,
"grad_norm": 0.8641519546508789,
"learning_rate": 0.00019937957427961918,
"loss": 0.8237,
"step": 3140
},
{
"epoch": 0.12983811323855698,
"grad_norm": 0.9230402708053589,
"learning_rate": 0.00019936468549015804,
"loss": 0.847,
"step": 3160
},
{
"epoch": 0.1306598734489276,
"grad_norm": 0.8539056181907654,
"learning_rate": 0.0001993496207334875,
"loss": 0.8347,
"step": 3180
},
{
"epoch": 0.1314816336592982,
"grad_norm": 1.0330878496170044,
"learning_rate": 0.00019933438003628604,
"loss": 0.8183,
"step": 3200
},
{
"epoch": 0.13230339386966883,
"grad_norm": 0.9674934148788452,
"learning_rate": 0.0001993189634255438,
"loss": 0.8457,
"step": 3220
},
{
"epoch": 0.13312515408003944,
"grad_norm": 0.7981789708137512,
"learning_rate": 0.00019930337092856243,
"loss": 0.8404,
"step": 3240
},
{
"epoch": 0.13394691429041006,
"grad_norm": 0.9389087557792664,
"learning_rate": 0.00019928760257295494,
"loss": 0.8212,
"step": 3260
},
{
"epoch": 0.13476867450078067,
"grad_norm": 0.907522439956665,
"learning_rate": 0.00019927165838664598,
"loss": 0.829,
"step": 3280
},
{
"epoch": 0.1355904347111513,
"grad_norm": 0.7471171021461487,
"learning_rate": 0.00019925553839787147,
"loss": 0.8199,
"step": 3300
},
{
"epoch": 0.1364121949215219,
"grad_norm": 0.8877785205841064,
"learning_rate": 0.00019923924263517856,
"loss": 0.82,
"step": 3320
},
{
"epoch": 0.13723395513189252,
"grad_norm": 0.9058935046195984,
"learning_rate": 0.00019922277112742592,
"loss": 0.8416,
"step": 3340
},
{
"epoch": 0.13805571534226313,
"grad_norm": 1.0315223932266235,
"learning_rate": 0.0001992061239037833,
"loss": 0.8356,
"step": 3360
},
{
"epoch": 0.13887747555263374,
"grad_norm": 0.9553551077842712,
"learning_rate": 0.00019918930099373157,
"loss": 0.8114,
"step": 3380
},
{
"epoch": 0.13969923576300436,
"grad_norm": 0.8403427600860596,
"learning_rate": 0.00019917230242706287,
"loss": 0.8311,
"step": 3400
},
{
"epoch": 0.14052099597337497,
"grad_norm": 0.8313325643539429,
"learning_rate": 0.00019915512823388034,
"loss": 0.8412,
"step": 3420
},
{
"epoch": 0.1413427561837456,
"grad_norm": 0.9792637228965759,
"learning_rate": 0.00019913777844459814,
"loss": 0.8405,
"step": 3440
},
{
"epoch": 0.1421645163941162,
"grad_norm": 0.8566033244132996,
"learning_rate": 0.00019912025308994148,
"loss": 0.8007,
"step": 3460
},
{
"epoch": 0.14298627660448682,
"grad_norm": 0.83521568775177,
"learning_rate": 0.00019910255220094634,
"loss": 0.8301,
"step": 3480
},
{
"epoch": 0.14380803681485743,
"grad_norm": 0.9577664136886597,
"learning_rate": 0.00019908467580895964,
"loss": 0.8332,
"step": 3500
},
{
"epoch": 0.14462979702522805,
"grad_norm": 0.8675875663757324,
"learning_rate": 0.00019906662394563913,
"loss": 0.8434,
"step": 3520
},
{
"epoch": 0.14545155723559866,
"grad_norm": 0.9066608548164368,
"learning_rate": 0.00019904839664295322,
"loss": 0.8273,
"step": 3540
},
{
"epoch": 0.14627331744596928,
"grad_norm": 0.8817648887634277,
"learning_rate": 0.00019902999393318113,
"loss": 0.8201,
"step": 3560
},
{
"epoch": 0.1470950776563399,
"grad_norm": 0.931098997592926,
"learning_rate": 0.00019901141584891262,
"loss": 0.8631,
"step": 3580
},
{
"epoch": 0.1479168378667105,
"grad_norm": 0.9475566744804382,
"learning_rate": 0.00019899266242304797,
"loss": 0.8335,
"step": 3600
},
{
"epoch": 0.14873859807708112,
"grad_norm": 0.9767733812332153,
"learning_rate": 0.00019897373368879816,
"loss": 0.8283,
"step": 3620
},
{
"epoch": 0.14956035828745173,
"grad_norm": 0.9582000374794006,
"learning_rate": 0.00019895462967968444,
"loss": 0.857,
"step": 3640
},
{
"epoch": 0.15038211849782235,
"grad_norm": 1.07439124584198,
"learning_rate": 0.0001989353504295386,
"loss": 0.8316,
"step": 3660
},
{
"epoch": 0.15120387870819296,
"grad_norm": 0.7958013415336609,
"learning_rate": 0.00019891589597250265,
"loss": 0.8285,
"step": 3680
},
{
"epoch": 0.15202563891856358,
"grad_norm": 0.9222463965415955,
"learning_rate": 0.000198896266343029,
"loss": 0.8262,
"step": 3700
},
{
"epoch": 0.15284739912893416,
"grad_norm": 0.8631669282913208,
"learning_rate": 0.00019887646157588015,
"loss": 0.8399,
"step": 3720
},
{
"epoch": 0.15366915933930478,
"grad_norm": 0.9066493511199951,
"learning_rate": 0.0001988564817061289,
"loss": 0.8441,
"step": 3740
},
{
"epoch": 0.1544909195496754,
"grad_norm": 0.8802810907363892,
"learning_rate": 0.000198836326769158,
"loss": 0.8382,
"step": 3760
},
{
"epoch": 0.155312679760046,
"grad_norm": 0.8617781400680542,
"learning_rate": 0.00019881599680066024,
"loss": 0.8322,
"step": 3780
},
{
"epoch": 0.15613443997041662,
"grad_norm": 0.9121951460838318,
"learning_rate": 0.00019879549183663854,
"loss": 0.8103,
"step": 3800
},
{
"epoch": 0.15695620018078724,
"grad_norm": 0.8364813923835754,
"learning_rate": 0.0001987748119134056,
"loss": 0.8258,
"step": 3820
},
{
"epoch": 0.15777796039115785,
"grad_norm": 0.971182644367218,
"learning_rate": 0.00019875395706758388,
"loss": 0.8195,
"step": 3840
},
{
"epoch": 0.15859972060152847,
"grad_norm": 0.9081626534461975,
"learning_rate": 0.00019873292733610577,
"loss": 0.7991,
"step": 3860
},
{
"epoch": 0.15942148081189908,
"grad_norm": 1.0428310632705688,
"learning_rate": 0.00019871278713727932,
"loss": 0.8337,
"step": 3880
},
{
"epoch": 0.1602432410222697,
"grad_norm": 0.8817542791366577,
"learning_rate": 0.0001986914164861707,
"loss": 0.8422,
"step": 3900
},
{
"epoch": 0.1610650012326403,
"grad_norm": 0.9848262667655945,
"learning_rate": 0.00019867095248177687,
"loss": 0.8604,
"step": 3920
},
{
"epoch": 0.16188676144301092,
"grad_norm": 0.8702915906906128,
"learning_rate": 0.00019864924105494623,
"loss": 0.824,
"step": 3940
},
{
"epoch": 0.16270852165338154,
"grad_norm": 0.8769912123680115,
"learning_rate": 0.00019862735492790314,
"loss": 0.8125,
"step": 3960
},
{
"epoch": 0.16353028186375215,
"grad_norm": 0.9141522645950317,
"learning_rate": 0.00019860529413940633,
"loss": 0.8426,
"step": 3980
},
{
"epoch": 0.16435204207412277,
"grad_norm": 0.9294391870498657,
"learning_rate": 0.00019858305872852373,
"loss": 0.8605,
"step": 4000
},
{
"epoch": 0.16435204207412277,
"eval_loss": 1.0459537506103516,
"eval_runtime": 16.5325,
"eval_samples_per_second": 158.476,
"eval_steps_per_second": 4.96,
"step": 4000
},
{
"epoch": 0.16517380228449338,
"grad_norm": 0.8110593557357788,
"learning_rate": 0.00019856064873463252,
"loss": 0.839,
"step": 4020
},
{
"epoch": 0.165995562494864,
"grad_norm": 0.8444618582725525,
"learning_rate": 0.00019853806419741908,
"loss": 0.8355,
"step": 4040
},
{
"epoch": 0.1668173227052346,
"grad_norm": 1.0283797979354858,
"learning_rate": 0.00019851530515687892,
"loss": 0.8491,
"step": 4060
},
{
"epoch": 0.16763908291560523,
"grad_norm": 1.0117839574813843,
"learning_rate": 0.00019849237165331656,
"loss": 0.8466,
"step": 4080
},
{
"epoch": 0.16846084312597584,
"grad_norm": 0.8671867847442627,
"learning_rate": 0.0001984692637273454,
"loss": 0.8684,
"step": 4100
},
{
"epoch": 0.16928260333634645,
"grad_norm": 1.0693740844726562,
"learning_rate": 0.00019844598141988782,
"loss": 0.8419,
"step": 4120
},
{
"epoch": 0.17010436354671707,
"grad_norm": 1.088922381401062,
"learning_rate": 0.00019842252477217503,
"loss": 0.8169,
"step": 4140
},
{
"epoch": 0.17092612375708768,
"grad_norm": 0.8914449214935303,
"learning_rate": 0.00019839889382574692,
"loss": 0.8305,
"step": 4160
},
{
"epoch": 0.1717478839674583,
"grad_norm": 1.0136765241622925,
"learning_rate": 0.00019837508862245208,
"loss": 0.8481,
"step": 4180
},
{
"epoch": 0.1725696441778289,
"grad_norm": 0.9876135587692261,
"learning_rate": 0.00019835110920444772,
"loss": 0.8384,
"step": 4200
},
{
"epoch": 0.17339140438819953,
"grad_norm": 0.9133214354515076,
"learning_rate": 0.0001983269556141995,
"loss": 0.8679,
"step": 4220
},
{
"epoch": 0.17421316459857014,
"grad_norm": 0.9155489802360535,
"learning_rate": 0.0001983026278944816,
"loss": 0.8546,
"step": 4240
},
{
"epoch": 0.17503492480894076,
"grad_norm": 1.0384551286697388,
"learning_rate": 0.0001982781260883765,
"loss": 0.8076,
"step": 4260
},
{
"epoch": 0.17585668501931137,
"grad_norm": 0.9135899543762207,
"learning_rate": 0.00019825345023927505,
"loss": 0.8113,
"step": 4280
},
{
"epoch": 0.17667844522968199,
"grad_norm": 0.8842368125915527,
"learning_rate": 0.00019822860039087628,
"loss": 0.8315,
"step": 4300
},
{
"epoch": 0.1775002054400526,
"grad_norm": 0.9855481386184692,
"learning_rate": 0.00019820357658718738,
"loss": 0.8051,
"step": 4320
},
{
"epoch": 0.17832196565042321,
"grad_norm": 0.9533560276031494,
"learning_rate": 0.0001981783788725235,
"loss": 0.8215,
"step": 4340
},
{
"epoch": 0.17914372586079383,
"grad_norm": 0.9931275248527527,
"learning_rate": 0.00019815300729150793,
"loss": 0.8463,
"step": 4360
},
{
"epoch": 0.17996548607116444,
"grad_norm": 0.9402872323989868,
"learning_rate": 0.00019812746188907173,
"loss": 0.8228,
"step": 4380
},
{
"epoch": 0.18078724628153506,
"grad_norm": 0.8862301111221313,
"learning_rate": 0.0001981017427104539,
"loss": 0.8377,
"step": 4400
},
{
"epoch": 0.18160900649190567,
"grad_norm": 1.069446086883545,
"learning_rate": 0.0001980771485720597,
"loss": 0.8357,
"step": 4420
},
{
"epoch": 0.1824307667022763,
"grad_norm": 0.9147841334342957,
"learning_rate": 0.0001980510906611715,
"loss": 0.8329,
"step": 4440
},
{
"epoch": 0.1832525269126469,
"grad_norm": 0.9529426097869873,
"learning_rate": 0.0001980248591093492,
"loss": 0.8439,
"step": 4460
},
{
"epoch": 0.18407428712301752,
"grad_norm": 0.9662571549415588,
"learning_rate": 0.00019799845396304688,
"loss": 0.8325,
"step": 4480
},
{
"epoch": 0.18489604733338813,
"grad_norm": 0.9648681879043579,
"learning_rate": 0.000197971875269026,
"loss": 0.8324,
"step": 4500
},
{
"epoch": 0.18571780754375875,
"grad_norm": 1.0208882093429565,
"learning_rate": 0.00019794646480399925,
"loss": 0.8536,
"step": 4520
},
{
"epoch": 0.18653956775412933,
"grad_norm": 1.0145078897476196,
"learning_rate": 0.00019791954782758873,
"loss": 0.8317,
"step": 4540
},
{
"epoch": 0.18736132796449995,
"grad_norm": 1.0626345872879028,
"learning_rate": 0.00019789245744319638,
"loss": 0.8443,
"step": 4560
},
{
"epoch": 0.18818308817487056,
"grad_norm": 1.0466902256011963,
"learning_rate": 0.00019786519369879716,
"loss": 0.823,
"step": 4580
},
{
"epoch": 0.18900484838524118,
"grad_norm": 0.9382312893867493,
"learning_rate": 0.00019783775664267302,
"loss": 0.8326,
"step": 4600
},
{
"epoch": 0.1898266085956118,
"grad_norm": 0.9509291648864746,
"learning_rate": 0.00019781014632341292,
"loss": 0.8415,
"step": 4620
},
{
"epoch": 0.1906483688059824,
"grad_norm": 1.018986701965332,
"learning_rate": 0.0001977823627899126,
"loss": 0.8415,
"step": 4640
},
{
"epoch": 0.19147012901635302,
"grad_norm": 1.0662530660629272,
"learning_rate": 0.0001977544060913746,
"loss": 0.8453,
"step": 4660
},
{
"epoch": 0.19229188922672363,
"grad_norm": 1.0088191032409668,
"learning_rate": 0.000197726276277308,
"loss": 0.8611,
"step": 4680
},
{
"epoch": 0.19311364943709425,
"grad_norm": 0.9862759709358215,
"learning_rate": 0.0001976979733975286,
"loss": 0.8222,
"step": 4700
},
{
"epoch": 0.19393540964746486,
"grad_norm": 0.982672929763794,
"learning_rate": 0.0001976694975021586,
"loss": 0.8109,
"step": 4720
},
{
"epoch": 0.19475716985783548,
"grad_norm": 0.8332194685935974,
"learning_rate": 0.0001976408486416266,
"loss": 0.826,
"step": 4740
},
{
"epoch": 0.1955789300682061,
"grad_norm": 1.1170557737350464,
"learning_rate": 0.00019761202686666756,
"loss": 0.8748,
"step": 4760
},
{
"epoch": 0.1964006902785767,
"grad_norm": 0.8429856300354004,
"learning_rate": 0.0001975830322283226,
"loss": 0.8304,
"step": 4780
},
{
"epoch": 0.19722245048894732,
"grad_norm": 1.002530813217163,
"learning_rate": 0.000197553864777939,
"loss": 0.8542,
"step": 4800
},
{
"epoch": 0.19804421069931794,
"grad_norm": 1.2244071960449219,
"learning_rate": 0.0001975245245671701,
"loss": 0.8368,
"step": 4820
},
{
"epoch": 0.19886597090968855,
"grad_norm": 0.9923454523086548,
"learning_rate": 0.0001974950116479751,
"loss": 0.8236,
"step": 4840
},
{
"epoch": 0.19968773112005916,
"grad_norm": 0.9827476143836975,
"learning_rate": 0.00019746532607261915,
"loss": 0.8356,
"step": 4860
},
{
"epoch": 0.20050949133042978,
"grad_norm": 0.9938998222351074,
"learning_rate": 0.0001974354678936731,
"loss": 0.8383,
"step": 4880
},
{
"epoch": 0.2013312515408004,
"grad_norm": 0.9436901807785034,
"learning_rate": 0.00019740543716401346,
"loss": 0.8464,
"step": 4900
},
{
"epoch": 0.202153011751171,
"grad_norm": 0.8767272233963013,
"learning_rate": 0.0001973752339368224,
"loss": 0.849,
"step": 4920
},
{
"epoch": 0.20297477196154162,
"grad_norm": 0.9653998613357544,
"learning_rate": 0.00019734485826558747,
"loss": 0.8356,
"step": 4940
},
{
"epoch": 0.20379653217191224,
"grad_norm": 0.8907719850540161,
"learning_rate": 0.00019731431020410167,
"loss": 0.8323,
"step": 4960
},
{
"epoch": 0.20461829238228285,
"grad_norm": 1.1022579669952393,
"learning_rate": 0.00019728358980646325,
"loss": 0.8437,
"step": 4980
},
{
"epoch": 0.20544005259265347,
"grad_norm": 0.9100618958473206,
"learning_rate": 0.00019725269712707566,
"loss": 0.8502,
"step": 5000
},
{
"epoch": 0.20626181280302408,
"grad_norm": 0.9730123281478882,
"learning_rate": 0.0001972216322206475,
"loss": 0.8245,
"step": 5020
},
{
"epoch": 0.2070835730133947,
"grad_norm": 0.9601908922195435,
"learning_rate": 0.00019719039514219224,
"loss": 0.8326,
"step": 5040
},
{
"epoch": 0.2079053332237653,
"grad_norm": 1.0868589878082275,
"learning_rate": 0.00019715898594702843,
"loss": 0.8378,
"step": 5060
},
{
"epoch": 0.20872709343413592,
"grad_norm": 0.924371600151062,
"learning_rate": 0.0001971274046907793,
"loss": 0.832,
"step": 5080
},
{
"epoch": 0.20954885364450654,
"grad_norm": 1.1059744358062744,
"learning_rate": 0.00019709565142937287,
"loss": 0.8093,
"step": 5100
},
{
"epoch": 0.21037061385487715,
"grad_norm": 1.0641423463821411,
"learning_rate": 0.00019706372621904164,
"loss": 0.8173,
"step": 5120
},
{
"epoch": 0.21119237406524777,
"grad_norm": 1.1420958042144775,
"learning_rate": 0.00019703162911632275,
"loss": 0.8424,
"step": 5140
},
{
"epoch": 0.21201413427561838,
"grad_norm": 0.9624399542808533,
"learning_rate": 0.00019699936017805768,
"loss": 0.8347,
"step": 5160
},
{
"epoch": 0.212835894485989,
"grad_norm": 0.9680808186531067,
"learning_rate": 0.00019696691946139225,
"loss": 0.8207,
"step": 5180
},
{
"epoch": 0.2136576546963596,
"grad_norm": 0.9132868647575378,
"learning_rate": 0.00019693430702377647,
"loss": 0.8398,
"step": 5200
},
{
"epoch": 0.21447941490673023,
"grad_norm": 0.9321950674057007,
"learning_rate": 0.00019690152292296446,
"loss": 0.85,
"step": 5220
},
{
"epoch": 0.21530117511710084,
"grad_norm": 0.9252221584320068,
"learning_rate": 0.00019686856721701435,
"loss": 0.8251,
"step": 5240
},
{
"epoch": 0.21612293532747145,
"grad_norm": 0.9873983860015869,
"learning_rate": 0.00019683543996428811,
"loss": 0.8092,
"step": 5260
},
{
"epoch": 0.21694469553784207,
"grad_norm": 1.0303717851638794,
"learning_rate": 0.0001968021412234516,
"loss": 0.8396,
"step": 5280
},
{
"epoch": 0.21776645574821268,
"grad_norm": 0.9478332996368408,
"learning_rate": 0.00019676867105347431,
"loss": 0.8194,
"step": 5300
},
{
"epoch": 0.2185882159585833,
"grad_norm": 1.01088547706604,
"learning_rate": 0.00019673502951362935,
"loss": 0.8207,
"step": 5320
},
{
"epoch": 0.2194099761689539,
"grad_norm": 0.9483580589294434,
"learning_rate": 0.00019670121666349327,
"loss": 0.8452,
"step": 5340
},
{
"epoch": 0.2202317363793245,
"grad_norm": 1.0161420106887817,
"learning_rate": 0.00019666723256294604,
"loss": 0.8377,
"step": 5360
},
{
"epoch": 0.22105349658969511,
"grad_norm": 1.0933947563171387,
"learning_rate": 0.00019663307727217085,
"loss": 0.847,
"step": 5380
},
{
"epoch": 0.22187525680006573,
"grad_norm": 1.0978140830993652,
"learning_rate": 0.0001965987508516542,
"loss": 0.8423,
"step": 5400
},
{
"epoch": 0.22269701701043634,
"grad_norm": 0.9424787163734436,
"learning_rate": 0.00019656425336218544,
"loss": 0.8106,
"step": 5420
},
{
"epoch": 0.22351877722080696,
"grad_norm": 0.9634792804718018,
"learning_rate": 0.00019652958486485696,
"loss": 0.8387,
"step": 5440
},
{
"epoch": 0.22434053743117757,
"grad_norm": 1.0137280225753784,
"learning_rate": 0.0001964947454210641,
"loss": 0.8375,
"step": 5460
},
{
"epoch": 0.2251622976415482,
"grad_norm": 1.0315325260162354,
"learning_rate": 0.00019645973509250467,
"loss": 0.851,
"step": 5480
},
{
"epoch": 0.2259840578519188,
"grad_norm": 0.978634238243103,
"learning_rate": 0.00019642455394117944,
"loss": 0.7957,
"step": 5500
},
{
"epoch": 0.22680581806228942,
"grad_norm": 0.9586151838302612,
"learning_rate": 0.00019638920202939142,
"loss": 0.8423,
"step": 5520
},
{
"epoch": 0.22762757827266003,
"grad_norm": 0.9884860515594482,
"learning_rate": 0.00019635367941974615,
"loss": 0.85,
"step": 5540
},
{
"epoch": 0.22844933848303065,
"grad_norm": 0.884602963924408,
"learning_rate": 0.00019631798617515144,
"loss": 0.8204,
"step": 5560
},
{
"epoch": 0.22927109869340126,
"grad_norm": 0.9110316038131714,
"learning_rate": 0.0001962821223588173,
"loss": 0.8184,
"step": 5580
},
{
"epoch": 0.23009285890377187,
"grad_norm": 0.9724137783050537,
"learning_rate": 0.00019624608803425574,
"loss": 0.8198,
"step": 5600
},
{
"epoch": 0.2309146191141425,
"grad_norm": 1.004752278327942,
"learning_rate": 0.00019620988326528077,
"loss": 0.8502,
"step": 5620
},
{
"epoch": 0.2317363793245131,
"grad_norm": 1.0108088254928589,
"learning_rate": 0.00019617350811600831,
"loss": 0.8367,
"step": 5640
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.0130361318588257,
"learning_rate": 0.00019613696265085591,
"loss": 0.8151,
"step": 5660
},
{
"epoch": 0.23337989974525433,
"grad_norm": 1.0931516885757446,
"learning_rate": 0.0001961002469345428,
"loss": 0.8427,
"step": 5680
},
{
"epoch": 0.23420165995562495,
"grad_norm": 0.9896870851516724,
"learning_rate": 0.00019606336103208968,
"loss": 0.8365,
"step": 5700
},
{
"epoch": 0.23502342016599556,
"grad_norm": 1.0382894277572632,
"learning_rate": 0.0001960263050088186,
"loss": 0.8229,
"step": 5720
},
{
"epoch": 0.23584518037636618,
"grad_norm": 0.9921779036521912,
"learning_rate": 0.00019598907893035299,
"loss": 0.8462,
"step": 5740
},
{
"epoch": 0.2366669405867368,
"grad_norm": 1.02907395362854,
"learning_rate": 0.00019595168286261732,
"loss": 0.8369,
"step": 5760
},
{
"epoch": 0.2374887007971074,
"grad_norm": 0.962459921836853,
"learning_rate": 0.00019591411687183715,
"loss": 0.8207,
"step": 5780
},
{
"epoch": 0.23831046100747802,
"grad_norm": 1.0783615112304688,
"learning_rate": 0.000195876381024539,
"loss": 0.8238,
"step": 5800
},
{
"epoch": 0.23913222121784863,
"grad_norm": 1.0806901454925537,
"learning_rate": 0.00019583847538755014,
"loss": 0.8596,
"step": 5820
},
{
"epoch": 0.23995398142821925,
"grad_norm": 1.0261567831039429,
"learning_rate": 0.00019580040002799848,
"loss": 0.835,
"step": 5840
},
{
"epoch": 0.24077574163858986,
"grad_norm": 1.0381710529327393,
"learning_rate": 0.0001957621550133126,
"loss": 0.8614,
"step": 5860
},
{
"epoch": 0.24159750184896048,
"grad_norm": 1.0195953845977783,
"learning_rate": 0.00019572374041122148,
"loss": 0.8411,
"step": 5880
},
{
"epoch": 0.2424192620593311,
"grad_norm": 0.9467645287513733,
"learning_rate": 0.0001956851562897544,
"loss": 0.8375,
"step": 5900
},
{
"epoch": 0.2432410222697017,
"grad_norm": 0.9575105309486389,
"learning_rate": 0.0001956464027172409,
"loss": 0.8215,
"step": 5920
},
{
"epoch": 0.24406278248007232,
"grad_norm": 1.0752394199371338,
"learning_rate": 0.00019560747976231054,
"loss": 0.8377,
"step": 5940
},
{
"epoch": 0.24488454269044294,
"grad_norm": 0.9938384890556335,
"learning_rate": 0.0001955683874938929,
"loss": 0.8152,
"step": 5960
},
{
"epoch": 0.24570630290081355,
"grad_norm": 0.9279704689979553,
"learning_rate": 0.00019552912598121735,
"loss": 0.8061,
"step": 5980
},
{
"epoch": 0.24652806311118416,
"grad_norm": 0.9615955948829651,
"learning_rate": 0.00019548969529381306,
"loss": 0.8492,
"step": 6000
},
{
"epoch": 0.24652806311118416,
"eval_loss": 1.0622910261154175,
"eval_runtime": 16.4076,
"eval_samples_per_second": 159.682,
"eval_steps_per_second": 4.998,
"step": 6000
},
{
"epoch": 0.24734982332155478,
"grad_norm": 1.052895188331604,
"learning_rate": 0.0001954500955015087,
"loss": 0.8352,
"step": 6020
},
{
"epoch": 0.2481715835319254,
"grad_norm": 1.0382367372512817,
"learning_rate": 0.00019541032667443243,
"loss": 0.837,
"step": 6040
},
{
"epoch": 0.248993343742296,
"grad_norm": 1.0128381252288818,
"learning_rate": 0.00019537038888301183,
"loss": 0.8292,
"step": 6060
},
{
"epoch": 0.24981510395266662,
"grad_norm": 1.0595107078552246,
"learning_rate": 0.00019533028219797366,
"loss": 0.8431,
"step": 6080
},
{
"epoch": 0.2506368641630372,
"grad_norm": 1.0556915998458862,
"learning_rate": 0.00019529000669034376,
"loss": 0.8316,
"step": 6100
},
{
"epoch": 0.2514586243734078,
"grad_norm": 0.9882562160491943,
"learning_rate": 0.00019524956243144692,
"loss": 0.8377,
"step": 6120
},
{
"epoch": 0.25228038458377844,
"grad_norm": 1.0965570211410522,
"learning_rate": 0.00019520894949290684,
"loss": 0.8383,
"step": 6140
},
{
"epoch": 0.25310214479414905,
"grad_norm": 1.075129747390747,
"learning_rate": 0.0001951681679466459,
"loss": 0.8372,
"step": 6160
},
{
"epoch": 0.25392390500451967,
"grad_norm": 1.1094706058502197,
"learning_rate": 0.00019512721786488509,
"loss": 0.8321,
"step": 6180
},
{
"epoch": 0.2547456652148903,
"grad_norm": 0.9913383722305298,
"learning_rate": 0.00019508609932014382,
"loss": 0.8238,
"step": 6200
},
{
"epoch": 0.2555674254252609,
"grad_norm": 1.10612154006958,
"learning_rate": 0.0001950448123852399,
"loss": 0.8637,
"step": 6220
},
{
"epoch": 0.2563891856356315,
"grad_norm": 1.0326836109161377,
"learning_rate": 0.00019500335713328932,
"loss": 0.84,
"step": 6240
},
{
"epoch": 0.2572109458460021,
"grad_norm": 0.9649391174316406,
"learning_rate": 0.00019496173363770615,
"loss": 0.8217,
"step": 6260
},
{
"epoch": 0.25803270605637274,
"grad_norm": 0.9617984890937805,
"learning_rate": 0.0001949199419722023,
"loss": 0.8537,
"step": 6280
},
{
"epoch": 0.25885446626674335,
"grad_norm": 1.1347591876983643,
"learning_rate": 0.0001948779822107877,
"loss": 0.8131,
"step": 6300
},
{
"epoch": 0.25967622647711397,
"grad_norm": 0.9121894240379333,
"learning_rate": 0.00019483585442776983,
"loss": 0.8407,
"step": 6320
},
{
"epoch": 0.2604979866874846,
"grad_norm": 1.203627586364746,
"learning_rate": 0.00019479355869775374,
"loss": 0.8455,
"step": 6340
},
{
"epoch": 0.2613197468978552,
"grad_norm": 1.0639876127243042,
"learning_rate": 0.00019475109509564192,
"loss": 0.8466,
"step": 6360
},
{
"epoch": 0.2621415071082258,
"grad_norm": 1.0065891742706299,
"learning_rate": 0.00019470846369663413,
"loss": 0.8502,
"step": 6380
},
{
"epoch": 0.2629632673185964,
"grad_norm": 1.0567289590835571,
"learning_rate": 0.00019466566457622734,
"loss": 0.8375,
"step": 6400
},
{
"epoch": 0.26378502752896704,
"grad_norm": 1.1206752061843872,
"learning_rate": 0.0001946226978102154,
"loss": 0.8333,
"step": 6420
},
{
"epoch": 0.26460678773933766,
"grad_norm": 1.0563714504241943,
"learning_rate": 0.00019457956347468925,
"loss": 0.836,
"step": 6440
},
{
"epoch": 0.26542854794970827,
"grad_norm": 1.051429033279419,
"learning_rate": 0.0001945362616460364,
"loss": 0.8287,
"step": 6460
},
{
"epoch": 0.2662503081600789,
"grad_norm": 0.9783703088760376,
"learning_rate": 0.0001944927924009411,
"loss": 0.8148,
"step": 6480
},
{
"epoch": 0.2670720683704495,
"grad_norm": 1.2782011032104492,
"learning_rate": 0.00019444915581638404,
"loss": 0.866,
"step": 6500
},
{
"epoch": 0.2678938285808201,
"grad_norm": 0.8880527019500732,
"learning_rate": 0.0001944053519696422,
"loss": 0.8282,
"step": 6520
},
{
"epoch": 0.26871558879119073,
"grad_norm": 1.0727986097335815,
"learning_rate": 0.0001943613809382889,
"loss": 0.8523,
"step": 6540
},
{
"epoch": 0.26953734900156134,
"grad_norm": 1.0758675336837769,
"learning_rate": 0.00019431724280019342,
"loss": 0.8381,
"step": 6560
},
{
"epoch": 0.27035910921193196,
"grad_norm": 1.10956609249115,
"learning_rate": 0.00019427293763352096,
"loss": 0.8159,
"step": 6580
},
{
"epoch": 0.2711808694223026,
"grad_norm": 1.1092921495437622,
"learning_rate": 0.00019422846551673262,
"loss": 0.8387,
"step": 6600
},
{
"epoch": 0.2720026296326732,
"grad_norm": 1.057029366493225,
"learning_rate": 0.00019418382652858506,
"loss": 0.8306,
"step": 6620
},
{
"epoch": 0.2728243898430438,
"grad_norm": 0.9744523763656616,
"learning_rate": 0.00019413902074813047,
"loss": 0.8582,
"step": 6640
},
{
"epoch": 0.2736461500534144,
"grad_norm": 0.9532150626182556,
"learning_rate": 0.00019409404825471654,
"loss": 0.8531,
"step": 6660
},
{
"epoch": 0.27446791026378503,
"grad_norm": 1.0654603242874146,
"learning_rate": 0.00019404890912798597,
"loss": 0.8437,
"step": 6680
},
{
"epoch": 0.27528967047415565,
"grad_norm": 1.0381238460540771,
"learning_rate": 0.00019400360344787676,
"loss": 0.8618,
"step": 6700
},
{
"epoch": 0.27611143068452626,
"grad_norm": 1.1071590185165405,
"learning_rate": 0.00019395813129462176,
"loss": 0.824,
"step": 6720
},
{
"epoch": 0.2769331908948969,
"grad_norm": 1.1044433116912842,
"learning_rate": 0.00019391249274874865,
"loss": 0.8332,
"step": 6740
},
{
"epoch": 0.2777549511052675,
"grad_norm": 1.145683765411377,
"learning_rate": 0.00019386668789107977,
"loss": 0.83,
"step": 6760
},
{
"epoch": 0.2785767113156381,
"grad_norm": 1.196481466293335,
"learning_rate": 0.00019382071680273198,
"loss": 0.8368,
"step": 6780
},
{
"epoch": 0.2793984715260087,
"grad_norm": 1.0642255544662476,
"learning_rate": 0.00019377457956511662,
"loss": 0.8439,
"step": 6800
},
{
"epoch": 0.28022023173637933,
"grad_norm": 1.0132989883422852,
"learning_rate": 0.0001937282762599391,
"loss": 0.8403,
"step": 6820
},
{
"epoch": 0.28104199194674995,
"grad_norm": 1.1021807193756104,
"learning_rate": 0.00019368180696919905,
"loss": 0.8373,
"step": 6840
},
{
"epoch": 0.28186375215712056,
"grad_norm": 1.0249390602111816,
"learning_rate": 0.00019363517177519004,
"loss": 0.8246,
"step": 6860
},
{
"epoch": 0.2826855123674912,
"grad_norm": 1.0310267210006714,
"learning_rate": 0.0001935883707604993,
"loss": 0.8266,
"step": 6880
},
{
"epoch": 0.2835072725778618,
"grad_norm": 1.1064010858535767,
"learning_rate": 0.00019354140400800797,
"loss": 0.8403,
"step": 6900
},
{
"epoch": 0.2843290327882324,
"grad_norm": 1.0507344007492065,
"learning_rate": 0.0001934942716008904,
"loss": 0.8365,
"step": 6920
},
{
"epoch": 0.285150792998603,
"grad_norm": 1.2774583101272583,
"learning_rate": 0.00019344697362261458,
"loss": 0.8394,
"step": 6940
},
{
"epoch": 0.28597255320897363,
"grad_norm": 1.1305222511291504,
"learning_rate": 0.0001933995101569415,
"loss": 0.8446,
"step": 6960
},
{
"epoch": 0.28679431341934425,
"grad_norm": 1.0519880056381226,
"learning_rate": 0.00019335188128792542,
"loss": 0.8589,
"step": 6980
},
{
"epoch": 0.28761607362971486,
"grad_norm": 0.9783779978752136,
"learning_rate": 0.00019330408709991326,
"loss": 0.8364,
"step": 7000
},
{
"epoch": 0.2884378338400855,
"grad_norm": 1.033211350440979,
"learning_rate": 0.0001932561276775449,
"loss": 0.8412,
"step": 7020
},
{
"epoch": 0.2892595940504561,
"grad_norm": 1.1831096410751343,
"learning_rate": 0.00019320800310575288,
"loss": 0.8495,
"step": 7040
},
{
"epoch": 0.2900813542608267,
"grad_norm": 1.0064650774002075,
"learning_rate": 0.00019315971346976193,
"loss": 0.8482,
"step": 7060
},
{
"epoch": 0.2909031144711973,
"grad_norm": 1.0976219177246094,
"learning_rate": 0.00019311125885508945,
"loss": 0.8173,
"step": 7080
},
{
"epoch": 0.29172487468156794,
"grad_norm": 1.110113263130188,
"learning_rate": 0.00019306263934754477,
"loss": 0.8285,
"step": 7100
},
{
"epoch": 0.29254663489193855,
"grad_norm": 1.0953800678253174,
"learning_rate": 0.0001930138550332292,
"loss": 0.8382,
"step": 7120
},
{
"epoch": 0.29336839510230917,
"grad_norm": 1.049208402633667,
"learning_rate": 0.0001929649059985362,
"loss": 0.8238,
"step": 7140
},
{
"epoch": 0.2941901553126798,
"grad_norm": 1.096807837486267,
"learning_rate": 0.0001929157923301506,
"loss": 0.8139,
"step": 7160
},
{
"epoch": 0.2950119155230504,
"grad_norm": 1.2268364429473877,
"learning_rate": 0.00019286651411504893,
"loss": 0.8349,
"step": 7180
},
{
"epoch": 0.295833675733421,
"grad_norm": 1.0595046281814575,
"learning_rate": 0.00019281707144049915,
"loss": 0.8448,
"step": 7200
},
{
"epoch": 0.2966554359437916,
"grad_norm": 1.0562009811401367,
"learning_rate": 0.00019276746439406047,
"loss": 0.834,
"step": 7220
},
{
"epoch": 0.29747719615416224,
"grad_norm": 1.0876846313476562,
"learning_rate": 0.0001927176930635831,
"loss": 0.8201,
"step": 7240
},
{
"epoch": 0.29829895636453285,
"grad_norm": 0.9775159955024719,
"learning_rate": 0.00019266775753720822,
"loss": 0.8612,
"step": 7260
},
{
"epoch": 0.29912071657490347,
"grad_norm": 1.1259452104568481,
"learning_rate": 0.00019261765790336784,
"loss": 0.8139,
"step": 7280
},
{
"epoch": 0.2999424767852741,
"grad_norm": 1.00784432888031,
"learning_rate": 0.00019256739425078454,
"loss": 0.8237,
"step": 7300
},
{
"epoch": 0.3007642369956447,
"grad_norm": 1.0394659042358398,
"learning_rate": 0.00019251696666847137,
"loss": 0.8692,
"step": 7320
},
{
"epoch": 0.3015859972060153,
"grad_norm": 1.1670759916305542,
"learning_rate": 0.00019246637524573173,
"loss": 0.8478,
"step": 7340
},
{
"epoch": 0.3024077574163859,
"grad_norm": 1.0558823347091675,
"learning_rate": 0.0001924156200721591,
"loss": 0.8413,
"step": 7360
},
{
"epoch": 0.30322951762675654,
"grad_norm": 1.0909830331802368,
"learning_rate": 0.000192364701237637,
"loss": 0.8584,
"step": 7380
},
{
"epoch": 0.30405127783712715,
"grad_norm": 0.9988498091697693,
"learning_rate": 0.00019231361883233878,
"loss": 0.8212,
"step": 7400
},
{
"epoch": 0.3048730380474977,
"grad_norm": 1.0689078569412231,
"learning_rate": 0.00019226237294672744,
"loss": 0.8464,
"step": 7420
},
{
"epoch": 0.3056947982578683,
"grad_norm": 1.0881212949752808,
"learning_rate": 0.00019221096367155548,
"loss": 0.8607,
"step": 7440
},
{
"epoch": 0.30651655846823894,
"grad_norm": 1.1320979595184326,
"learning_rate": 0.00019215939109786477,
"loss": 0.8359,
"step": 7460
},
{
"epoch": 0.30733831867860956,
"grad_norm": 0.8782603144645691,
"learning_rate": 0.0001921076553169864,
"loss": 0.8558,
"step": 7480
},
{
"epoch": 0.30816007888898017,
"grad_norm": 1.0550236701965332,
"learning_rate": 0.00019205575642054044,
"loss": 0.8088,
"step": 7500
},
{
"epoch": 0.3089818390993508,
"grad_norm": 1.1194961071014404,
"learning_rate": 0.0001920036945004358,
"loss": 0.8233,
"step": 7520
},
{
"epoch": 0.3098035993097214,
"grad_norm": 1.10885751247406,
"learning_rate": 0.00019195146964887024,
"loss": 0.8341,
"step": 7540
},
{
"epoch": 0.310625359520092,
"grad_norm": 1.015629768371582,
"learning_rate": 0.0001918990819583298,
"loss": 0.8547,
"step": 7560
},
{
"epoch": 0.31144711973046263,
"grad_norm": 1.142196536064148,
"learning_rate": 0.0001918465315215892,
"loss": 0.856,
"step": 7580
},
{
"epoch": 0.31226887994083324,
"grad_norm": 0.9691776037216187,
"learning_rate": 0.0001917938184317111,
"loss": 0.8295,
"step": 7600
},
{
"epoch": 0.31309064015120386,
"grad_norm": 0.9759687781333923,
"learning_rate": 0.00019174094278204636,
"loss": 0.8333,
"step": 7620
},
{
"epoch": 0.3139124003615745,
"grad_norm": 1.148779034614563,
"learning_rate": 0.00019168790466623375,
"loss": 0.8263,
"step": 7640
},
{
"epoch": 0.3147341605719451,
"grad_norm": 1.2947983741760254,
"learning_rate": 0.00019163470417819963,
"loss": 0.8627,
"step": 7660
},
{
"epoch": 0.3155559207823157,
"grad_norm": 1.0895724296569824,
"learning_rate": 0.00019158134141215792,
"loss": 0.8429,
"step": 7680
},
{
"epoch": 0.3163776809926863,
"grad_norm": 1.070154070854187,
"learning_rate": 0.0001915278164626101,
"loss": 0.8188,
"step": 7700
},
{
"epoch": 0.31719944120305693,
"grad_norm": 1.00252366065979,
"learning_rate": 0.00019147412942434463,
"loss": 0.8482,
"step": 7720
},
{
"epoch": 0.31802120141342755,
"grad_norm": 1.273224949836731,
"learning_rate": 0.00019142028039243717,
"loss": 0.8326,
"step": 7740
},
{
"epoch": 0.31884296162379816,
"grad_norm": 1.2733259201049805,
"learning_rate": 0.00019136626946225017,
"loss": 0.8377,
"step": 7760
},
{
"epoch": 0.3196647218341688,
"grad_norm": 0.9789584279060364,
"learning_rate": 0.00019131209672943288,
"loss": 0.8355,
"step": 7780
},
{
"epoch": 0.3204864820445394,
"grad_norm": 1.1093429327011108,
"learning_rate": 0.00019125776228992103,
"loss": 0.836,
"step": 7800
},
{
"epoch": 0.32130824225491,
"grad_norm": 1.0080488920211792,
"learning_rate": 0.00019120326623993668,
"loss": 0.8375,
"step": 7820
},
{
"epoch": 0.3221300024652806,
"grad_norm": 1.0703438520431519,
"learning_rate": 0.0001911486086759882,
"loss": 0.8306,
"step": 7840
},
{
"epoch": 0.32295176267565123,
"grad_norm": 1.0767182111740112,
"learning_rate": 0.0001910937896948699,
"loss": 0.8483,
"step": 7860
},
{
"epoch": 0.32377352288602185,
"grad_norm": 0.9747923612594604,
"learning_rate": 0.00019103880939366197,
"loss": 0.8489,
"step": 7880
},
{
"epoch": 0.32459528309639246,
"grad_norm": 1.0351313352584839,
"learning_rate": 0.00019098366786973032,
"loss": 0.8351,
"step": 7900
},
{
"epoch": 0.3254170433067631,
"grad_norm": 1.049666404724121,
"learning_rate": 0.00019092836522072631,
"loss": 0.8271,
"step": 7920
},
{
"epoch": 0.3262388035171337,
"grad_norm": 1.0181846618652344,
"learning_rate": 0.0001908729015445867,
"loss": 0.8352,
"step": 7940
},
{
"epoch": 0.3270605637275043,
"grad_norm": 1.1390068531036377,
"learning_rate": 0.00019081727693953337,
"loss": 0.8392,
"step": 7960
},
{
"epoch": 0.3278823239378749,
"grad_norm": 1.0242650508880615,
"learning_rate": 0.00019076149150407324,
"loss": 0.8398,
"step": 7980
},
{
"epoch": 0.32870408414824553,
"grad_norm": 1.0052822828292847,
"learning_rate": 0.0001907083464611993,
"loss": 0.8257,
"step": 8000
},
{
"epoch": 0.32870408414824553,
"eval_loss": 1.0951544046401978,
"eval_runtime": 16.4946,
"eval_samples_per_second": 158.84,
"eval_steps_per_second": 4.971,
"step": 8000
},
{
"epoch": 0.32952584435861615,
"grad_norm": 1.0948665142059326,
"learning_rate": 0.00019065224769085476,
"loss": 0.8343,
"step": 8020
},
{
"epoch": 0.33034760456898676,
"grad_norm": 1.1585348844528198,
"learning_rate": 0.00019059598838235754,
"loss": 0.8272,
"step": 8040
},
{
"epoch": 0.3311693647793574,
"grad_norm": 1.0641188621520996,
"learning_rate": 0.00019053956863533854,
"loss": 0.8499,
"step": 8060
},
{
"epoch": 0.331991124989728,
"grad_norm": 1.0595240592956543,
"learning_rate": 0.00019048298854971272,
"loss": 0.8407,
"step": 8080
},
{
"epoch": 0.3328128852000986,
"grad_norm": 1.1425433158874512,
"learning_rate": 0.00019042624822567908,
"loss": 0.8671,
"step": 8100
},
{
"epoch": 0.3336346454104692,
"grad_norm": 1.1736706495285034,
"learning_rate": 0.0001903693477637204,
"loss": 0.8115,
"step": 8120
},
{
"epoch": 0.33445640562083984,
"grad_norm": 1.062788724899292,
"learning_rate": 0.000190312287264603,
"loss": 0.8416,
"step": 8140
},
{
"epoch": 0.33527816583121045,
"grad_norm": 1.0873854160308838,
"learning_rate": 0.0001902550668293766,
"loss": 0.8513,
"step": 8160
},
{
"epoch": 0.33609992604158107,
"grad_norm": 1.0588126182556152,
"learning_rate": 0.00019019768655937423,
"loss": 0.8232,
"step": 8180
},
{
"epoch": 0.3369216862519517,
"grad_norm": 1.0401087999343872,
"learning_rate": 0.00019014014655621193,
"loss": 0.8369,
"step": 8200
},
{
"epoch": 0.3377434464623223,
"grad_norm": 0.988150417804718,
"learning_rate": 0.0001900824469217886,
"loss": 0.8076,
"step": 8220
},
{
"epoch": 0.3385652066726929,
"grad_norm": 1.2257081270217896,
"learning_rate": 0.00019002458775828584,
"loss": 0.824,
"step": 8240
},
{
"epoch": 0.3393869668830635,
"grad_norm": 0.9933615922927856,
"learning_rate": 0.0001899665691681678,
"loss": 0.8391,
"step": 8260
},
{
"epoch": 0.34020872709343414,
"grad_norm": 1.1779851913452148,
"learning_rate": 0.0001899083912541809,
"loss": 0.8333,
"step": 8280
},
{
"epoch": 0.34103048730380475,
"grad_norm": 1.118120551109314,
"learning_rate": 0.0001898500541193538,
"loss": 0.8333,
"step": 8300
},
{
"epoch": 0.34185224751417537,
"grad_norm": 1.0865180492401123,
"learning_rate": 0.00018979155786699706,
"loss": 0.8497,
"step": 8320
},
{
"epoch": 0.342674007724546,
"grad_norm": 0.9836400747299194,
"learning_rate": 0.000189732902600703,
"loss": 0.8547,
"step": 8340
},
{
"epoch": 0.3434957679349166,
"grad_norm": 1.1521192789077759,
"learning_rate": 0.00018967408842434562,
"loss": 0.8476,
"step": 8360
},
{
"epoch": 0.3443175281452872,
"grad_norm": 1.1589045524597168,
"learning_rate": 0.0001896151154420803,
"loss": 0.8479,
"step": 8380
},
{
"epoch": 0.3451392883556578,
"grad_norm": 1.0231435298919678,
"learning_rate": 0.00018955598375834364,
"loss": 0.863,
"step": 8400
},
{
"epoch": 0.34596104856602844,
"grad_norm": 1.0295898914337158,
"learning_rate": 0.00018949669347785328,
"loss": 0.8224,
"step": 8420
},
{
"epoch": 0.34678280877639905,
"grad_norm": 1.1238269805908203,
"learning_rate": 0.00018943724470560778,
"loss": 0.8276,
"step": 8440
},
{
"epoch": 0.34760456898676967,
"grad_norm": 1.0870115756988525,
"learning_rate": 0.00018937763754688634,
"loss": 0.8372,
"step": 8460
},
{
"epoch": 0.3484263291971403,
"grad_norm": 1.1568728685379028,
"learning_rate": 0.0001893178721072486,
"loss": 0.862,
"step": 8480
},
{
"epoch": 0.3492480894075109,
"grad_norm": 1.0375559329986572,
"learning_rate": 0.00018925794849253462,
"loss": 0.843,
"step": 8500
},
{
"epoch": 0.3500698496178815,
"grad_norm": 1.1177926063537598,
"learning_rate": 0.00018919786680886443,
"loss": 0.8303,
"step": 8520
},
{
"epoch": 0.3508916098282521,
"grad_norm": 1.1874128580093384,
"learning_rate": 0.00018913762716263818,
"loss": 0.8429,
"step": 8540
},
{
"epoch": 0.35171337003862274,
"grad_norm": 1.2707151174545288,
"learning_rate": 0.00018907722966053555,
"loss": 0.8257,
"step": 8560
},
{
"epoch": 0.35253513024899336,
"grad_norm": 1.1079628467559814,
"learning_rate": 0.00018901667440951586,
"loss": 0.849,
"step": 8580
},
{
"epoch": 0.35335689045936397,
"grad_norm": 1.1875925064086914,
"learning_rate": 0.0001889559615168179,
"loss": 0.8704,
"step": 8600
},
{
"epoch": 0.3541786506697346,
"grad_norm": 1.1461087465286255,
"learning_rate": 0.00018889509108995943,
"loss": 0.8292,
"step": 8620
},
{
"epoch": 0.3550004108801052,
"grad_norm": 1.3481261730194092,
"learning_rate": 0.0001888340632367373,
"loss": 0.8163,
"step": 8640
},
{
"epoch": 0.3558221710904758,
"grad_norm": 1.1863452196121216,
"learning_rate": 0.00018877287806522722,
"loss": 0.8532,
"step": 8660
},
{
"epoch": 0.35664393130084643,
"grad_norm": 1.3977798223495483,
"learning_rate": 0.00018871153568378332,
"loss": 0.8714,
"step": 8680
},
{
"epoch": 0.35746569151121704,
"grad_norm": 1.1754332780838013,
"learning_rate": 0.0001886500362010383,
"loss": 0.8243,
"step": 8700
},
{
"epoch": 0.35828745172158766,
"grad_norm": 1.1255104541778564,
"learning_rate": 0.000188588379725903,
"loss": 0.8025,
"step": 8720
},
{
"epoch": 0.35910921193195827,
"grad_norm": 1.0885831117630005,
"learning_rate": 0.00018852656636756627,
"loss": 0.8179,
"step": 8740
},
{
"epoch": 0.3599309721423289,
"grad_norm": 1.121172308921814,
"learning_rate": 0.00018846459623549482,
"loss": 0.831,
"step": 8760
},
{
"epoch": 0.3607527323526995,
"grad_norm": 1.2006275653839111,
"learning_rate": 0.000188402469439433,
"loss": 0.8451,
"step": 8780
},
{
"epoch": 0.3615744925630701,
"grad_norm": 1.0075160264968872,
"learning_rate": 0.00018834018608940257,
"loss": 0.8326,
"step": 8800
},
{
"epoch": 0.36239625277344073,
"grad_norm": 1.3210777044296265,
"learning_rate": 0.00018827774629570252,
"loss": 0.8466,
"step": 8820
},
{
"epoch": 0.36321801298381134,
"grad_norm": 1.157143473625183,
"learning_rate": 0.00018821515016890895,
"loss": 0.84,
"step": 8840
},
{
"epoch": 0.36403977319418196,
"grad_norm": 1.0349316596984863,
"learning_rate": 0.0001881523978198748,
"loss": 0.8166,
"step": 8860
},
{
"epoch": 0.3648615334045526,
"grad_norm": 1.1739977598190308,
"learning_rate": 0.00018808948935972964,
"loss": 0.8154,
"step": 8880
},
{
"epoch": 0.3656832936149232,
"grad_norm": 1.0839564800262451,
"learning_rate": 0.00018802642489987946,
"loss": 0.8446,
"step": 8900
},
{
"epoch": 0.3665050538252938,
"grad_norm": 1.126232624053955,
"learning_rate": 0.0001879632045520066,
"loss": 0.8262,
"step": 8920
},
{
"epoch": 0.3673268140356644,
"grad_norm": 1.1430919170379639,
"learning_rate": 0.00018789982842806947,
"loss": 0.8449,
"step": 8940
},
{
"epoch": 0.36814857424603503,
"grad_norm": 1.1284793615341187,
"learning_rate": 0.00018783629664030226,
"loss": 0.8482,
"step": 8960
},
{
"epoch": 0.36897033445640565,
"grad_norm": 1.0018378496170044,
"learning_rate": 0.00018777260930121487,
"loss": 0.8353,
"step": 8980
},
{
"epoch": 0.36979209466677626,
"grad_norm": 1.055388331413269,
"learning_rate": 0.00018771515779286891,
"loss": 0.8397,
"step": 9000
},
{
"epoch": 0.3706138548771469,
"grad_norm": 1.152448296546936,
"learning_rate": 0.00018765117521722443,
"loss": 0.8607,
"step": 9020
},
{
"epoch": 0.3714356150875175,
"grad_norm": 1.1177656650543213,
"learning_rate": 0.00018758703741809558,
"loss": 0.8254,
"step": 9040
},
{
"epoch": 0.37225737529788805,
"grad_norm": 1.336777687072754,
"learning_rate": 0.00018752274450906545,
"loss": 0.8367,
"step": 9060
},
{
"epoch": 0.37307913550825866,
"grad_norm": 1.19560706615448,
"learning_rate": 0.00018745829660399185,
"loss": 0.8528,
"step": 9080
},
{
"epoch": 0.3739008957186293,
"grad_norm": 1.2169603109359741,
"learning_rate": 0.00018739369381700707,
"loss": 0.8425,
"step": 9100
},
{
"epoch": 0.3747226559289999,
"grad_norm": 1.1716234683990479,
"learning_rate": 0.00018732893626251766,
"loss": 0.8358,
"step": 9120
},
{
"epoch": 0.3755444161393705,
"grad_norm": 1.0265463590621948,
"learning_rate": 0.00018726402405520425,
"loss": 0.8447,
"step": 9140
},
{
"epoch": 0.3763661763497411,
"grad_norm": 1.2216025590896606,
"learning_rate": 0.00018719895731002137,
"loss": 0.8247,
"step": 9160
},
{
"epoch": 0.37718793656011174,
"grad_norm": 1.0820845365524292,
"learning_rate": 0.0001871337361421972,
"loss": 0.8452,
"step": 9180
},
{
"epoch": 0.37800969677048235,
"grad_norm": 1.019952416419983,
"learning_rate": 0.00018706836066723347,
"loss": 0.8493,
"step": 9200
},
{
"epoch": 0.37883145698085297,
"grad_norm": 1.0316121578216553,
"learning_rate": 0.00018700283100090502,
"loss": 0.8447,
"step": 9220
},
{
"epoch": 0.3796532171912236,
"grad_norm": 1.1700369119644165,
"learning_rate": 0.00018693714725925994,
"loss": 0.8337,
"step": 9240
},
{
"epoch": 0.3804749774015942,
"grad_norm": 1.032667636871338,
"learning_rate": 0.00018687130955861902,
"loss": 0.8325,
"step": 9260
},
{
"epoch": 0.3812967376119648,
"grad_norm": 1.2093219757080078,
"learning_rate": 0.0001868053180155758,
"loss": 0.8295,
"step": 9280
},
{
"epoch": 0.3821184978223354,
"grad_norm": 1.1522185802459717,
"learning_rate": 0.00018673917274699618,
"loss": 0.841,
"step": 9300
},
{
"epoch": 0.38294025803270604,
"grad_norm": 1.2028223276138306,
"learning_rate": 0.00018667287387001834,
"loss": 0.8432,
"step": 9320
},
{
"epoch": 0.38376201824307665,
"grad_norm": 1.2288753986358643,
"learning_rate": 0.00018660642150205255,
"loss": 0.8586,
"step": 9340
},
{
"epoch": 0.38458377845344727,
"grad_norm": 1.1289194822311401,
"learning_rate": 0.00018653981576078075,
"loss": 0.8408,
"step": 9360
},
{
"epoch": 0.3854055386638179,
"grad_norm": 1.208264708518982,
"learning_rate": 0.00018647305676415665,
"loss": 0.8233,
"step": 9380
},
{
"epoch": 0.3862272988741885,
"grad_norm": 1.23066246509552,
"learning_rate": 0.00018640949387227146,
"loss": 0.8227,
"step": 9400
},
{
"epoch": 0.3870490590845591,
"grad_norm": 1.2344571352005005,
"learning_rate": 0.0001863424363680021,
"loss": 0.8376,
"step": 9420
},
{
"epoch": 0.3878708192949297,
"grad_norm": 1.1864609718322754,
"learning_rate": 0.00018627522595792413,
"loss": 0.8391,
"step": 9440
},
{
"epoch": 0.38869257950530034,
"grad_norm": 1.109244465827942,
"learning_rate": 0.00018620786276106203,
"loss": 0.8256,
"step": 9460
},
{
"epoch": 0.38951433971567095,
"grad_norm": 1.0809723138809204,
"learning_rate": 0.00018614034689671082,
"loss": 0.8492,
"step": 9480
},
{
"epoch": 0.39033609992604157,
"grad_norm": 1.2196381092071533,
"learning_rate": 0.00018607267848443591,
"loss": 0.8415,
"step": 9500
},
{
"epoch": 0.3911578601364122,
"grad_norm": 1.1039822101593018,
"learning_rate": 0.00018600485764407282,
"loss": 0.8464,
"step": 9520
},
{
"epoch": 0.3919796203467828,
"grad_norm": 1.115871548652649,
"learning_rate": 0.00018593688449572703,
"loss": 0.8368,
"step": 9540
},
{
"epoch": 0.3928013805571534,
"grad_norm": 1.0675318241119385,
"learning_rate": 0.0001858687591597738,
"loss": 0.8276,
"step": 9560
},
{
"epoch": 0.393623140767524,
"grad_norm": 1.1515909433364868,
"learning_rate": 0.00018580048175685784,
"loss": 0.8259,
"step": 9580
},
{
"epoch": 0.39444490097789464,
"grad_norm": 1.1502107381820679,
"learning_rate": 0.00018573205240789316,
"loss": 0.8549,
"step": 9600
},
{
"epoch": 0.39526666118826526,
"grad_norm": 1.0891849994659424,
"learning_rate": 0.00018566347123406284,
"loss": 0.8396,
"step": 9620
},
{
"epoch": 0.39608842139863587,
"grad_norm": 1.1536388397216797,
"learning_rate": 0.00018559473835681896,
"loss": 0.8401,
"step": 9640
},
{
"epoch": 0.3969101816090065,
"grad_norm": 1.320541262626648,
"learning_rate": 0.00018552585389788203,
"loss": 0.8454,
"step": 9660
},
{
"epoch": 0.3977319418193771,
"grad_norm": 0.96424800157547,
"learning_rate": 0.00018545681797924125,
"loss": 0.8257,
"step": 9680
},
{
"epoch": 0.3985537020297477,
"grad_norm": 1.1451895236968994,
"learning_rate": 0.00018538763072315382,
"loss": 0.8327,
"step": 9700
},
{
"epoch": 0.39937546224011833,
"grad_norm": 1.1693811416625977,
"learning_rate": 0.00018531829225214508,
"loss": 0.8342,
"step": 9720
},
{
"epoch": 0.40019722245048894,
"grad_norm": 1.0285801887512207,
"learning_rate": 0.00018524880268900812,
"loss": 0.8261,
"step": 9740
},
{
"epoch": 0.40101898266085956,
"grad_norm": 1.1713870763778687,
"learning_rate": 0.00018517916215680363,
"loss": 0.8597,
"step": 9760
},
{
"epoch": 0.40184074287123017,
"grad_norm": 1.117725133895874,
"learning_rate": 0.00018510937077885958,
"loss": 0.8281,
"step": 9780
},
{
"epoch": 0.4026625030816008,
"grad_norm": 1.2378820180892944,
"learning_rate": 0.00018503942867877118,
"loss": 0.8619,
"step": 9800
},
{
"epoch": 0.4034842632919714,
"grad_norm": 1.2209067344665527,
"learning_rate": 0.00018496933598040048,
"loss": 0.8549,
"step": 9820
},
{
"epoch": 0.404306023502342,
"grad_norm": 1.0837441682815552,
"learning_rate": 0.00018489909280787627,
"loss": 0.8575,
"step": 9840
},
{
"epoch": 0.40512778371271263,
"grad_norm": 1.2987329959869385,
"learning_rate": 0.00018482869928559379,
"loss": 0.8467,
"step": 9860
},
{
"epoch": 0.40594954392308324,
"grad_norm": 1.216752052307129,
"learning_rate": 0.00018475815553821456,
"loss": 0.8333,
"step": 9880
},
{
"epoch": 0.40677130413345386,
"grad_norm": 1.2210928201675415,
"learning_rate": 0.0001846874616906661,
"loss": 0.8377,
"step": 9900
},
{
"epoch": 0.4075930643438245,
"grad_norm": 1.007938027381897,
"learning_rate": 0.0001846166178681418,
"loss": 0.8422,
"step": 9920
},
{
"epoch": 0.4084148245541951,
"grad_norm": 1.1827200651168823,
"learning_rate": 0.00018454562419610058,
"loss": 0.8456,
"step": 9940
},
{
"epoch": 0.4092365847645657,
"grad_norm": 1.2097376585006714,
"learning_rate": 0.00018447448080026682,
"loss": 0.8408,
"step": 9960
},
{
"epoch": 0.4100583449749363,
"grad_norm": 1.0128288269042969,
"learning_rate": 0.00018440318780662998,
"loss": 0.8517,
"step": 9980
},
{
"epoch": 0.41088010518530693,
"grad_norm": 1.1385818719863892,
"learning_rate": 0.00018433174534144444,
"loss": 0.8532,
"step": 10000
},
{
"epoch": 0.41088010518530693,
"eval_loss": 1.1022228002548218,
"eval_runtime": 16.4762,
"eval_samples_per_second": 159.018,
"eval_steps_per_second": 4.977,
"step": 10000
},
{
"epoch": 0.41170186539567755,
"grad_norm": 1.2568473815917969,
"learning_rate": 0.00018426015353122934,
"loss": 0.8304,
"step": 10020
},
{
"epoch": 0.41252362560604816,
"grad_norm": 1.2366386651992798,
"learning_rate": 0.00018418841250276825,
"loss": 0.8244,
"step": 10040
},
{
"epoch": 0.4133453858164188,
"grad_norm": 1.0323954820632935,
"learning_rate": 0.000184116522383109,
"loss": 0.8167,
"step": 10060
},
{
"epoch": 0.4141671460267894,
"grad_norm": 1.1676981449127197,
"learning_rate": 0.00018404448329956344,
"loss": 0.8356,
"step": 10080
},
{
"epoch": 0.41498890623716,
"grad_norm": 1.2117750644683838,
"learning_rate": 0.0001839722953797073,
"loss": 0.8483,
"step": 10100
},
{
"epoch": 0.4158106664475306,
"grad_norm": 1.1228010654449463,
"learning_rate": 0.00018389995875137978,
"loss": 0.818,
"step": 10120
},
{
"epoch": 0.41663242665790123,
"grad_norm": 1.03129243850708,
"learning_rate": 0.00018382747354268351,
"loss": 0.8556,
"step": 10140
},
{
"epoch": 0.41745418686827185,
"grad_norm": 1.239634394645691,
"learning_rate": 0.00018375847508876958,
"loss": 0.8581,
"step": 10160
},
{
"epoch": 0.41827594707864246,
"grad_norm": 1.046134114265442,
"learning_rate": 0.0001836857005178056,
"loss": 0.8241,
"step": 10180
},
{
"epoch": 0.4190977072890131,
"grad_norm": 1.3205509185791016,
"learning_rate": 0.00018361277774590765,
"loss": 0.8409,
"step": 10200
},
{
"epoch": 0.4199194674993837,
"grad_norm": 1.1549128293991089,
"learning_rate": 0.00018353970690221646,
"loss": 0.8252,
"step": 10220
},
{
"epoch": 0.4207412277097543,
"grad_norm": 1.1062310934066772,
"learning_rate": 0.00018346648811613477,
"loss": 0.8428,
"step": 10240
},
{
"epoch": 0.4215629879201249,
"grad_norm": 1.2223172187805176,
"learning_rate": 0.0001833931215173274,
"loss": 0.8118,
"step": 10260
},
{
"epoch": 0.42238474813049554,
"grad_norm": 1.2306350469589233,
"learning_rate": 0.00018331960723572105,
"loss": 0.8615,
"step": 10280
},
{
"epoch": 0.42320650834086615,
"grad_norm": 1.301054835319519,
"learning_rate": 0.0001832459454015038,
"loss": 0.8388,
"step": 10300
},
{
"epoch": 0.42402826855123676,
"grad_norm": 1.1266446113586426,
"learning_rate": 0.00018317213614512507,
"loss": 0.844,
"step": 10320
},
{
"epoch": 0.4248500287616074,
"grad_norm": 1.1231412887573242,
"learning_rate": 0.0001830981795972954,
"loss": 0.8581,
"step": 10340
},
{
"epoch": 0.425671788971978,
"grad_norm": 1.0722932815551758,
"learning_rate": 0.00018302407588898612,
"loss": 0.844,
"step": 10360
},
{
"epoch": 0.4264935491823486,
"grad_norm": 1.122125506401062,
"learning_rate": 0.0001829498251514292,
"loss": 0.817,
"step": 10380
},
{
"epoch": 0.4273153093927192,
"grad_norm": 1.1031594276428223,
"learning_rate": 0.00018287542751611703,
"loss": 0.8448,
"step": 10400
},
{
"epoch": 0.42813706960308984,
"grad_norm": 1.166870355606079,
"learning_rate": 0.00018280088311480201,
"loss": 0.8607,
"step": 10420
},
{
"epoch": 0.42895882981346045,
"grad_norm": 1.1672008037567139,
"learning_rate": 0.0001827261920794966,
"loss": 0.8206,
"step": 10440
},
{
"epoch": 0.42978059002383107,
"grad_norm": 1.7684413194656372,
"learning_rate": 0.00018265135454247284,
"loss": 0.8503,
"step": 10460
},
{
"epoch": 0.4306023502342017,
"grad_norm": 1.3215503692626953,
"learning_rate": 0.00018257637063626226,
"loss": 0.8535,
"step": 10480
},
{
"epoch": 0.4314241104445723,
"grad_norm": 1.1748920679092407,
"learning_rate": 0.0001825012404936556,
"loss": 0.8364,
"step": 10500
},
{
"epoch": 0.4322458706549429,
"grad_norm": 1.0914533138275146,
"learning_rate": 0.00018242596424770252,
"loss": 0.8144,
"step": 10520
},
{
"epoch": 0.4330676308653135,
"grad_norm": 1.2216529846191406,
"learning_rate": 0.0001823505420317115,
"loss": 0.8566,
"step": 10540
},
{
"epoch": 0.43388939107568414,
"grad_norm": 1.2634260654449463,
"learning_rate": 0.00018227497397924948,
"loss": 0.8372,
"step": 10560
},
{
"epoch": 0.43471115128605475,
"grad_norm": 1.244780421257019,
"learning_rate": 0.00018219926022414163,
"loss": 0.8397,
"step": 10580
},
{
"epoch": 0.43553291149642537,
"grad_norm": 1.177216649055481,
"learning_rate": 0.00018212340090047118,
"loss": 0.8122,
"step": 10600
},
{
"epoch": 0.436354671706796,
"grad_norm": 1.1915186643600464,
"learning_rate": 0.0001820473961425792,
"loss": 0.829,
"step": 10620
},
{
"epoch": 0.4371764319171666,
"grad_norm": 0.9915897250175476,
"learning_rate": 0.00018197124608506423,
"loss": 0.8401,
"step": 10640
},
{
"epoch": 0.4379981921275372,
"grad_norm": 1.1870684623718262,
"learning_rate": 0.00018189495086278214,
"loss": 0.8481,
"step": 10660
},
{
"epoch": 0.4388199523379078,
"grad_norm": 1.1735584735870361,
"learning_rate": 0.00018181851061084596,
"loss": 0.8539,
"step": 10680
},
{
"epoch": 0.4396417125482784,
"grad_norm": 1.2218267917633057,
"learning_rate": 0.00018174192546462542,
"loss": 0.8416,
"step": 10700
},
{
"epoch": 0.440463472758649,
"grad_norm": 1.2126885652542114,
"learning_rate": 0.000181665195559747,
"loss": 0.8353,
"step": 10720
},
{
"epoch": 0.4412852329690196,
"grad_norm": 1.1790574789047241,
"learning_rate": 0.00018158832103209335,
"loss": 0.8235,
"step": 10740
},
{
"epoch": 0.44210699317939023,
"grad_norm": 1.153855323791504,
"learning_rate": 0.00018151130201780343,
"loss": 0.8415,
"step": 10760
},
{
"epoch": 0.44292875338976084,
"grad_norm": 1.0696359872817993,
"learning_rate": 0.00018143413865327198,
"loss": 0.8514,
"step": 10780
},
{
"epoch": 0.44375051360013146,
"grad_norm": 1.2022265195846558,
"learning_rate": 0.00018135683107514932,
"loss": 0.8102,
"step": 10800
},
{
"epoch": 0.44457227381050207,
"grad_norm": 1.1839139461517334,
"learning_rate": 0.00018127937942034127,
"loss": 0.8243,
"step": 10820
},
{
"epoch": 0.4453940340208727,
"grad_norm": 1.1672074794769287,
"learning_rate": 0.00018120178382600874,
"loss": 0.8707,
"step": 10840
},
{
"epoch": 0.4462157942312433,
"grad_norm": 1.1144109964370728,
"learning_rate": 0.00018112404442956754,
"loss": 0.833,
"step": 10860
},
{
"epoch": 0.4470375544416139,
"grad_norm": 1.2560542821884155,
"learning_rate": 0.00018104616136868816,
"loss": 0.8164,
"step": 10880
},
{
"epoch": 0.44785931465198453,
"grad_norm": 1.1620038747787476,
"learning_rate": 0.00018096813478129552,
"loss": 0.857,
"step": 10900
},
{
"epoch": 0.44868107486235514,
"grad_norm": 1.0956732034683228,
"learning_rate": 0.0001808899648055687,
"loss": 0.8069,
"step": 10920
},
{
"epoch": 0.44950283507272576,
"grad_norm": 1.153667688369751,
"learning_rate": 0.00018081165157994068,
"loss": 0.8228,
"step": 10940
},
{
"epoch": 0.4503245952830964,
"grad_norm": 1.1663360595703125,
"learning_rate": 0.00018073319524309822,
"loss": 0.8438,
"step": 10960
},
{
"epoch": 0.451146355493467,
"grad_norm": 1.0581029653549194,
"learning_rate": 0.00018065459593398137,
"loss": 0.8245,
"step": 10980
},
{
"epoch": 0.4519681157038376,
"grad_norm": 1.3234822750091553,
"learning_rate": 0.0001805758537917835,
"loss": 0.8462,
"step": 11000
},
{
"epoch": 0.4527898759142082,
"grad_norm": 1.1863617897033691,
"learning_rate": 0.0001804969689559509,
"loss": 0.815,
"step": 11020
},
{
"epoch": 0.45361163612457883,
"grad_norm": 1.2528035640716553,
"learning_rate": 0.00018041794156618252,
"loss": 0.8239,
"step": 11040
},
{
"epoch": 0.45443339633494945,
"grad_norm": 2.205885887145996,
"learning_rate": 0.00018033877176242975,
"loss": 0.8061,
"step": 11060
},
{
"epoch": 0.45525515654532006,
"grad_norm": 1.2284212112426758,
"learning_rate": 0.00018025945968489626,
"loss": 0.837,
"step": 11080
},
{
"epoch": 0.4560769167556907,
"grad_norm": 1.1616712808609009,
"learning_rate": 0.00018018000547403765,
"loss": 0.8502,
"step": 11100
},
{
"epoch": 0.4568986769660613,
"grad_norm": 1.188586711883545,
"learning_rate": 0.00018010040927056117,
"loss": 0.842,
"step": 11120
},
{
"epoch": 0.4577204371764319,
"grad_norm": 1.1483261585235596,
"learning_rate": 0.00018002067121542558,
"loss": 0.8503,
"step": 11140
},
{
"epoch": 0.4585421973868025,
"grad_norm": 1.1956408023834229,
"learning_rate": 0.00017994079144984087,
"loss": 0.8452,
"step": 11160
},
{
"epoch": 0.45936395759717313,
"grad_norm": 1.281135082244873,
"learning_rate": 0.00017986077011526792,
"loss": 0.8641,
"step": 11180
},
{
"epoch": 0.46018571780754375,
"grad_norm": 1.3326410055160522,
"learning_rate": 0.00017978060735341836,
"loss": 0.81,
"step": 11200
},
{
"epoch": 0.46100747801791436,
"grad_norm": 1.331778645515442,
"learning_rate": 0.0001797003033062543,
"loss": 0.8401,
"step": 11220
},
{
"epoch": 0.461829238228285,
"grad_norm": 1.1928249597549438,
"learning_rate": 0.000179619858115988,
"loss": 0.8165,
"step": 11240
},
{
"epoch": 0.4626509984386556,
"grad_norm": 1.3430322408676147,
"learning_rate": 0.0001795392719250817,
"loss": 0.8581,
"step": 11260
},
{
"epoch": 0.4634727586490262,
"grad_norm": 1.142386555671692,
"learning_rate": 0.00017945854487624733,
"loss": 0.8397,
"step": 11280
},
{
"epoch": 0.4642945188593968,
"grad_norm": 1.1435790061950684,
"learning_rate": 0.0001793776771124463,
"loss": 0.8781,
"step": 11300
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.1941109895706177,
"learning_rate": 0.00017929666877688919,
"loss": 0.8508,
"step": 11320
},
{
"epoch": 0.46593803928013805,
"grad_norm": 1.1792101860046387,
"learning_rate": 0.00017921552001303552,
"loss": 0.8559,
"step": 11340
},
{
"epoch": 0.46675979949050866,
"grad_norm": 1.2072590589523315,
"learning_rate": 0.00017913423096459354,
"loss": 0.8416,
"step": 11360
},
{
"epoch": 0.4675815597008793,
"grad_norm": 1.220444679260254,
"learning_rate": 0.00017905280177551983,
"loss": 0.8261,
"step": 11380
},
{
"epoch": 0.4684033199112499,
"grad_norm": 1.1701765060424805,
"learning_rate": 0.00017897123259001926,
"loss": 0.8509,
"step": 11400
},
{
"epoch": 0.4692250801216205,
"grad_norm": 1.2121479511260986,
"learning_rate": 0.00017888952355254455,
"loss": 0.8424,
"step": 11420
},
{
"epoch": 0.4700468403319911,
"grad_norm": 1.1610180139541626,
"learning_rate": 0.0001788076748077962,
"loss": 0.8433,
"step": 11440
},
{
"epoch": 0.47086860054236174,
"grad_norm": 1.0766757726669312,
"learning_rate": 0.0001787256865007219,
"loss": 0.8175,
"step": 11460
},
{
"epoch": 0.47169036075273235,
"grad_norm": 1.2801762819290161,
"learning_rate": 0.00017864355877651676,
"loss": 0.8416,
"step": 11480
},
{
"epoch": 0.47251212096310297,
"grad_norm": 1.167050838470459,
"learning_rate": 0.00017856129178062257,
"loss": 0.842,
"step": 11500
},
{
"epoch": 0.4733338811734736,
"grad_norm": 1.0567585229873657,
"learning_rate": 0.0001784788856587279,
"loss": 0.8467,
"step": 11520
},
{
"epoch": 0.4741556413838442,
"grad_norm": 1.101453185081482,
"learning_rate": 0.00017839634055676762,
"loss": 0.836,
"step": 11540
},
{
"epoch": 0.4749774015942148,
"grad_norm": 1.2371095418930054,
"learning_rate": 0.00017831365662092274,
"loss": 0.8623,
"step": 11560
},
{
"epoch": 0.4757991618045854,
"grad_norm": 1.2750577926635742,
"learning_rate": 0.00017823083399762018,
"loss": 0.8413,
"step": 11580
},
{
"epoch": 0.47662092201495604,
"grad_norm": 1.2494827508926392,
"learning_rate": 0.00017814787283353245,
"loss": 0.8393,
"step": 11600
},
{
"epoch": 0.47744268222532665,
"grad_norm": 1.1090929508209229,
"learning_rate": 0.0001780647732755773,
"loss": 0.8224,
"step": 11620
},
{
"epoch": 0.47826444243569727,
"grad_norm": 1.2686065435409546,
"learning_rate": 0.00017798153547091773,
"loss": 0.8468,
"step": 11640
},
{
"epoch": 0.4790862026460679,
"grad_norm": 1.0384360551834106,
"learning_rate": 0.0001778981595669615,
"loss": 0.8346,
"step": 11660
},
{
"epoch": 0.4799079628564385,
"grad_norm": 1.1490122079849243,
"learning_rate": 0.0001778146457113608,
"loss": 0.8565,
"step": 11680
},
{
"epoch": 0.4807297230668091,
"grad_norm": 1.1408191919326782,
"learning_rate": 0.00017773099405201236,
"loss": 0.8337,
"step": 11700
},
{
"epoch": 0.4815514832771797,
"grad_norm": 1.1228752136230469,
"learning_rate": 0.00017764720473705675,
"loss": 0.8236,
"step": 11720
},
{
"epoch": 0.48237324348755034,
"grad_norm": 1.0747302770614624,
"learning_rate": 0.00017756327791487847,
"loss": 0.8439,
"step": 11740
},
{
"epoch": 0.48319500369792096,
"grad_norm": 1.1848806142807007,
"learning_rate": 0.0001774792137341054,
"loss": 0.8433,
"step": 11760
},
{
"epoch": 0.48401676390829157,
"grad_norm": 1.1458165645599365,
"learning_rate": 0.00017739501234360875,
"loss": 0.8513,
"step": 11780
},
{
"epoch": 0.4848385241186622,
"grad_norm": 1.2221580743789673,
"learning_rate": 0.00017731067389250272,
"loss": 0.8345,
"step": 11800
},
{
"epoch": 0.4856602843290328,
"grad_norm": 1.2535064220428467,
"learning_rate": 0.00017722619853014423,
"loss": 0.842,
"step": 11820
},
{
"epoch": 0.4864820445394034,
"grad_norm": 1.2956807613372803,
"learning_rate": 0.0001771415864061326,
"loss": 0.833,
"step": 11840
},
{
"epoch": 0.487303804749774,
"grad_norm": 1.3538552522659302,
"learning_rate": 0.0001770568376703094,
"loss": 0.8389,
"step": 11860
},
{
"epoch": 0.48812556496014464,
"grad_norm": 1.325257658958435,
"learning_rate": 0.00017697195247275813,
"loss": 0.8041,
"step": 11880
},
{
"epoch": 0.48894732517051526,
"grad_norm": 1.1765714883804321,
"learning_rate": 0.00017688693096380392,
"loss": 0.8406,
"step": 11900
},
{
"epoch": 0.48976908538088587,
"grad_norm": 1.1218301057815552,
"learning_rate": 0.00017680177329401333,
"loss": 0.8562,
"step": 11920
},
{
"epoch": 0.4905908455912565,
"grad_norm": 1.1272341012954712,
"learning_rate": 0.00017671647961419406,
"loss": 0.8638,
"step": 11940
},
{
"epoch": 0.4914126058016271,
"grad_norm": 1.0992316007614136,
"learning_rate": 0.00017663105007539463,
"loss": 0.8156,
"step": 11960
},
{
"epoch": 0.4922343660119977,
"grad_norm": 1.1406649351119995,
"learning_rate": 0.00017654548482890414,
"loss": 0.8444,
"step": 11980
},
{
"epoch": 0.49305612622236833,
"grad_norm": 1.251295566558838,
"learning_rate": 0.00017645978402625214,
"loss": 0.8549,
"step": 12000
},
{
"epoch": 0.49305612622236833,
"eval_loss": 1.1104093790054321,
"eval_runtime": 16.7503,
"eval_samples_per_second": 156.415,
"eval_steps_per_second": 4.895,
"step": 12000
},
{
"epoch": 0.49387788643273894,
"grad_norm": 1.153436303138733,
"learning_rate": 0.00017637394781920812,
"loss": 0.8262,
"step": 12020
},
{
"epoch": 0.49469964664310956,
"grad_norm": 1.200875997543335,
"learning_rate": 0.00017628797635978134,
"loss": 0.8644,
"step": 12040
},
{
"epoch": 0.4955214068534802,
"grad_norm": 1.2446619272232056,
"learning_rate": 0.00017620186980022072,
"loss": 0.8485,
"step": 12060
},
{
"epoch": 0.4963431670638508,
"grad_norm": 1.1602336168289185,
"learning_rate": 0.00017611562829301429,
"loss": 0.8318,
"step": 12080
},
{
"epoch": 0.4971649272742214,
"grad_norm": 1.4564729928970337,
"learning_rate": 0.00017602925199088917,
"loss": 0.8375,
"step": 12100
},
{
"epoch": 0.497986687484592,
"grad_norm": 1.107946753501892,
"learning_rate": 0.00017594274104681108,
"loss": 0.8292,
"step": 12120
},
{
"epoch": 0.49880844769496263,
"grad_norm": 1.250048279762268,
"learning_rate": 0.00017585609561398426,
"loss": 0.817,
"step": 12140
},
{
"epoch": 0.49963020790533325,
"grad_norm": 0.956064760684967,
"learning_rate": 0.00017576931584585117,
"loss": 0.8311,
"step": 12160
},
{
"epoch": 0.5004519681157038,
"grad_norm": 1.1825581789016724,
"learning_rate": 0.000175682401896092,
"loss": 0.8232,
"step": 12180
},
{
"epoch": 0.5012737283260744,
"grad_norm": 1.26679265499115,
"learning_rate": 0.00017559535391862476,
"loss": 0.7985,
"step": 12200
},
{
"epoch": 0.502095488536445,
"grad_norm": 1.1247515678405762,
"learning_rate": 0.00017550817206760463,
"loss": 0.8177,
"step": 12220
},
{
"epoch": 0.5029172487468156,
"grad_norm": 1.2112337350845337,
"learning_rate": 0.00017542085649742403,
"loss": 0.851,
"step": 12240
},
{
"epoch": 0.5037390089571863,
"grad_norm": 1.3968725204467773,
"learning_rate": 0.00017533340736271207,
"loss": 0.8422,
"step": 12260
},
{
"epoch": 0.5045607691675569,
"grad_norm": 1.2992043495178223,
"learning_rate": 0.00017524582481833444,
"loss": 0.8492,
"step": 12280
},
{
"epoch": 0.5053825293779275,
"grad_norm": 1.1721656322479248,
"learning_rate": 0.0001751581090193931,
"loss": 0.8321,
"step": 12300
},
{
"epoch": 0.5062042895882981,
"grad_norm": 1.3727058172225952,
"learning_rate": 0.00017507026012122595,
"loss": 0.8666,
"step": 12320
},
{
"epoch": 0.5070260497986687,
"grad_norm": 1.27950119972229,
"learning_rate": 0.0001749822782794067,
"loss": 0.8643,
"step": 12340
},
{
"epoch": 0.5078478100090393,
"grad_norm": 0.9998101592063904,
"learning_rate": 0.00017489416364974432,
"loss": 0.8319,
"step": 12360
},
{
"epoch": 0.50866957021941,
"grad_norm": 1.210250973701477,
"learning_rate": 0.0001748059163882831,
"loss": 0.8183,
"step": 12380
},
{
"epoch": 0.5094913304297806,
"grad_norm": 1.2826182842254639,
"learning_rate": 0.00017471753665130213,
"loss": 0.8421,
"step": 12400
},
{
"epoch": 0.5103130906401512,
"grad_norm": 1.2036994695663452,
"learning_rate": 0.00017462902459531508,
"loss": 0.8363,
"step": 12420
},
{
"epoch": 0.5111348508505218,
"grad_norm": 1.0195825099945068,
"learning_rate": 0.00017454038037707008,
"loss": 0.8338,
"step": 12440
},
{
"epoch": 0.5119566110608924,
"grad_norm": 1.2347939014434814,
"learning_rate": 0.00017445160415354916,
"loss": 0.839,
"step": 12460
},
{
"epoch": 0.512778371271263,
"grad_norm": 1.196473240852356,
"learning_rate": 0.00017436269608196817,
"loss": 0.85,
"step": 12480
},
{
"epoch": 0.5136001314816336,
"grad_norm": 1.4037846326828003,
"learning_rate": 0.00017427365631977648,
"loss": 0.84,
"step": 12500
},
{
"epoch": 0.5144218916920043,
"grad_norm": 1.2222518920898438,
"learning_rate": 0.00017418448502465667,
"loss": 0.855,
"step": 12520
},
{
"epoch": 0.5152436519023749,
"grad_norm": 1.2747788429260254,
"learning_rate": 0.0001740951823545242,
"loss": 0.8545,
"step": 12540
},
{
"epoch": 0.5160654121127455,
"grad_norm": 1.2375946044921875,
"learning_rate": 0.00017400574846752724,
"loss": 0.8351,
"step": 12560
},
{
"epoch": 0.5168871723231161,
"grad_norm": 1.084808588027954,
"learning_rate": 0.00017391618352204633,
"loss": 0.8344,
"step": 12580
},
{
"epoch": 0.5177089325334867,
"grad_norm": 1.228043794631958,
"learning_rate": 0.00017382648767669408,
"loss": 0.8321,
"step": 12600
},
{
"epoch": 0.5185306927438573,
"grad_norm": 1.125532865524292,
"learning_rate": 0.00017373666109031497,
"loss": 0.8383,
"step": 12620
},
{
"epoch": 0.5193524529542279,
"grad_norm": 1.1619880199432373,
"learning_rate": 0.00017364670392198492,
"loss": 0.8318,
"step": 12640
},
{
"epoch": 0.5201742131645986,
"grad_norm": 1.1036595106124878,
"learning_rate": 0.00017355661633101116,
"loss": 0.8265,
"step": 12660
},
{
"epoch": 0.5209959733749692,
"grad_norm": 1.084410548210144,
"learning_rate": 0.0001734663984769319,
"loss": 0.8293,
"step": 12680
},
{
"epoch": 0.5218177335853398,
"grad_norm": 1.3622374534606934,
"learning_rate": 0.000173376050519516,
"loss": 0.8496,
"step": 12700
},
{
"epoch": 0.5226394937957104,
"grad_norm": 1.2676513195037842,
"learning_rate": 0.00017328557261876273,
"loss": 0.8357,
"step": 12720
},
{
"epoch": 0.523461254006081,
"grad_norm": 1.1944401264190674,
"learning_rate": 0.00017319496493490148,
"loss": 0.8162,
"step": 12740
},
{
"epoch": 0.5242830142164516,
"grad_norm": 1.2380664348602295,
"learning_rate": 0.00017310422762839155,
"loss": 0.8319,
"step": 12760
},
{
"epoch": 0.5251047744268222,
"grad_norm": 1.3865251541137695,
"learning_rate": 0.00017301336085992163,
"loss": 0.8181,
"step": 12780
},
{
"epoch": 0.5259265346371929,
"grad_norm": 1.1436952352523804,
"learning_rate": 0.00017292236479040984,
"loss": 0.827,
"step": 12800
},
{
"epoch": 0.5267482948475635,
"grad_norm": 1.4181216955184937,
"learning_rate": 0.0001728312395810032,
"loss": 0.8297,
"step": 12820
},
{
"epoch": 0.5275700550579341,
"grad_norm": 1.4452232122421265,
"learning_rate": 0.00017273998539307742,
"loss": 0.8397,
"step": 12840
},
{
"epoch": 0.5283918152683047,
"grad_norm": 1.1824501752853394,
"learning_rate": 0.00017264860238823667,
"loss": 0.8088,
"step": 12860
},
{
"epoch": 0.5292135754786753,
"grad_norm": 1.1516762971878052,
"learning_rate": 0.0001725570907283132,
"loss": 0.8235,
"step": 12880
},
{
"epoch": 0.5300353356890459,
"grad_norm": 1.0846797227859497,
"learning_rate": 0.00017246545057536712,
"loss": 0.8149,
"step": 12900
},
{
"epoch": 0.5308570958994165,
"grad_norm": 1.244383692741394,
"learning_rate": 0.00017237368209168608,
"loss": 0.8488,
"step": 12920
},
{
"epoch": 0.5316788561097872,
"grad_norm": 1.0443450212478638,
"learning_rate": 0.000172281785439785,
"loss": 0.8089,
"step": 12940
},
{
"epoch": 0.5325006163201578,
"grad_norm": 1.282185673713684,
"learning_rate": 0.00017218976078240582,
"loss": 0.8164,
"step": 12960
},
{
"epoch": 0.5333223765305284,
"grad_norm": 1.264277696609497,
"learning_rate": 0.0001720976082825171,
"loss": 0.8512,
"step": 12980
},
{
"epoch": 0.534144136740899,
"grad_norm": 1.1357461214065552,
"learning_rate": 0.00017200532810331378,
"loss": 0.8368,
"step": 13000
},
{
"epoch": 0.5349658969512696,
"grad_norm": 1.066361427307129,
"learning_rate": 0.00017191292040821696,
"loss": 0.8273,
"step": 13020
},
{
"epoch": 0.5357876571616402,
"grad_norm": 1.2606313228607178,
"learning_rate": 0.00017182038536087363,
"loss": 0.8513,
"step": 13040
},
{
"epoch": 0.5366094173720108,
"grad_norm": 1.1224803924560547,
"learning_rate": 0.00017172772312515618,
"loss": 0.8417,
"step": 13060
},
{
"epoch": 0.5374311775823815,
"grad_norm": 1.2137328386306763,
"learning_rate": 0.00017163957634245275,
"loss": 0.8264,
"step": 13080
},
{
"epoch": 0.5382529377927521,
"grad_norm": 1.1143057346343994,
"learning_rate": 0.00017154666656159665,
"loss": 0.8317,
"step": 13100
},
{
"epoch": 0.5390746980031227,
"grad_norm": 1.087512493133545,
"learning_rate": 0.00017145363007710135,
"loss": 0.8194,
"step": 13120
},
{
"epoch": 0.5398964582134933,
"grad_norm": 1.2861449718475342,
"learning_rate": 0.0001713604670537273,
"loss": 0.843,
"step": 13140
},
{
"epoch": 0.5407182184238639,
"grad_norm": 1.2231301069259644,
"learning_rate": 0.00017126717765645908,
"loss": 0.8192,
"step": 13160
},
{
"epoch": 0.5415399786342345,
"grad_norm": 1.2475714683532715,
"learning_rate": 0.00017117376205050502,
"loss": 0.8524,
"step": 13180
},
{
"epoch": 0.5423617388446051,
"grad_norm": 1.1694715023040771,
"learning_rate": 0.00017108022040129695,
"loss": 0.8381,
"step": 13200
},
{
"epoch": 0.5431834990549758,
"grad_norm": 1.29911470413208,
"learning_rate": 0.00017098655287448993,
"loss": 0.8056,
"step": 13220
},
{
"epoch": 0.5440052592653464,
"grad_norm": 1.063346028327942,
"learning_rate": 0.00017089275963596195,
"loss": 0.8328,
"step": 13240
},
{
"epoch": 0.544827019475717,
"grad_norm": 1.6195141077041626,
"learning_rate": 0.0001707988408518136,
"loss": 0.8597,
"step": 13260
},
{
"epoch": 0.5456487796860876,
"grad_norm": 1.2005921602249146,
"learning_rate": 0.00017070479668836785,
"loss": 0.8415,
"step": 13280
},
{
"epoch": 0.5464705398964582,
"grad_norm": 1.1937131881713867,
"learning_rate": 0.00017061533875220887,
"loss": 0.8171,
"step": 13300
},
{
"epoch": 0.5472923001068288,
"grad_norm": 1.1822235584259033,
"learning_rate": 0.0001705210505783601,
"loss": 0.8596,
"step": 13320
},
{
"epoch": 0.5481140603171994,
"grad_norm": 1.0105253458023071,
"learning_rate": 0.00017042663751715912,
"loss": 0.8266,
"step": 13340
},
{
"epoch": 0.5489358205275701,
"grad_norm": 1.20473051071167,
"learning_rate": 0.00017033209973580418,
"loss": 0.8437,
"step": 13360
},
{
"epoch": 0.5497575807379407,
"grad_norm": 1.237752914428711,
"learning_rate": 0.00017023743740171438,
"loss": 0.8278,
"step": 13380
},
{
"epoch": 0.5505793409483113,
"grad_norm": 1.2165151834487915,
"learning_rate": 0.00017014265068252948,
"loss": 0.8494,
"step": 13400
},
{
"epoch": 0.5514011011586819,
"grad_norm": 1.2971493005752563,
"learning_rate": 0.00017004773974610941,
"loss": 0.8312,
"step": 13420
},
{
"epoch": 0.5522228613690525,
"grad_norm": 1.091404676437378,
"learning_rate": 0.0001699527047605342,
"loss": 0.8247,
"step": 13440
},
{
"epoch": 0.5530446215794231,
"grad_norm": 1.1684538125991821,
"learning_rate": 0.00016985754589410342,
"loss": 0.8149,
"step": 13460
},
{
"epoch": 0.5538663817897937,
"grad_norm": 1.201493740081787,
"learning_rate": 0.00016976226331533617,
"loss": 0.8288,
"step": 13480
},
{
"epoch": 0.5546881420001644,
"grad_norm": 1.2418882846832275,
"learning_rate": 0.0001696668571929705,
"loss": 0.8286,
"step": 13500
},
{
"epoch": 0.555509902210535,
"grad_norm": 1.073002576828003,
"learning_rate": 0.00016957132769596336,
"loss": 0.8178,
"step": 13520
},
{
"epoch": 0.5563316624209056,
"grad_norm": 1.156518816947937,
"learning_rate": 0.0001694756749934901,
"loss": 0.8322,
"step": 13540
},
{
"epoch": 0.5571534226312762,
"grad_norm": 1.2362408638000488,
"learning_rate": 0.00016937989925494432,
"loss": 0.8338,
"step": 13560
},
{
"epoch": 0.5579751828416468,
"grad_norm": 1.1928801536560059,
"learning_rate": 0.00016928400064993745,
"loss": 0.8243,
"step": 13580
},
{
"epoch": 0.5587969430520174,
"grad_norm": 1.1574454307556152,
"learning_rate": 0.0001691879793482986,
"loss": 0.8124,
"step": 13600
},
{
"epoch": 0.559618703262388,
"grad_norm": 1.0951565504074097,
"learning_rate": 0.00016909183552007398,
"loss": 0.8575,
"step": 13620
},
{
"epoch": 0.5604404634727587,
"grad_norm": 1.2538108825683594,
"learning_rate": 0.00016899556933552704,
"loss": 0.8149,
"step": 13640
},
{
"epoch": 0.5612622236831293,
"grad_norm": 1.2423375844955444,
"learning_rate": 0.0001688991809651377,
"loss": 0.8114,
"step": 13660
},
{
"epoch": 0.5620839838934999,
"grad_norm": 1.0723458528518677,
"learning_rate": 0.00016880267057960239,
"loss": 0.8384,
"step": 13680
},
{
"epoch": 0.5629057441038705,
"grad_norm": 1.0973854064941406,
"learning_rate": 0.00016870603834983356,
"loss": 0.8373,
"step": 13700
},
{
"epoch": 0.5637275043142411,
"grad_norm": 1.1496849060058594,
"learning_rate": 0.00016860928444695943,
"loss": 0.831,
"step": 13720
},
{
"epoch": 0.5645492645246117,
"grad_norm": 1.175132393836975,
"learning_rate": 0.00016851240904232375,
"loss": 0.8486,
"step": 13740
},
{
"epoch": 0.5653710247349824,
"grad_norm": 1.018654227256775,
"learning_rate": 0.0001684154123074854,
"loss": 0.8332,
"step": 13760
},
{
"epoch": 0.566192784945353,
"grad_norm": 1.3159968852996826,
"learning_rate": 0.00016831829441421809,
"loss": 0.8336,
"step": 13780
},
{
"epoch": 0.5670145451557236,
"grad_norm": 1.1875556707382202,
"learning_rate": 0.0001682210555345102,
"loss": 0.8209,
"step": 13800
},
{
"epoch": 0.5678363053660942,
"grad_norm": 1.2860361337661743,
"learning_rate": 0.00016812369584056424,
"loss": 0.8453,
"step": 13820
},
{
"epoch": 0.5686580655764648,
"grad_norm": 1.2040901184082031,
"learning_rate": 0.00016802621550479675,
"loss": 0.8204,
"step": 13840
},
{
"epoch": 0.5694798257868354,
"grad_norm": 1.1987425088882446,
"learning_rate": 0.00016792861469983793,
"loss": 0.8347,
"step": 13860
},
{
"epoch": 0.570301585997206,
"grad_norm": 1.1014000177383423,
"learning_rate": 0.00016783089359853127,
"loss": 0.8142,
"step": 13880
},
{
"epoch": 0.5711233462075767,
"grad_norm": 1.1898833513259888,
"learning_rate": 0.00016773305237393328,
"loss": 0.8198,
"step": 13900
},
{
"epoch": 0.5719451064179473,
"grad_norm": 1.2249850034713745,
"learning_rate": 0.00016763509119931334,
"loss": 0.8039,
"step": 13920
},
{
"epoch": 0.5727668666283179,
"grad_norm": 1.2241109609603882,
"learning_rate": 0.00016753701024815304,
"loss": 0.8458,
"step": 13940
},
{
"epoch": 0.5735886268386885,
"grad_norm": 1.2025922536849976,
"learning_rate": 0.00016743880969414624,
"loss": 0.8103,
"step": 13960
},
{
"epoch": 0.5744103870490591,
"grad_norm": 1.1043455600738525,
"learning_rate": 0.0001673404897111986,
"loss": 0.834,
"step": 13980
},
{
"epoch": 0.5752321472594297,
"grad_norm": 1.2452826499938965,
"learning_rate": 0.00016724205047342715,
"loss": 0.8216,
"step": 14000
},
{
"epoch": 0.5752321472594297,
"eval_loss": 1.1319005489349365,
"eval_runtime": 16.4046,
"eval_samples_per_second": 159.712,
"eval_steps_per_second": 4.999,
"step": 14000
},
{
"epoch": 0.5760539074698003,
"grad_norm": 1.248207688331604,
"learning_rate": 0.00016714349215516032,
"loss": 0.824,
"step": 14020
},
{
"epoch": 0.576875667680171,
"grad_norm": 1.3563847541809082,
"learning_rate": 0.0001670448149309372,
"loss": 0.8587,
"step": 14040
},
{
"epoch": 0.5776974278905416,
"grad_norm": 1.2468847036361694,
"learning_rate": 0.00016694601897550762,
"loss": 0.8481,
"step": 14060
},
{
"epoch": 0.5785191881009122,
"grad_norm": 1.139793872833252,
"learning_rate": 0.0001668471044638316,
"loss": 0.8353,
"step": 14080
},
{
"epoch": 0.5793409483112828,
"grad_norm": 1.0366131067276,
"learning_rate": 0.0001667480715710791,
"loss": 0.8091,
"step": 14100
},
{
"epoch": 0.5801627085216534,
"grad_norm": 1.1087323427200317,
"learning_rate": 0.00016664892047262977,
"loss": 0.8198,
"step": 14120
},
{
"epoch": 0.580984468732024,
"grad_norm": 1.137624740600586,
"learning_rate": 0.0001665496513440726,
"loss": 0.832,
"step": 14140
},
{
"epoch": 0.5818062289423946,
"grad_norm": 1.1337158679962158,
"learning_rate": 0.00016645026436120551,
"loss": 0.8438,
"step": 14160
},
{
"epoch": 0.5826279891527653,
"grad_norm": 1.1277921199798584,
"learning_rate": 0.00016635075970003525,
"loss": 0.8158,
"step": 14180
},
{
"epoch": 0.5834497493631359,
"grad_norm": 1.1939393281936646,
"learning_rate": 0.00016625113753677693,
"loss": 0.835,
"step": 14200
},
{
"epoch": 0.5842715095735065,
"grad_norm": 1.343687891960144,
"learning_rate": 0.00016615139804785367,
"loss": 0.8231,
"step": 14220
},
{
"epoch": 0.5850932697838771,
"grad_norm": 1.1290326118469238,
"learning_rate": 0.00016605154140989647,
"loss": 0.8456,
"step": 14240
},
{
"epoch": 0.5859150299942477,
"grad_norm": 1.274527668952942,
"learning_rate": 0.00016595156779974376,
"loss": 0.85,
"step": 14260
},
{
"epoch": 0.5867367902046183,
"grad_norm": 1.2629293203353882,
"learning_rate": 0.00016585147739444104,
"loss": 0.8245,
"step": 14280
},
{
"epoch": 0.587558550414989,
"grad_norm": 1.1809213161468506,
"learning_rate": 0.0001657512703712408,
"loss": 0.8063,
"step": 14300
},
{
"epoch": 0.5883803106253596,
"grad_norm": 1.0857917070388794,
"learning_rate": 0.00016565094690760193,
"loss": 0.8158,
"step": 14320
},
{
"epoch": 0.5892020708357302,
"grad_norm": 1.1015921831130981,
"learning_rate": 0.00016555050718118953,
"loss": 0.8501,
"step": 14340
},
{
"epoch": 0.5900238310461008,
"grad_norm": 1.3138587474822998,
"learning_rate": 0.00016544995136987467,
"loss": 0.8416,
"step": 14360
},
{
"epoch": 0.5908455912564714,
"grad_norm": 1.476505994796753,
"learning_rate": 0.00016534927965173392,
"loss": 0.8526,
"step": 14380
},
{
"epoch": 0.591667351466842,
"grad_norm": 1.1754002571105957,
"learning_rate": 0.00016524849220504918,
"loss": 0.8136,
"step": 14400
},
{
"epoch": 0.5924891116772126,
"grad_norm": 1.2151134014129639,
"learning_rate": 0.00016514758920830724,
"loss": 0.821,
"step": 14420
},
{
"epoch": 0.5933108718875832,
"grad_norm": 1.2440420389175415,
"learning_rate": 0.0001650465708401995,
"loss": 0.8274,
"step": 14440
},
{
"epoch": 0.5941326320979539,
"grad_norm": 1.1762789487838745,
"learning_rate": 0.0001649454372796218,
"loss": 0.8379,
"step": 14460
},
{
"epoch": 0.5949543923083245,
"grad_norm": 1.1778429746627808,
"learning_rate": 0.0001648441887056738,
"loss": 0.7901,
"step": 14480
},
{
"epoch": 0.5957761525186951,
"grad_norm": 1.1886394023895264,
"learning_rate": 0.0001647478961925985,
"loss": 0.8431,
"step": 14500
},
{
"epoch": 0.5965979127290657,
"grad_norm": 1.256296992301941,
"learning_rate": 0.0001646515001956767,
"loss": 0.8181,
"step": 14520
},
{
"epoch": 0.5974196729394363,
"grad_norm": 1.2841947078704834,
"learning_rate": 0.00016454991909764397,
"loss": 0.8255,
"step": 14540
},
{
"epoch": 0.5982414331498069,
"grad_norm": 1.1896756887435913,
"learning_rate": 0.0001644482236866732,
"loss": 0.8304,
"step": 14560
},
{
"epoch": 0.5990631933601775,
"grad_norm": 1.4413669109344482,
"learning_rate": 0.00016434641414285922,
"loss": 0.8374,
"step": 14580
},
{
"epoch": 0.5998849535705482,
"grad_norm": 1.2350515127182007,
"learning_rate": 0.00016424449064649882,
"loss": 0.8342,
"step": 14600
},
{
"epoch": 0.6007067137809188,
"grad_norm": 1.3465436697006226,
"learning_rate": 0.0001641424533780907,
"loss": 0.8066,
"step": 14620
},
{
"epoch": 0.6015284739912894,
"grad_norm": 1.351413369178772,
"learning_rate": 0.00016404030251833502,
"loss": 0.7952,
"step": 14640
},
{
"epoch": 0.60235023420166,
"grad_norm": 1.1581010818481445,
"learning_rate": 0.00016393803824813304,
"loss": 0.8005,
"step": 14660
},
{
"epoch": 0.6031719944120306,
"grad_norm": 1.218668818473816,
"learning_rate": 0.00016383566074858695,
"loss": 0.8293,
"step": 14680
},
{
"epoch": 0.6039937546224012,
"grad_norm": 1.1792479753494263,
"learning_rate": 0.00016373317020099943,
"loss": 0.8247,
"step": 14700
},
{
"epoch": 0.6048155148327718,
"grad_norm": 1.2358107566833496,
"learning_rate": 0.00016363056678687335,
"loss": 0.8418,
"step": 14720
},
{
"epoch": 0.6056372750431425,
"grad_norm": 1.2904804944992065,
"learning_rate": 0.00016352785068791142,
"loss": 0.8317,
"step": 14740
},
{
"epoch": 0.6064590352535131,
"grad_norm": 1.1909780502319336,
"learning_rate": 0.000163425022086016,
"loss": 0.8221,
"step": 14760
},
{
"epoch": 0.6072807954638837,
"grad_norm": 1.199325442314148,
"learning_rate": 0.00016332208116328853,
"loss": 0.819,
"step": 14780
},
{
"epoch": 0.6081025556742543,
"grad_norm": 1.4012378454208374,
"learning_rate": 0.00016321902810202956,
"loss": 0.8104,
"step": 14800
},
{
"epoch": 0.6089243158846248,
"grad_norm": 1.1930865049362183,
"learning_rate": 0.00016311586308473812,
"loss": 0.8227,
"step": 14820
},
{
"epoch": 0.6097460760949954,
"grad_norm": 1.055925965309143,
"learning_rate": 0.00016301258629411144,
"loss": 0.8337,
"step": 14840
},
{
"epoch": 0.610567836305366,
"grad_norm": 1.1309912204742432,
"learning_rate": 0.00016290919791304487,
"loss": 0.8154,
"step": 14860
},
{
"epoch": 0.6113895965157367,
"grad_norm": 1.272495985031128,
"learning_rate": 0.0001628056981246312,
"loss": 0.8277,
"step": 14880
},
{
"epoch": 0.6122113567261073,
"grad_norm": 1.4250491857528687,
"learning_rate": 0.00016270208711216059,
"loss": 0.8606,
"step": 14900
},
{
"epoch": 0.6130331169364779,
"grad_norm": 1.079211950302124,
"learning_rate": 0.00016259836505912024,
"loss": 0.8485,
"step": 14920
},
{
"epoch": 0.6138548771468485,
"grad_norm": 1.2047349214553833,
"learning_rate": 0.00016249453214919383,
"loss": 0.8123,
"step": 14940
},
{
"epoch": 0.6146766373572191,
"grad_norm": 1.344088077545166,
"learning_rate": 0.0001623905885662615,
"loss": 0.8155,
"step": 14960
},
{
"epoch": 0.6154983975675897,
"grad_norm": 1.3220460414886475,
"learning_rate": 0.0001622865344943993,
"loss": 0.8544,
"step": 14980
},
{
"epoch": 0.6163201577779603,
"grad_norm": 1.2667044401168823,
"learning_rate": 0.00016218237011787905,
"loss": 0.8049,
"step": 15000
},
{
"epoch": 0.617141917988331,
"grad_norm": 1.2205132246017456,
"learning_rate": 0.0001620780956211678,
"loss": 0.8248,
"step": 15020
},
{
"epoch": 0.6179636781987016,
"grad_norm": 1.2170429229736328,
"learning_rate": 0.00016197371118892762,
"loss": 0.8176,
"step": 15040
},
{
"epoch": 0.6187854384090722,
"grad_norm": 1.1776739358901978,
"learning_rate": 0.00016186921700601535,
"loss": 0.8358,
"step": 15060
},
{
"epoch": 0.6196071986194428,
"grad_norm": 1.305285096168518,
"learning_rate": 0.00016176461325748215,
"loss": 0.8279,
"step": 15080
},
{
"epoch": 0.6204289588298134,
"grad_norm": 1.4242192506790161,
"learning_rate": 0.00016165990012857316,
"loss": 0.8331,
"step": 15100
},
{
"epoch": 0.621250719040184,
"grad_norm": 1.2353391647338867,
"learning_rate": 0.00016155507780472734,
"loss": 0.833,
"step": 15120
},
{
"epoch": 0.6220724792505546,
"grad_norm": 1.0569748878479004,
"learning_rate": 0.0001614501464715769,
"loss": 0.8491,
"step": 15140
},
{
"epoch": 0.6228942394609253,
"grad_norm": 1.0396721363067627,
"learning_rate": 0.00016134510631494718,
"loss": 0.7974,
"step": 15160
},
{
"epoch": 0.6237159996712959,
"grad_norm": 1.1455570459365845,
"learning_rate": 0.00016123995752085623,
"loss": 0.8384,
"step": 15180
},
{
"epoch": 0.6245377598816665,
"grad_norm": 1.1126292943954468,
"learning_rate": 0.00016113470027551442,
"loss": 0.8595,
"step": 15200
},
{
"epoch": 0.6253595200920371,
"grad_norm": 1.1759933233261108,
"learning_rate": 0.00016102933476532427,
"loss": 0.8161,
"step": 15220
},
{
"epoch": 0.6261812803024077,
"grad_norm": 1.1573827266693115,
"learning_rate": 0.00016092386117687996,
"loss": 0.8321,
"step": 15240
},
{
"epoch": 0.6270030405127783,
"grad_norm": 1.1900019645690918,
"learning_rate": 0.00016081827969696713,
"loss": 0.825,
"step": 15260
},
{
"epoch": 0.627824800723149,
"grad_norm": 1.1662086248397827,
"learning_rate": 0.0001607125905125624,
"loss": 0.8244,
"step": 15280
},
{
"epoch": 0.6286465609335196,
"grad_norm": 1.0952377319335938,
"learning_rate": 0.00016060679381083324,
"loss": 0.8183,
"step": 15300
},
{
"epoch": 0.6294683211438902,
"grad_norm": 1.1535894870758057,
"learning_rate": 0.00016050088977913744,
"loss": 0.8147,
"step": 15320
},
{
"epoch": 0.6302900813542608,
"grad_norm": 1.3305004835128784,
"learning_rate": 0.0001603948786050229,
"loss": 0.8172,
"step": 15340
},
{
"epoch": 0.6311118415646314,
"grad_norm": 1.3150241374969482,
"learning_rate": 0.0001602887604762272,
"loss": 0.8058,
"step": 15360
},
{
"epoch": 0.631933601775002,
"grad_norm": 1.2193336486816406,
"learning_rate": 0.00016018253558067744,
"loss": 0.8165,
"step": 15380
},
{
"epoch": 0.6327553619853726,
"grad_norm": 1.2560005187988281,
"learning_rate": 0.0001600762041064897,
"loss": 0.8206,
"step": 15400
},
{
"epoch": 0.6335771221957432,
"grad_norm": 1.3278276920318604,
"learning_rate": 0.00015996976624196884,
"loss": 0.8178,
"step": 15420
},
{
"epoch": 0.6343988824061139,
"grad_norm": 1.2090293169021606,
"learning_rate": 0.00015986322217560815,
"loss": 0.8443,
"step": 15440
},
{
"epoch": 0.6352206426164845,
"grad_norm": 1.2651256322860718,
"learning_rate": 0.00015975657209608895,
"loss": 0.7986,
"step": 15460
},
{
"epoch": 0.6360424028268551,
"grad_norm": 1.229529619216919,
"learning_rate": 0.00015964981619228028,
"loss": 0.8127,
"step": 15480
},
{
"epoch": 0.6368641630372257,
"grad_norm": 1.235855221748352,
"learning_rate": 0.00015954295465323866,
"loss": 0.8184,
"step": 15500
},
{
"epoch": 0.6376859232475963,
"grad_norm": 1.1475471258163452,
"learning_rate": 0.00015943598766820772,
"loss": 0.8258,
"step": 15520
},
{
"epoch": 0.6385076834579669,
"grad_norm": 1.4335112571716309,
"learning_rate": 0.00015932891542661768,
"loss": 0.8153,
"step": 15540
},
{
"epoch": 0.6393294436683375,
"grad_norm": 1.1948201656341553,
"learning_rate": 0.00015922173811808526,
"loss": 0.8501,
"step": 15560
},
{
"epoch": 0.6401512038787082,
"grad_norm": 1.1850942373275757,
"learning_rate": 0.00015911445593241318,
"loss": 0.8307,
"step": 15580
},
{
"epoch": 0.6409729640890788,
"grad_norm": 1.2479064464569092,
"learning_rate": 0.00015900706905959,
"loss": 0.812,
"step": 15600
},
{
"epoch": 0.6417947242994494,
"grad_norm": 1.0971261262893677,
"learning_rate": 0.00015889957768978956,
"loss": 0.8059,
"step": 15620
},
{
"epoch": 0.64261648450982,
"grad_norm": 1.409441351890564,
"learning_rate": 0.0001587919820133708,
"loss": 0.806,
"step": 15640
},
{
"epoch": 0.6434382447201906,
"grad_norm": 1.1296477317810059,
"learning_rate": 0.00015868428222087745,
"loss": 0.8279,
"step": 15660
},
{
"epoch": 0.6442600049305612,
"grad_norm": 1.2360674142837524,
"learning_rate": 0.00015857647850303744,
"loss": 0.8098,
"step": 15680
},
{
"epoch": 0.6450817651409319,
"grad_norm": 1.2544413805007935,
"learning_rate": 0.00015846857105076295,
"loss": 0.8236,
"step": 15700
},
{
"epoch": 0.6459035253513025,
"grad_norm": 0.9945731163024902,
"learning_rate": 0.0001583605600551497,
"loss": 0.8175,
"step": 15720
},
{
"epoch": 0.6467252855616731,
"grad_norm": 1.3537676334381104,
"learning_rate": 0.00015825244570747695,
"loss": 0.807,
"step": 15740
},
{
"epoch": 0.6475470457720437,
"grad_norm": 1.1799793243408203,
"learning_rate": 0.0001581442281992068,
"loss": 0.8446,
"step": 15760
},
{
"epoch": 0.6483688059824143,
"grad_norm": 1.1959508657455444,
"learning_rate": 0.00015803590772198417,
"loss": 0.8156,
"step": 15780
},
{
"epoch": 0.6491905661927849,
"grad_norm": 1.1003177165985107,
"learning_rate": 0.0001579274844676362,
"loss": 0.8295,
"step": 15800
},
{
"epoch": 0.6500123264031555,
"grad_norm": 1.1961150169372559,
"learning_rate": 0.00015781895862817226,
"loss": 0.8428,
"step": 15820
},
{
"epoch": 0.6508340866135262,
"grad_norm": 1.3239089250564575,
"learning_rate": 0.0001577103303957832,
"loss": 0.8322,
"step": 15840
},
{
"epoch": 0.6516558468238968,
"grad_norm": 1.190565586090088,
"learning_rate": 0.0001576015999628412,
"loss": 0.8332,
"step": 15860
},
{
"epoch": 0.6524776070342674,
"grad_norm": 1.158389925956726,
"learning_rate": 0.0001574927675218996,
"loss": 0.8065,
"step": 15880
},
{
"epoch": 0.653299367244638,
"grad_norm": 1.1195902824401855,
"learning_rate": 0.00015738383326569218,
"loss": 0.8145,
"step": 15900
},
{
"epoch": 0.6541211274550086,
"grad_norm": 1.1594524383544922,
"learning_rate": 0.00015727479738713315,
"loss": 0.8094,
"step": 15920
},
{
"epoch": 0.6549428876653792,
"grad_norm": 1.176584005355835,
"learning_rate": 0.0001571656600793167,
"loss": 0.8383,
"step": 15940
},
{
"epoch": 0.6557646478757498,
"grad_norm": 1.188091516494751,
"learning_rate": 0.00015705642153551654,
"loss": 0.8155,
"step": 15960
},
{
"epoch": 0.6565864080861205,
"grad_norm": 1.1001297235488892,
"learning_rate": 0.00015694708194918575,
"loss": 0.8627,
"step": 15980
},
{
"epoch": 0.6574081682964911,
"grad_norm": 1.2960819005966187,
"learning_rate": 0.00015683764151395635,
"loss": 0.8426,
"step": 16000
},
{
"epoch": 0.6574081682964911,
"eval_loss": 1.121155858039856,
"eval_runtime": 16.4628,
"eval_samples_per_second": 159.147,
"eval_steps_per_second": 4.981,
"step": 16000
},
{
"epoch": 0.6582299285068617,
"grad_norm": 1.2257232666015625,
"learning_rate": 0.00015672810042363885,
"loss": 0.8271,
"step": 16020
},
{
"epoch": 0.6590516887172323,
"grad_norm": 1.309381127357483,
"learning_rate": 0.0001566239433327474,
"loss": 0.8404,
"step": 16040
},
{
"epoch": 0.6598734489276029,
"grad_norm": 1.1223074197769165,
"learning_rate": 0.00015651420652313064,
"loss": 0.8329,
"step": 16060
},
{
"epoch": 0.6606952091379735,
"grad_norm": 1.2301430702209473,
"learning_rate": 0.00015640436963120435,
"loss": 0.8332,
"step": 16080
},
{
"epoch": 0.6615169693483441,
"grad_norm": 1.0917423963546753,
"learning_rate": 0.00015629443285148113,
"loss": 0.8068,
"step": 16100
},
{
"epoch": 0.6623387295587148,
"grad_norm": 1.2451525926589966,
"learning_rate": 0.0001561843963786506,
"loss": 0.8214,
"step": 16120
},
{
"epoch": 0.6631604897690854,
"grad_norm": 1.3091555833816528,
"learning_rate": 0.00015607426040757885,
"loss": 0.8122,
"step": 16140
},
{
"epoch": 0.663982249979456,
"grad_norm": 1.1900159120559692,
"learning_rate": 0.00015596402513330822,
"loss": 0.8296,
"step": 16160
},
{
"epoch": 0.6648040101898266,
"grad_norm": 1.2061822414398193,
"learning_rate": 0.00015585369075105693,
"loss": 0.8226,
"step": 16180
},
{
"epoch": 0.6656257704001972,
"grad_norm": 1.3246262073516846,
"learning_rate": 0.00015574325745621866,
"loss": 0.8141,
"step": 16200
},
{
"epoch": 0.6664475306105678,
"grad_norm": 1.1857435703277588,
"learning_rate": 0.00015563272544436222,
"loss": 0.8213,
"step": 16220
},
{
"epoch": 0.6672692908209384,
"grad_norm": 1.2091962099075317,
"learning_rate": 0.00015552209491123136,
"loss": 0.8306,
"step": 16240
},
{
"epoch": 0.6680910510313091,
"grad_norm": 1.2190937995910645,
"learning_rate": 0.00015541136605274423,
"loss": 0.804,
"step": 16260
},
{
"epoch": 0.6689128112416797,
"grad_norm": 1.2466769218444824,
"learning_rate": 0.00015530053906499306,
"loss": 0.8405,
"step": 16280
},
{
"epoch": 0.6697345714520503,
"grad_norm": 1.1477371454238892,
"learning_rate": 0.000155189614144244,
"loss": 0.8193,
"step": 16300
},
{
"epoch": 0.6705563316624209,
"grad_norm": 1.3594835996627808,
"learning_rate": 0.00015508414493801062,
"loss": 0.8351,
"step": 16320
},
{
"epoch": 0.6713780918727915,
"grad_norm": 1.2145652770996094,
"learning_rate": 0.00015497302961308234,
"loss": 0.8394,
"step": 16340
},
{
"epoch": 0.6721998520831621,
"grad_norm": 1.1144131422042847,
"learning_rate": 0.00015486181693515012,
"loss": 0.8403,
"step": 16360
},
{
"epoch": 0.6730216122935327,
"grad_norm": 1.1824201345443726,
"learning_rate": 0.0001547505071011631,
"loss": 0.7953,
"step": 16380
},
{
"epoch": 0.6738433725039034,
"grad_norm": 1.1716006994247437,
"learning_rate": 0.00015463910030824243,
"loss": 0.8262,
"step": 16400
},
{
"epoch": 0.674665132714274,
"grad_norm": 1.243807315826416,
"learning_rate": 0.00015452759675368093,
"loss": 0.8239,
"step": 16420
},
{
"epoch": 0.6754868929246446,
"grad_norm": 1.235845923423767,
"learning_rate": 0.00015441599663494287,
"loss": 0.8226,
"step": 16440
},
{
"epoch": 0.6763086531350152,
"grad_norm": 1.1748180389404297,
"learning_rate": 0.0001543043001496634,
"loss": 0.8161,
"step": 16460
},
{
"epoch": 0.6771304133453858,
"grad_norm": 1.1699196100234985,
"learning_rate": 0.00015419250749564841,
"loss": 0.8131,
"step": 16480
},
{
"epoch": 0.6779521735557564,
"grad_norm": 1.238051176071167,
"learning_rate": 0.00015408061887087416,
"loss": 0.8475,
"step": 16500
},
{
"epoch": 0.678773933766127,
"grad_norm": 1.3277729749679565,
"learning_rate": 0.0001539686344734867,
"loss": 0.8217,
"step": 16520
},
{
"epoch": 0.6795956939764977,
"grad_norm": 1.1437911987304688,
"learning_rate": 0.00015385655450180185,
"loss": 0.8305,
"step": 16540
},
{
"epoch": 0.6804174541868683,
"grad_norm": 1.2318732738494873,
"learning_rate": 0.00015374437915430456,
"loss": 0.8295,
"step": 16560
},
{
"epoch": 0.6812392143972389,
"grad_norm": 1.1004635095596313,
"learning_rate": 0.00015363210862964874,
"loss": 0.8028,
"step": 16580
},
{
"epoch": 0.6820609746076095,
"grad_norm": 0.9944893717765808,
"learning_rate": 0.00015351974312665685,
"loss": 0.8375,
"step": 16600
},
{
"epoch": 0.6828827348179801,
"grad_norm": 1.3072353601455688,
"learning_rate": 0.00015340728284431957,
"loss": 0.8113,
"step": 16620
},
{
"epoch": 0.6837044950283507,
"grad_norm": 1.173128604888916,
"learning_rate": 0.0001532947279817954,
"loss": 0.8208,
"step": 16640
},
{
"epoch": 0.6845262552387213,
"grad_norm": 1.1952176094055176,
"learning_rate": 0.0001531820787384103,
"loss": 0.8139,
"step": 16660
},
{
"epoch": 0.685348015449092,
"grad_norm": 1.189226508140564,
"learning_rate": 0.00015306933531365746,
"loss": 0.837,
"step": 16680
},
{
"epoch": 0.6861697756594626,
"grad_norm": 1.0701826810836792,
"learning_rate": 0.0001529564979071968,
"loss": 0.8424,
"step": 16700
},
{
"epoch": 0.6869915358698332,
"grad_norm": 1.2586934566497803,
"learning_rate": 0.00015284356671885465,
"loss": 0.81,
"step": 16720
},
{
"epoch": 0.6878132960802038,
"grad_norm": 1.2510169744491577,
"learning_rate": 0.00015273054194862344,
"loss": 0.8353,
"step": 16740
},
{
"epoch": 0.6886350562905744,
"grad_norm": 1.2519487142562866,
"learning_rate": 0.00015261742379666138,
"loss": 0.8164,
"step": 16760
},
{
"epoch": 0.689456816500945,
"grad_norm": 1.2734975814819336,
"learning_rate": 0.000152504212463292,
"loss": 0.8136,
"step": 16780
},
{
"epoch": 0.6902785767113157,
"grad_norm": 1.4471458196640015,
"learning_rate": 0.00015239090814900386,
"loss": 0.814,
"step": 16800
},
{
"epoch": 0.6911003369216863,
"grad_norm": 1.1526660919189453,
"learning_rate": 0.00015227751105445017,
"loss": 0.8106,
"step": 16820
},
{
"epoch": 0.6919220971320569,
"grad_norm": 1.2432206869125366,
"learning_rate": 0.00015216402138044843,
"loss": 0.7911,
"step": 16840
},
{
"epoch": 0.6927438573424275,
"grad_norm": 1.3356374502182007,
"learning_rate": 0.00015205043932798015,
"loss": 0.8157,
"step": 16860
},
{
"epoch": 0.6935656175527981,
"grad_norm": 1.105976939201355,
"learning_rate": 0.00015193676509819043,
"loss": 0.8088,
"step": 16880
},
{
"epoch": 0.6943873777631687,
"grad_norm": 1.2425799369812012,
"learning_rate": 0.0001518229988923875,
"loss": 0.8318,
"step": 16900
},
{
"epoch": 0.6952091379735393,
"grad_norm": 1.1508738994598389,
"learning_rate": 0.00015171483598759317,
"loss": 0.84,
"step": 16920
},
{
"epoch": 0.69603089818391,
"grad_norm": 1.237749695777893,
"learning_rate": 0.00015160089100819412,
"loss": 0.823,
"step": 16940
},
{
"epoch": 0.6968526583942806,
"grad_norm": 1.1718213558197021,
"learning_rate": 0.00015148685464758913,
"loss": 0.7944,
"step": 16960
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.207578420639038,
"learning_rate": 0.0001513727271077277,
"loss": 0.8068,
"step": 16980
},
{
"epoch": 0.6984961788150218,
"grad_norm": 1.180071473121643,
"learning_rate": 0.00015125850859072098,
"loss": 0.8302,
"step": 17000
},
{
"epoch": 0.6993179390253924,
"grad_norm": 1.2875052690505981,
"learning_rate": 0.00015114419929884116,
"loss": 0.8382,
"step": 17020
},
{
"epoch": 0.700139699235763,
"grad_norm": 1.3213404417037964,
"learning_rate": 0.0001510297994345212,
"loss": 0.8196,
"step": 17040
},
{
"epoch": 0.7009614594461336,
"grad_norm": 1.2812501192092896,
"learning_rate": 0.00015091530920035445,
"loss": 0.85,
"step": 17060
},
{
"epoch": 0.7017832196565043,
"grad_norm": 1.122597336769104,
"learning_rate": 0.0001508007287990943,
"loss": 0.8363,
"step": 17080
},
{
"epoch": 0.7026049798668749,
"grad_norm": 1.1828125715255737,
"learning_rate": 0.0001506860584336538,
"loss": 0.8217,
"step": 17100
},
{
"epoch": 0.7034267400772455,
"grad_norm": 1.187536358833313,
"learning_rate": 0.00015057129830710542,
"loss": 0.826,
"step": 17120
},
{
"epoch": 0.7042485002876161,
"grad_norm": 1.1405049562454224,
"learning_rate": 0.00015045644862268044,
"loss": 0.8035,
"step": 17140
},
{
"epoch": 0.7050702604979867,
"grad_norm": 1.255892276763916,
"learning_rate": 0.0001503415095837688,
"loss": 0.8251,
"step": 17160
},
{
"epoch": 0.7058920207083573,
"grad_norm": 1.1409162282943726,
"learning_rate": 0.00015022648139391875,
"loss": 0.8049,
"step": 17180
},
{
"epoch": 0.7067137809187279,
"grad_norm": 1.1684703826904297,
"learning_rate": 0.00015011136425683628,
"loss": 0.8166,
"step": 17200
},
{
"epoch": 0.7075355411290986,
"grad_norm": 1.2653696537017822,
"learning_rate": 0.00014999615837638506,
"loss": 0.779,
"step": 17220
},
{
"epoch": 0.7083573013394692,
"grad_norm": 1.2622226476669312,
"learning_rate": 0.0001498808639565858,
"loss": 0.8285,
"step": 17240
},
{
"epoch": 0.7091790615498398,
"grad_norm": 1.152935266494751,
"learning_rate": 0.00014976548120161607,
"loss": 0.7834,
"step": 17260
},
{
"epoch": 0.7100008217602104,
"grad_norm": 1.2912839651107788,
"learning_rate": 0.0001496500103158098,
"loss": 0.8015,
"step": 17280
},
{
"epoch": 0.710822581970581,
"grad_norm": 1.2247825860977173,
"learning_rate": 0.00014953445150365705,
"loss": 0.8121,
"step": 17300
},
{
"epoch": 0.7116443421809516,
"grad_norm": 1.2684624195098877,
"learning_rate": 0.00014941880496980358,
"loss": 0.8203,
"step": 17320
},
{
"epoch": 0.7124661023913222,
"grad_norm": 1.2209104299545288,
"learning_rate": 0.00014930307091905057,
"loss": 0.8254,
"step": 17340
},
{
"epoch": 0.7132878626016929,
"grad_norm": 1.3020516633987427,
"learning_rate": 0.000149187249556354,
"loss": 0.8342,
"step": 17360
},
{
"epoch": 0.7141096228120635,
"grad_norm": 1.1557644605636597,
"learning_rate": 0.00014907134108682466,
"loss": 0.8199,
"step": 17380
},
{
"epoch": 0.7149313830224341,
"grad_norm": 1.177384853363037,
"learning_rate": 0.00014895534571572754,
"loss": 0.8406,
"step": 17400
},
{
"epoch": 0.7157531432328047,
"grad_norm": 1.2047346830368042,
"learning_rate": 0.0001488392636484815,
"loss": 0.8091,
"step": 17420
},
{
"epoch": 0.7165749034431753,
"grad_norm": 1.171870231628418,
"learning_rate": 0.00014872309509065886,
"loss": 0.7942,
"step": 17440
},
{
"epoch": 0.7173966636535459,
"grad_norm": 1.1541531085968018,
"learning_rate": 0.00014860684024798536,
"loss": 0.7921,
"step": 17460
},
{
"epoch": 0.7182184238639165,
"grad_norm": 1.1826390027999878,
"learning_rate": 0.0001484904993263392,
"loss": 0.8068,
"step": 17480
},
{
"epoch": 0.7190401840742872,
"grad_norm": 1.1863442659378052,
"learning_rate": 0.0001483740725317513,
"loss": 0.8347,
"step": 17500
},
{
"epoch": 0.7198619442846578,
"grad_norm": 1.2402416467666626,
"learning_rate": 0.00014825756007040458,
"loss": 0.8276,
"step": 17520
},
{
"epoch": 0.7206837044950284,
"grad_norm": 1.2695879936218262,
"learning_rate": 0.00014814096214863355,
"loss": 0.8035,
"step": 17540
},
{
"epoch": 0.721505464705399,
"grad_norm": 1.1038098335266113,
"learning_rate": 0.0001480242789729242,
"loss": 0.8131,
"step": 17560
},
{
"epoch": 0.7223272249157696,
"grad_norm": 1.3015015125274658,
"learning_rate": 0.0001479075107499135,
"loss": 0.7688,
"step": 17580
},
{
"epoch": 0.7231489851261402,
"grad_norm": 1.1493345499038696,
"learning_rate": 0.00014779065768638888,
"loss": 0.8167,
"step": 17600
},
{
"epoch": 0.7239707453365108,
"grad_norm": 1.1382313966751099,
"learning_rate": 0.0001476737199892882,
"loss": 0.8435,
"step": 17620
},
{
"epoch": 0.7247925055468815,
"grad_norm": 1.2679277658462524,
"learning_rate": 0.0001475566978656991,
"loss": 0.8304,
"step": 17640
},
{
"epoch": 0.7256142657572521,
"grad_norm": 1.2502251863479614,
"learning_rate": 0.0001474395915228587,
"loss": 0.8252,
"step": 17660
},
{
"epoch": 0.7264360259676227,
"grad_norm": 1.1250088214874268,
"learning_rate": 0.00014732240116815343,
"loss": 0.8166,
"step": 17680
},
{
"epoch": 0.7272577861779933,
"grad_norm": 1.1370155811309814,
"learning_rate": 0.0001472051270091183,
"loss": 0.7896,
"step": 17700
},
{
"epoch": 0.7280795463883639,
"grad_norm": 1.2608290910720825,
"learning_rate": 0.00014708776925343684,
"loss": 0.8028,
"step": 17720
},
{
"epoch": 0.7289013065987345,
"grad_norm": 1.3024847507476807,
"learning_rate": 0.00014697032810894064,
"loss": 0.7981,
"step": 17740
},
{
"epoch": 0.7297230668091051,
"grad_norm": 1.1868743896484375,
"learning_rate": 0.00014685280378360884,
"loss": 0.7949,
"step": 17760
},
{
"epoch": 0.7305448270194758,
"grad_norm": 1.253355622291565,
"learning_rate": 0.00014673519648556805,
"loss": 0.8107,
"step": 17780
},
{
"epoch": 0.7313665872298464,
"grad_norm": 1.2940136194229126,
"learning_rate": 0.00014661750642309173,
"loss": 0.8017,
"step": 17800
},
{
"epoch": 0.732188347440217,
"grad_norm": 1.2385964393615723,
"learning_rate": 0.0001464997338045999,
"loss": 0.8055,
"step": 17820
},
{
"epoch": 0.7330101076505876,
"grad_norm": 1.2589930295944214,
"learning_rate": 0.0001463818788386588,
"loss": 0.7795,
"step": 17840
},
{
"epoch": 0.7338318678609582,
"grad_norm": 1.0926438570022583,
"learning_rate": 0.00014626394173398056,
"loss": 0.8202,
"step": 17860
},
{
"epoch": 0.7346536280713288,
"grad_norm": 1.1935014724731445,
"learning_rate": 0.00014614592269942262,
"loss": 0.7994,
"step": 17880
},
{
"epoch": 0.7354753882816994,
"grad_norm": 1.1052745580673218,
"learning_rate": 0.0001460278219439877,
"loss": 0.8283,
"step": 17900
},
{
"epoch": 0.7362971484920701,
"grad_norm": 1.2212902307510376,
"learning_rate": 0.00014590963967682304,
"loss": 0.7937,
"step": 17920
},
{
"epoch": 0.7371189087024407,
"grad_norm": 1.2168195247650146,
"learning_rate": 0.00014579137610722044,
"loss": 0.7938,
"step": 17940
},
{
"epoch": 0.7379406689128113,
"grad_norm": 1.1712826490402222,
"learning_rate": 0.00014567303144461552,
"loss": 0.8279,
"step": 17960
},
{
"epoch": 0.7387624291231819,
"grad_norm": 1.2737464904785156,
"learning_rate": 0.0001455546058985876,
"loss": 0.8144,
"step": 17980
},
{
"epoch": 0.7395841893335525,
"grad_norm": 1.2962466478347778,
"learning_rate": 0.0001454360996788592,
"loss": 0.8398,
"step": 18000
},
{
"epoch": 0.7395841893335525,
"eval_loss": 1.0920464992523193,
"eval_runtime": 16.6449,
"eval_samples_per_second": 157.406,
"eval_steps_per_second": 4.926,
"step": 18000
},
{
"epoch": 0.7404059495439231,
"grad_norm": 1.1804122924804688,
"learning_rate": 0.0001453175129952957,
"loss": 0.8203,
"step": 18020
},
{
"epoch": 0.7412277097542938,
"grad_norm": 1.0877560377120972,
"learning_rate": 0.00014519884605790497,
"loss": 0.7971,
"step": 18040
},
{
"epoch": 0.7420494699646644,
"grad_norm": 1.2490891218185425,
"learning_rate": 0.00014508009907683705,
"loss": 0.8479,
"step": 18060
},
{
"epoch": 0.742871230175035,
"grad_norm": 1.1328068971633911,
"learning_rate": 0.0001449612722623837,
"loss": 0.8062,
"step": 18080
},
{
"epoch": 0.7436929903854055,
"grad_norm": 1.2985849380493164,
"learning_rate": 0.00014484236582497806,
"loss": 0.8267,
"step": 18100
},
{
"epoch": 0.7445147505957761,
"grad_norm": 1.1852083206176758,
"learning_rate": 0.00014472337997519432,
"loss": 0.8416,
"step": 18120
},
{
"epoch": 0.7453365108061467,
"grad_norm": 1.0732208490371704,
"learning_rate": 0.0001446043149237472,
"loss": 0.7889,
"step": 18140
},
{
"epoch": 0.7461582710165173,
"grad_norm": 1.331568717956543,
"learning_rate": 0.00014448517088149176,
"loss": 0.7988,
"step": 18160
},
{
"epoch": 0.7469800312268879,
"grad_norm": 1.2419168949127197,
"learning_rate": 0.00014436594805942288,
"loss": 0.8073,
"step": 18180
},
{
"epoch": 0.7478017914372586,
"grad_norm": 1.2016359567642212,
"learning_rate": 0.0001442466466686751,
"loss": 0.8104,
"step": 18200
},
{
"epoch": 0.7486235516476292,
"grad_norm": 1.0315485000610352,
"learning_rate": 0.00014412726692052195,
"loss": 0.8303,
"step": 18220
},
{
"epoch": 0.7494453118579998,
"grad_norm": 1.1660343408584595,
"learning_rate": 0.00014400780902637574,
"loss": 0.8143,
"step": 18240
},
{
"epoch": 0.7502670720683704,
"grad_norm": 1.1398091316223145,
"learning_rate": 0.00014388827319778723,
"loss": 0.8144,
"step": 18260
},
{
"epoch": 0.751088832278741,
"grad_norm": 1.2301084995269775,
"learning_rate": 0.00014376865964644522,
"loss": 0.8116,
"step": 18280
},
{
"epoch": 0.7519105924891116,
"grad_norm": 1.159977674484253,
"learning_rate": 0.00014364896858417607,
"loss": 0.8183,
"step": 18300
},
{
"epoch": 0.7527323526994822,
"grad_norm": 1.3030027151107788,
"learning_rate": 0.0001435292002229434,
"loss": 0.8133,
"step": 18320
},
{
"epoch": 0.7535541129098529,
"grad_norm": 1.2161463499069214,
"learning_rate": 0.00014340935477484781,
"loss": 0.8049,
"step": 18340
},
{
"epoch": 0.7543758731202235,
"grad_norm": 1.2423648834228516,
"learning_rate": 0.0001432894324521263,
"loss": 0.8061,
"step": 18360
},
{
"epoch": 0.7551976333305941,
"grad_norm": 1.2223330736160278,
"learning_rate": 0.00014316943346715216,
"loss": 0.8098,
"step": 18380
},
{
"epoch": 0.7560193935409647,
"grad_norm": 1.2927947044372559,
"learning_rate": 0.00014304935803243429,
"loss": 0.7944,
"step": 18400
},
{
"epoch": 0.7568411537513353,
"grad_norm": 1.2604374885559082,
"learning_rate": 0.00014292920636061704,
"loss": 0.8082,
"step": 18420
},
{
"epoch": 0.7576629139617059,
"grad_norm": 1.2504435777664185,
"learning_rate": 0.00014280897866447985,
"loss": 0.8105,
"step": 18440
},
{
"epoch": 0.7584846741720765,
"grad_norm": 1.2936044931411743,
"learning_rate": 0.00014268867515693662,
"loss": 0.7844,
"step": 18460
},
{
"epoch": 0.7593064343824472,
"grad_norm": 1.1824711561203003,
"learning_rate": 0.00014256829605103564,
"loss": 0.8227,
"step": 18480
},
{
"epoch": 0.7601281945928178,
"grad_norm": 1.300419807434082,
"learning_rate": 0.00014244784155995906,
"loss": 0.8513,
"step": 18500
},
{
"epoch": 0.7609499548031884,
"grad_norm": 1.376930832862854,
"learning_rate": 0.00014232731189702249,
"loss": 0.8247,
"step": 18520
},
{
"epoch": 0.761771715013559,
"grad_norm": 1.229896903038025,
"learning_rate": 0.00014220670727567466,
"loss": 0.7918,
"step": 18540
},
{
"epoch": 0.7625934752239296,
"grad_norm": 1.1824287176132202,
"learning_rate": 0.00014208602790949715,
"loss": 0.8236,
"step": 18560
},
{
"epoch": 0.7634152354343002,
"grad_norm": 1.1316356658935547,
"learning_rate": 0.00014196527401220374,
"loss": 0.8122,
"step": 18580
},
{
"epoch": 0.7642369956446708,
"grad_norm": 1.1641656160354614,
"learning_rate": 0.00014184444579764036,
"loss": 0.7887,
"step": 18600
},
{
"epoch": 0.7650587558550415,
"grad_norm": 1.2354685068130493,
"learning_rate": 0.0001417235434797844,
"loss": 0.8223,
"step": 18620
},
{
"epoch": 0.7658805160654121,
"grad_norm": 1.108034372329712,
"learning_rate": 0.00014160256727274462,
"loss": 0.8055,
"step": 18640
},
{
"epoch": 0.7667022762757827,
"grad_norm": 1.3176486492156982,
"learning_rate": 0.00014148151739076055,
"loss": 0.829,
"step": 18660
},
{
"epoch": 0.7675240364861533,
"grad_norm": 1.0911662578582764,
"learning_rate": 0.0001413603940482022,
"loss": 0.7978,
"step": 18680
},
{
"epoch": 0.7683457966965239,
"grad_norm": 1.0955551862716675,
"learning_rate": 0.0001412391974595697,
"loss": 0.8174,
"step": 18700
},
{
"epoch": 0.7691675569068945,
"grad_norm": 1.1460391283035278,
"learning_rate": 0.0001411179278394929,
"loss": 0.8279,
"step": 18720
},
{
"epoch": 0.7699893171172651,
"grad_norm": 1.1242876052856445,
"learning_rate": 0.00014099658540273096,
"loss": 0.8248,
"step": 18740
},
{
"epoch": 0.7708110773276358,
"grad_norm": 1.1270663738250732,
"learning_rate": 0.00014087517036417196,
"loss": 0.8211,
"step": 18760
},
{
"epoch": 0.7716328375380064,
"grad_norm": 1.1673957109451294,
"learning_rate": 0.0001407536829388326,
"loss": 0.8119,
"step": 18780
},
{
"epoch": 0.772454597748377,
"grad_norm": 1.123095989227295,
"learning_rate": 0.00014063212334185774,
"loss": 0.8095,
"step": 18800
},
{
"epoch": 0.7732763579587476,
"grad_norm": 1.0859897136688232,
"learning_rate": 0.0001405104917885201,
"loss": 0.7797,
"step": 18820
},
{
"epoch": 0.7740981181691182,
"grad_norm": 1.2377300262451172,
"learning_rate": 0.0001403887884942198,
"loss": 0.8273,
"step": 18840
},
{
"epoch": 0.7749198783794888,
"grad_norm": 1.2705551385879517,
"learning_rate": 0.000140267013674484,
"loss": 0.8051,
"step": 18860
},
{
"epoch": 0.7757416385898595,
"grad_norm": 1.319014072418213,
"learning_rate": 0.00014014516754496656,
"loss": 0.8107,
"step": 18880
},
{
"epoch": 0.7765633988002301,
"grad_norm": 1.3769927024841309,
"learning_rate": 0.00014002325032144754,
"loss": 0.7844,
"step": 18900
},
{
"epoch": 0.7773851590106007,
"grad_norm": 1.3221933841705322,
"learning_rate": 0.00013990126221983298,
"loss": 0.8131,
"step": 18920
},
{
"epoch": 0.7782069192209713,
"grad_norm": 1.2304918766021729,
"learning_rate": 0.00013977920345615444,
"loss": 0.8039,
"step": 18940
},
{
"epoch": 0.7790286794313419,
"grad_norm": 1.385318398475647,
"learning_rate": 0.00013965707424656856,
"loss": 0.7985,
"step": 18960
},
{
"epoch": 0.7798504396417125,
"grad_norm": 1.1849255561828613,
"learning_rate": 0.00013953487480735679,
"loss": 0.7902,
"step": 18980
},
{
"epoch": 0.7806721998520831,
"grad_norm": 1.2645635604858398,
"learning_rate": 0.0001394126053549249,
"loss": 0.7931,
"step": 19000
},
{
"epoch": 0.7814939600624538,
"grad_norm": 1.2201372385025024,
"learning_rate": 0.00013929026610580276,
"loss": 0.8244,
"step": 19020
},
{
"epoch": 0.7823157202728244,
"grad_norm": 1.1765888929367065,
"learning_rate": 0.0001391739793672815,
"loss": 0.8012,
"step": 19040
},
{
"epoch": 0.783137480483195,
"grad_norm": 1.0896601676940918,
"learning_rate": 0.0001390515046378748,
"loss": 0.8107,
"step": 19060
},
{
"epoch": 0.7839592406935656,
"grad_norm": 1.125887393951416,
"learning_rate": 0.00013892896075125936,
"loss": 0.8233,
"step": 19080
},
{
"epoch": 0.7847810009039362,
"grad_norm": 1.1168248653411865,
"learning_rate": 0.00013880634792445097,
"loss": 0.8036,
"step": 19100
},
{
"epoch": 0.7856027611143068,
"grad_norm": 1.3335416316986084,
"learning_rate": 0.00013868366637458755,
"loss": 0.8174,
"step": 19120
},
{
"epoch": 0.7864245213246774,
"grad_norm": 1.281420111656189,
"learning_rate": 0.0001385609163189286,
"loss": 0.8116,
"step": 19140
},
{
"epoch": 0.787246281535048,
"grad_norm": 1.2268955707550049,
"learning_rate": 0.00013843809797485515,
"loss": 0.8147,
"step": 19160
},
{
"epoch": 0.7880680417454187,
"grad_norm": 1.2544898986816406,
"learning_rate": 0.0001383152115598689,
"loss": 0.826,
"step": 19180
},
{
"epoch": 0.7888898019557893,
"grad_norm": 1.3066948652267456,
"learning_rate": 0.00013819225729159228,
"loss": 0.8072,
"step": 19200
},
{
"epoch": 0.7897115621661599,
"grad_norm": 1.2460455894470215,
"learning_rate": 0.00013806923538776787,
"loss": 0.7989,
"step": 19220
},
{
"epoch": 0.7905333223765305,
"grad_norm": 1.155281662940979,
"learning_rate": 0.00013794614606625792,
"loss": 0.8405,
"step": 19240
},
{
"epoch": 0.7913550825869011,
"grad_norm": 1.054686427116394,
"learning_rate": 0.00013782914896373142,
"loss": 0.8056,
"step": 19260
},
{
"epoch": 0.7921768427972717,
"grad_norm": 1.3717306852340698,
"learning_rate": 0.00013770592880481307,
"loss": 0.802,
"step": 19280
},
{
"epoch": 0.7929986030076424,
"grad_norm": 1.2226320505142212,
"learning_rate": 0.0001375826418715972,
"loss": 0.813,
"step": 19300
},
{
"epoch": 0.793820363218013,
"grad_norm": 1.250246524810791,
"learning_rate": 0.00013745928838241556,
"loss": 0.8225,
"step": 19320
},
{
"epoch": 0.7946421234283836,
"grad_norm": 1.0490273237228394,
"learning_rate": 0.00013733586855571754,
"loss": 0.7996,
"step": 19340
},
{
"epoch": 0.7954638836387542,
"grad_norm": 1.1633754968643188,
"learning_rate": 0.00013721238261007023,
"loss": 0.8224,
"step": 19360
},
{
"epoch": 0.7962856438491248,
"grad_norm": 1.079055666923523,
"learning_rate": 0.0001370888307641578,
"loss": 0.8207,
"step": 19380
},
{
"epoch": 0.7971074040594954,
"grad_norm": 1.3664897680282593,
"learning_rate": 0.00013696521323678092,
"loss": 0.8015,
"step": 19400
},
{
"epoch": 0.797929164269866,
"grad_norm": 1.1427098512649536,
"learning_rate": 0.00013684153024685685,
"loss": 0.8093,
"step": 19420
},
{
"epoch": 0.7987509244802367,
"grad_norm": 1.1648412942886353,
"learning_rate": 0.00013671778201341855,
"loss": 0.7898,
"step": 19440
},
{
"epoch": 0.7995726846906073,
"grad_norm": 1.1498785018920898,
"learning_rate": 0.00013659396875561467,
"loss": 0.8113,
"step": 19460
},
{
"epoch": 0.8003944449009779,
"grad_norm": 1.1189064979553223,
"learning_rate": 0.00013647009069270893,
"loss": 0.8062,
"step": 19480
},
{
"epoch": 0.8012162051113485,
"grad_norm": 1.225690245628357,
"learning_rate": 0.00013634614804407984,
"loss": 0.7994,
"step": 19500
},
{
"epoch": 0.8020379653217191,
"grad_norm": 1.1157780885696411,
"learning_rate": 0.0001362221410292203,
"loss": 0.7882,
"step": 19520
},
{
"epoch": 0.8028597255320897,
"grad_norm": 1.0617575645446777,
"learning_rate": 0.00013609806986773722,
"loss": 0.8035,
"step": 19540
},
{
"epoch": 0.8036814857424603,
"grad_norm": 1.128070592880249,
"learning_rate": 0.00013597393477935102,
"loss": 0.8095,
"step": 19560
},
{
"epoch": 0.804503245952831,
"grad_norm": 1.1231319904327393,
"learning_rate": 0.00013584973598389544,
"loss": 0.799,
"step": 19580
},
{
"epoch": 0.8053250061632016,
"grad_norm": 1.4038573503494263,
"learning_rate": 0.00013572547370131695,
"loss": 0.7849,
"step": 19600
},
{
"epoch": 0.8061467663735722,
"grad_norm": 1.208066463470459,
"learning_rate": 0.00013560114815167447,
"loss": 0.8127,
"step": 19620
},
{
"epoch": 0.8069685265839428,
"grad_norm": 1.243054747581482,
"learning_rate": 0.00013547675955513904,
"loss": 0.8139,
"step": 19640
},
{
"epoch": 0.8077902867943134,
"grad_norm": 1.1254897117614746,
"learning_rate": 0.00013535230813199324,
"loss": 0.7909,
"step": 19660
},
{
"epoch": 0.808612047004684,
"grad_norm": 1.3242278099060059,
"learning_rate": 0.00013522779410263098,
"loss": 0.8242,
"step": 19680
},
{
"epoch": 0.8094338072150546,
"grad_norm": 1.042738437652588,
"learning_rate": 0.00013510321768755702,
"loss": 0.7948,
"step": 19700
},
{
"epoch": 0.8102555674254253,
"grad_norm": 1.1268333196640015,
"learning_rate": 0.00013497857910738662,
"loss": 0.7923,
"step": 19720
},
{
"epoch": 0.8110773276357959,
"grad_norm": 1.115422010421753,
"learning_rate": 0.00013485387858284506,
"loss": 0.7998,
"step": 19740
},
{
"epoch": 0.8118990878461665,
"grad_norm": 1.2183444499969482,
"learning_rate": 0.0001347291163347674,
"loss": 0.8054,
"step": 19760
},
{
"epoch": 0.8127208480565371,
"grad_norm": 1.158079981803894,
"learning_rate": 0.000134604292584098,
"loss": 0.7716,
"step": 19780
},
{
"epoch": 0.8135426082669077,
"grad_norm": 1.0598976612091064,
"learning_rate": 0.0001344794075518901,
"loss": 0.7967,
"step": 19800
},
{
"epoch": 0.8143643684772783,
"grad_norm": 1.1043282747268677,
"learning_rate": 0.00013435446145930544,
"loss": 0.7848,
"step": 19820
},
{
"epoch": 0.815186128687649,
"grad_norm": 1.2107045650482178,
"learning_rate": 0.00013422945452761398,
"loss": 0.7996,
"step": 19840
},
{
"epoch": 0.8160078888980196,
"grad_norm": 1.1288707256317139,
"learning_rate": 0.00013410438697819337,
"loss": 0.8172,
"step": 19860
},
{
"epoch": 0.8168296491083902,
"grad_norm": 1.215134859085083,
"learning_rate": 0.0001339792590325286,
"loss": 0.8155,
"step": 19880
},
{
"epoch": 0.8176514093187608,
"grad_norm": 1.3704839944839478,
"learning_rate": 0.00013385407091221163,
"loss": 0.7831,
"step": 19900
},
{
"epoch": 0.8184731695291314,
"grad_norm": 1.2613569498062134,
"learning_rate": 0.000133728822838941,
"loss": 0.8154,
"step": 19920
},
{
"epoch": 0.819294929739502,
"grad_norm": 1.2519367933273315,
"learning_rate": 0.00013360351503452137,
"loss": 0.8089,
"step": 19940
},
{
"epoch": 0.8201166899498726,
"grad_norm": 1.259395956993103,
"learning_rate": 0.0001334781477208632,
"loss": 0.8161,
"step": 19960
},
{
"epoch": 0.8209384501602432,
"grad_norm": 1.1715264320373535,
"learning_rate": 0.00013335272111998246,
"loss": 0.8009,
"step": 19980
},
{
"epoch": 0.8217602103706139,
"grad_norm": 1.1847702264785767,
"learning_rate": 0.0001332272354539999,
"loss": 0.774,
"step": 20000
},
{
"epoch": 0.8217602103706139,
"eval_loss": 1.0668244361877441,
"eval_runtime": 16.5724,
"eval_samples_per_second": 158.094,
"eval_steps_per_second": 4.948,
"step": 20000
},
{
"epoch": 0.8225819705809845,
"grad_norm": 1.224970817565918,
"learning_rate": 0.00013310169094514103,
"loss": 0.8,
"step": 20020
},
{
"epoch": 0.8234037307913551,
"grad_norm": 1.1203746795654297,
"learning_rate": 0.0001329760878157355,
"loss": 0.8158,
"step": 20040
},
{
"epoch": 0.8242254910017257,
"grad_norm": 1.1533716917037964,
"learning_rate": 0.00013285042628821675,
"loss": 0.7896,
"step": 20060
},
{
"epoch": 0.8250472512120963,
"grad_norm": 1.213652491569519,
"learning_rate": 0.00013272470658512174,
"loss": 0.8121,
"step": 20080
},
{
"epoch": 0.8258690114224669,
"grad_norm": 1.2149641513824463,
"learning_rate": 0.00013259892892909033,
"loss": 0.7926,
"step": 20100
},
{
"epoch": 0.8266907716328376,
"grad_norm": 1.3824020624160767,
"learning_rate": 0.00013247309354286503,
"loss": 0.7909,
"step": 20120
},
{
"epoch": 0.8275125318432082,
"grad_norm": 1.0057183504104614,
"learning_rate": 0.00013234720064929075,
"loss": 0.819,
"step": 20140
},
{
"epoch": 0.8283342920535788,
"grad_norm": 1.1706757545471191,
"learning_rate": 0.000132221250471314,
"loss": 0.8248,
"step": 20160
},
{
"epoch": 0.8291560522639494,
"grad_norm": 1.0104855298995972,
"learning_rate": 0.00013209524323198294,
"loss": 0.7992,
"step": 20180
},
{
"epoch": 0.82997781247432,
"grad_norm": 1.2821381092071533,
"learning_rate": 0.0001319691791544466,
"loss": 0.802,
"step": 20200
},
{
"epoch": 0.8307995726846906,
"grad_norm": 1.106156826019287,
"learning_rate": 0.0001318430584619548,
"loss": 0.7799,
"step": 20220
},
{
"epoch": 0.8316213328950612,
"grad_norm": 1.1786506175994873,
"learning_rate": 0.0001317168813778576,
"loss": 0.7927,
"step": 20240
},
{
"epoch": 0.8324430931054319,
"grad_norm": 1.280644178390503,
"learning_rate": 0.00013159064812560484,
"loss": 0.7922,
"step": 20260
},
{
"epoch": 0.8332648533158025,
"grad_norm": 1.2000395059585571,
"learning_rate": 0.00013146435892874596,
"loss": 0.8024,
"step": 20280
},
{
"epoch": 0.8340866135261731,
"grad_norm": 1.384141206741333,
"learning_rate": 0.00013134433257673927,
"loss": 0.8428,
"step": 20300
},
{
"epoch": 0.8349083737365437,
"grad_norm": 1.1807432174682617,
"learning_rate": 0.00013121793493125713,
"loss": 0.8039,
"step": 20320
},
{
"epoch": 0.8357301339469143,
"grad_norm": 1.1562411785125732,
"learning_rate": 0.00013109148200121524,
"loss": 0.8043,
"step": 20340
},
{
"epoch": 0.8365518941572849,
"grad_norm": 1.0757108926773071,
"learning_rate": 0.00013096497401055197,
"loss": 0.7933,
"step": 20360
},
{
"epoch": 0.8373736543676555,
"grad_norm": 1.1456866264343262,
"learning_rate": 0.00013083841118330326,
"loss": 0.7793,
"step": 20380
},
{
"epoch": 0.8381954145780262,
"grad_norm": 1.3488410711288452,
"learning_rate": 0.0001307117937436021,
"loss": 0.7775,
"step": 20400
},
{
"epoch": 0.8390171747883968,
"grad_norm": 1.2938168048858643,
"learning_rate": 0.0001305851219156783,
"loss": 0.8144,
"step": 20420
},
{
"epoch": 0.8398389349987674,
"grad_norm": 1.131605625152588,
"learning_rate": 0.00013045839592385787,
"loss": 0.7883,
"step": 20440
},
{
"epoch": 0.840660695209138,
"grad_norm": 1.1760270595550537,
"learning_rate": 0.00013033161599256275,
"loss": 0.7984,
"step": 20460
},
{
"epoch": 0.8414824554195086,
"grad_norm": 1.1592392921447754,
"learning_rate": 0.00013020478234631049,
"loss": 0.8142,
"step": 20480
},
{
"epoch": 0.8423042156298792,
"grad_norm": 1.3598871231079102,
"learning_rate": 0.00013007789520971374,
"loss": 0.7861,
"step": 20500
},
{
"epoch": 0.8431259758402498,
"grad_norm": 1.2098135948181152,
"learning_rate": 0.00012995095480747976,
"loss": 0.8069,
"step": 20520
},
{
"epoch": 0.8439477360506205,
"grad_norm": 1.0988825559616089,
"learning_rate": 0.00012982396136441033,
"loss": 0.7971,
"step": 20540
},
{
"epoch": 0.8447694962609911,
"grad_norm": 0.9486870765686035,
"learning_rate": 0.000129696915105401,
"loss": 0.7807,
"step": 20560
},
{
"epoch": 0.8455912564713617,
"grad_norm": 1.1476234197616577,
"learning_rate": 0.00012956981625544093,
"loss": 0.7884,
"step": 20580
},
{
"epoch": 0.8464130166817323,
"grad_norm": 1.1225764751434326,
"learning_rate": 0.00012944266503961242,
"loss": 0.7966,
"step": 20600
},
{
"epoch": 0.8472347768921029,
"grad_norm": 1.316215991973877,
"learning_rate": 0.00012931546168309046,
"loss": 0.8115,
"step": 20620
},
{
"epoch": 0.8480565371024735,
"grad_norm": 1.1493761539459229,
"learning_rate": 0.0001291882064111424,
"loss": 0.8094,
"step": 20640
},
{
"epoch": 0.8488782973128441,
"grad_norm": 1.1531728506088257,
"learning_rate": 0.00012906089944912755,
"loss": 0.8158,
"step": 20660
},
{
"epoch": 0.8497000575232148,
"grad_norm": 1.1480746269226074,
"learning_rate": 0.00012893354102249673,
"loss": 0.8085,
"step": 20680
},
{
"epoch": 0.8505218177335854,
"grad_norm": 1.265832543373108,
"learning_rate": 0.00012880613135679193,
"loss": 0.803,
"step": 20700
},
{
"epoch": 0.851343577943956,
"grad_norm": 1.0262377262115479,
"learning_rate": 0.00012867867067764584,
"loss": 0.774,
"step": 20720
},
{
"epoch": 0.8521653381543266,
"grad_norm": 1.136478066444397,
"learning_rate": 0.00012855115921078152,
"loss": 0.779,
"step": 20740
},
{
"epoch": 0.8529870983646972,
"grad_norm": 1.1349806785583496,
"learning_rate": 0.00012842359718201194,
"loss": 0.7922,
"step": 20760
},
{
"epoch": 0.8538088585750678,
"grad_norm": 1.2569667100906372,
"learning_rate": 0.00012829598481723964,
"loss": 0.7739,
"step": 20780
},
{
"epoch": 0.8546306187854384,
"grad_norm": 1.2146037817001343,
"learning_rate": 0.00012816832234245634,
"loss": 0.8221,
"step": 20800
},
{
"epoch": 0.8554523789958091,
"grad_norm": 1.300690770149231,
"learning_rate": 0.00012804060998374245,
"loss": 0.7928,
"step": 20820
},
{
"epoch": 0.8562741392061797,
"grad_norm": 1.0861823558807373,
"learning_rate": 0.00012791284796726663,
"loss": 0.8043,
"step": 20840
},
{
"epoch": 0.8570958994165503,
"grad_norm": 1.2741750478744507,
"learning_rate": 0.0001277850365192857,
"loss": 0.8065,
"step": 20860
},
{
"epoch": 0.8579176596269209,
"grad_norm": 1.0767971277236938,
"learning_rate": 0.00012765717586614382,
"loss": 0.8153,
"step": 20880
},
{
"epoch": 0.8587394198372915,
"grad_norm": 1.1135772466659546,
"learning_rate": 0.0001275292662342724,
"loss": 0.7685,
"step": 20900
},
{
"epoch": 0.8595611800476621,
"grad_norm": 1.2278940677642822,
"learning_rate": 0.0001274013078501895,
"loss": 0.7635,
"step": 20920
},
{
"epoch": 0.8603829402580327,
"grad_norm": 1.0262848138809204,
"learning_rate": 0.00012727330094049967,
"loss": 0.7907,
"step": 20940
},
{
"epoch": 0.8612047004684034,
"grad_norm": 1.2303491830825806,
"learning_rate": 0.0001271452457318932,
"loss": 0.8026,
"step": 20960
},
{
"epoch": 0.862026460678774,
"grad_norm": 1.1088433265686035,
"learning_rate": 0.00012701714245114603,
"loss": 0.7966,
"step": 20980
},
{
"epoch": 0.8628482208891446,
"grad_norm": 1.1193281412124634,
"learning_rate": 0.00012688899132511924,
"loss": 0.7924,
"step": 21000
},
{
"epoch": 0.8636699810995152,
"grad_norm": 1.308245301246643,
"learning_rate": 0.00012676079258075858,
"loss": 0.7885,
"step": 21020
},
{
"epoch": 0.8644917413098858,
"grad_norm": 1.2952789068222046,
"learning_rate": 0.0001266325464450942,
"loss": 0.7913,
"step": 21040
},
{
"epoch": 0.8653135015202564,
"grad_norm": 1.1136795282363892,
"learning_rate": 0.00012650425314524006,
"loss": 0.7927,
"step": 21060
},
{
"epoch": 0.866135261730627,
"grad_norm": 1.222264051437378,
"learning_rate": 0.00012637591290839376,
"loss": 0.8171,
"step": 21080
},
{
"epoch": 0.8669570219409977,
"grad_norm": 1.3129292726516724,
"learning_rate": 0.000126247525961836,
"loss": 0.786,
"step": 21100
},
{
"epoch": 0.8677787821513683,
"grad_norm": 1.15342116355896,
"learning_rate": 0.00012611909253293016,
"loss": 0.7821,
"step": 21120
},
{
"epoch": 0.8686005423617389,
"grad_norm": 1.2096205949783325,
"learning_rate": 0.00012599061284912193,
"loss": 0.7886,
"step": 21140
},
{
"epoch": 0.8694223025721095,
"grad_norm": 1.0738475322723389,
"learning_rate": 0.00012586208713793898,
"loss": 0.7813,
"step": 21160
},
{
"epoch": 0.8702440627824801,
"grad_norm": 1.1030628681182861,
"learning_rate": 0.00012573351562699048,
"loss": 0.7859,
"step": 21180
},
{
"epoch": 0.8710658229928507,
"grad_norm": 1.0959700345993042,
"learning_rate": 0.0001256048985439666,
"loss": 0.8055,
"step": 21200
},
{
"epoch": 0.8718875832032214,
"grad_norm": 1.2536059617996216,
"learning_rate": 0.00012547623611663836,
"loss": 0.7837,
"step": 21220
},
{
"epoch": 0.872709343413592,
"grad_norm": 1.2621644735336304,
"learning_rate": 0.00012534752857285692,
"loss": 0.7977,
"step": 21240
},
{
"epoch": 0.8735311036239626,
"grad_norm": 1.1967788934707642,
"learning_rate": 0.00012521877614055357,
"loss": 0.8035,
"step": 21260
},
{
"epoch": 0.8743528638343332,
"grad_norm": 1.2444006204605103,
"learning_rate": 0.00012508997904773884,
"loss": 0.771,
"step": 21280
},
{
"epoch": 0.8751746240447038,
"grad_norm": 1.4090861082077026,
"learning_rate": 0.00012497402366798297,
"loss": 0.7657,
"step": 21300
},
{
"epoch": 0.8759963842550744,
"grad_norm": 1.2610723972320557,
"learning_rate": 0.00012484514234864898,
"loss": 0.7928,
"step": 21320
},
{
"epoch": 0.876818144465445,
"grad_norm": 1.2457926273345947,
"learning_rate": 0.0001247226643378368,
"loss": 0.7985,
"step": 21340
},
{
"epoch": 0.8776399046758157,
"grad_norm": 1.2153024673461914,
"learning_rate": 0.0001245936974322525,
"loss": 0.8184,
"step": 21360
},
{
"epoch": 0.8784616648861862,
"grad_norm": 1.244173526763916,
"learning_rate": 0.00012446468697312316,
"loss": 0.7855,
"step": 21380
},
{
"epoch": 0.8792834250965568,
"grad_norm": 1.2349773645401,
"learning_rate": 0.00012433563318891633,
"loss": 0.7915,
"step": 21400
},
{
"epoch": 0.8801051853069274,
"grad_norm": 1.3552359342575073,
"learning_rate": 0.0001242065363081764,
"loss": 0.7878,
"step": 21420
},
{
"epoch": 0.880926945517298,
"grad_norm": 1.071760654449463,
"learning_rate": 0.00012407739655952393,
"loss": 0.7836,
"step": 21440
},
{
"epoch": 0.8817487057276686,
"grad_norm": 1.2500207424163818,
"learning_rate": 0.0001239482141716555,
"loss": 0.7781,
"step": 21460
},
{
"epoch": 0.8825704659380392,
"grad_norm": 1.2251540422439575,
"learning_rate": 0.00012381898937334322,
"loss": 0.7826,
"step": 21480
},
{
"epoch": 0.8833922261484098,
"grad_norm": 1.0519969463348389,
"learning_rate": 0.0001236897223934342,
"loss": 0.7764,
"step": 21500
},
{
"epoch": 0.8842139863587805,
"grad_norm": 0.9632487297058105,
"learning_rate": 0.00012356041346085032,
"loss": 0.8034,
"step": 21520
},
{
"epoch": 0.8850357465691511,
"grad_norm": 1.2027411460876465,
"learning_rate": 0.00012343106280458777,
"loss": 0.8176,
"step": 21540
},
{
"epoch": 0.8858575067795217,
"grad_norm": 1.1366872787475586,
"learning_rate": 0.00012330814124322003,
"loss": 0.7884,
"step": 21560
},
{
"epoch": 0.8866792669898923,
"grad_norm": 1.137412667274475,
"learning_rate": 0.00012317870988471372,
"loss": 0.7777,
"step": 21580
},
{
"epoch": 0.8875010272002629,
"grad_norm": 1.1433818340301514,
"learning_rate": 0.00012304923747849642,
"loss": 0.7818,
"step": 21600
},
{
"epoch": 0.8883227874106335,
"grad_norm": 1.1305466890335083,
"learning_rate": 0.00012291972425385373,
"loss": 0.7829,
"step": 21620
},
{
"epoch": 0.8891445476210041,
"grad_norm": 1.1296701431274414,
"learning_rate": 0.00012279017044014364,
"loss": 0.8018,
"step": 21640
},
{
"epoch": 0.8899663078313748,
"grad_norm": 1.1467466354370117,
"learning_rate": 0.00012266057626679594,
"loss": 0.8009,
"step": 21660
},
{
"epoch": 0.8907880680417454,
"grad_norm": 1.2313017845153809,
"learning_rate": 0.00012253094196331202,
"loss": 0.7924,
"step": 21680
},
{
"epoch": 0.891609828252116,
"grad_norm": 1.1884132623672485,
"learning_rate": 0.00012240126775926418,
"loss": 0.8,
"step": 21700
},
{
"epoch": 0.8924315884624866,
"grad_norm": 1.1451033353805542,
"learning_rate": 0.00012227155388429547,
"loss": 0.8158,
"step": 21720
},
{
"epoch": 0.8932533486728572,
"grad_norm": 1.2778671979904175,
"learning_rate": 0.00012214180056811916,
"loss": 0.773,
"step": 21740
},
{
"epoch": 0.8940751088832278,
"grad_norm": 1.0006611347198486,
"learning_rate": 0.00012201200804051842,
"loss": 0.7859,
"step": 21760
},
{
"epoch": 0.8948968690935984,
"grad_norm": 1.270612120628357,
"learning_rate": 0.0001218821765313458,
"loss": 0.7977,
"step": 21780
},
{
"epoch": 0.8957186293039691,
"grad_norm": 1.4604487419128418,
"learning_rate": 0.00012175230627052294,
"loss": 0.8046,
"step": 21800
},
{
"epoch": 0.8965403895143397,
"grad_norm": 1.153067946434021,
"learning_rate": 0.00012162239748804005,
"loss": 0.7672,
"step": 21820
},
{
"epoch": 0.8973621497247103,
"grad_norm": 1.1451218128204346,
"learning_rate": 0.00012149245041395559,
"loss": 0.7758,
"step": 21840
},
{
"epoch": 0.8981839099350809,
"grad_norm": 1.2750080823898315,
"learning_rate": 0.00012136246527839583,
"loss": 0.7816,
"step": 21860
},
{
"epoch": 0.8990056701454515,
"grad_norm": 1.0611047744750977,
"learning_rate": 0.00012123244231155442,
"loss": 0.7668,
"step": 21880
},
{
"epoch": 0.8998274303558221,
"grad_norm": 1.098158359527588,
"learning_rate": 0.00012110238174369202,
"loss": 0.8227,
"step": 21900
},
{
"epoch": 0.9006491905661927,
"grad_norm": 1.0489616394042969,
"learning_rate": 0.00012097228380513595,
"loss": 0.8008,
"step": 21920
},
{
"epoch": 0.9014709507765634,
"grad_norm": 1.1194531917572021,
"learning_rate": 0.00012084214872627955,
"loss": 0.8118,
"step": 21940
},
{
"epoch": 0.902292710986934,
"grad_norm": 1.1527105569839478,
"learning_rate": 0.00012071197673758212,
"loss": 0.769,
"step": 21960
},
{
"epoch": 0.9031144711973046,
"grad_norm": 1.1321651935577393,
"learning_rate": 0.00012058176806956818,
"loss": 0.7789,
"step": 21980
},
{
"epoch": 0.9039362314076752,
"grad_norm": 1.25917387008667,
"learning_rate": 0.00012045152295282727,
"loss": 0.7828,
"step": 22000
},
{
"epoch": 0.9039362314076752,
"eval_loss": 1.0472674369812012,
"eval_runtime": 23.5948,
"eval_samples_per_second": 111.042,
"eval_steps_per_second": 3.475,
"step": 22000
},
{
"epoch": 0.9047579916180458,
"grad_norm": 1.260465145111084,
"learning_rate": 0.00012032124161801345,
"loss": 0.761,
"step": 22020
},
{
"epoch": 0.9055797518284164,
"grad_norm": 1.0791617631912231,
"learning_rate": 0.00012019092429584495,
"loss": 0.7737,
"step": 22040
},
{
"epoch": 0.906401512038787,
"grad_norm": 1.277144193649292,
"learning_rate": 0.0001200605712171037,
"loss": 0.8014,
"step": 22060
},
{
"epoch": 0.9072232722491577,
"grad_norm": 1.1210474967956543,
"learning_rate": 0.00011993018261263503,
"loss": 0.8066,
"step": 22080
},
{
"epoch": 0.9080450324595283,
"grad_norm": 1.2601996660232544,
"learning_rate": 0.00011979975871334703,
"loss": 0.7787,
"step": 22100
},
{
"epoch": 0.9088667926698989,
"grad_norm": 1.1547856330871582,
"learning_rate": 0.00011966929975021046,
"loss": 0.7936,
"step": 22120
},
{
"epoch": 0.9096885528802695,
"grad_norm": 1.1628038883209229,
"learning_rate": 0.00011953880595425808,
"loss": 0.7803,
"step": 22140
},
{
"epoch": 0.9105103130906401,
"grad_norm": 1.1185588836669922,
"learning_rate": 0.00011940827755658433,
"loss": 0.8004,
"step": 22160
},
{
"epoch": 0.9113320733010107,
"grad_norm": 1.0631072521209717,
"learning_rate": 0.00011927771478834496,
"loss": 0.7898,
"step": 22180
},
{
"epoch": 0.9121538335113814,
"grad_norm": 1.2936205863952637,
"learning_rate": 0.00011914711788075663,
"loss": 0.7809,
"step": 22200
},
{
"epoch": 0.912975593721752,
"grad_norm": 1.084246039390564,
"learning_rate": 0.00011901648706509636,
"loss": 0.795,
"step": 22220
},
{
"epoch": 0.9137973539321226,
"grad_norm": 1.3070451021194458,
"learning_rate": 0.00011888582257270132,
"loss": 0.793,
"step": 22240
},
{
"epoch": 0.9146191141424932,
"grad_norm": 1.2026519775390625,
"learning_rate": 0.00011875512463496822,
"loss": 0.8013,
"step": 22260
},
{
"epoch": 0.9154408743528638,
"grad_norm": 1.2209608554840088,
"learning_rate": 0.00011862439348335306,
"loss": 0.7708,
"step": 22280
},
{
"epoch": 0.9162626345632344,
"grad_norm": 1.1526132822036743,
"learning_rate": 0.00011849362934937065,
"loss": 0.7822,
"step": 22300
},
{
"epoch": 0.917084394773605,
"grad_norm": 1.0761892795562744,
"learning_rate": 0.00011836283246459421,
"loss": 0.8061,
"step": 22320
},
{
"epoch": 0.9179061549839757,
"grad_norm": 1.2090603113174438,
"learning_rate": 0.00011823200306065494,
"loss": 0.7692,
"step": 22340
},
{
"epoch": 0.9187279151943463,
"grad_norm": 1.1188862323760986,
"learning_rate": 0.00011810114136924166,
"loss": 0.8228,
"step": 22360
},
{
"epoch": 0.9195496754047169,
"grad_norm": 1.138457179069519,
"learning_rate": 0.00011797024762210034,
"loss": 0.7883,
"step": 22380
},
{
"epoch": 0.9203714356150875,
"grad_norm": 1.0787718296051025,
"learning_rate": 0.00011783932205103376,
"loss": 0.7743,
"step": 22400
},
{
"epoch": 0.9211931958254581,
"grad_norm": 1.0644888877868652,
"learning_rate": 0.00011770836488790103,
"loss": 0.8028,
"step": 22420
},
{
"epoch": 0.9220149560358287,
"grad_norm": 1.1350173950195312,
"learning_rate": 0.00011757737636461711,
"loss": 0.769,
"step": 22440
},
{
"epoch": 0.9228367162461993,
"grad_norm": 1.2350654602050781,
"learning_rate": 0.00011744635671315274,
"loss": 0.7614,
"step": 22460
},
{
"epoch": 0.92365847645657,
"grad_norm": 1.1380183696746826,
"learning_rate": 0.0001173153061655335,
"loss": 0.7789,
"step": 22480
},
{
"epoch": 0.9244802366669406,
"grad_norm": 1.2965232133865356,
"learning_rate": 0.00011718422495383992,
"loss": 0.8028,
"step": 22500
},
{
"epoch": 0.9253019968773112,
"grad_norm": 1.2787531614303589,
"learning_rate": 0.00011705311331020667,
"loss": 0.7729,
"step": 22520
},
{
"epoch": 0.9261237570876818,
"grad_norm": 1.1900768280029297,
"learning_rate": 0.00011692197146682242,
"loss": 0.7856,
"step": 22540
},
{
"epoch": 0.9269455172980524,
"grad_norm": 1.2483367919921875,
"learning_rate": 0.0001167907996559293,
"loss": 0.7906,
"step": 22560
},
{
"epoch": 0.927767277508423,
"grad_norm": 1.0251802206039429,
"learning_rate": 0.00011665959810982245,
"loss": 0.7756,
"step": 22580
},
{
"epoch": 0.9285890377187936,
"grad_norm": 1.267059087753296,
"learning_rate": 0.00011652836706084969,
"loss": 0.8083,
"step": 22600
},
{
"epoch": 0.9294107979291643,
"grad_norm": 1.1620546579360962,
"learning_rate": 0.00011639710674141114,
"loss": 0.7737,
"step": 22620
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.09406578540802,
"learning_rate": 0.00011626581738395872,
"loss": 0.8101,
"step": 22640
},
{
"epoch": 0.9310543183499055,
"grad_norm": 1.101579189300537,
"learning_rate": 0.00011613449922099576,
"loss": 0.7853,
"step": 22660
},
{
"epoch": 0.9318760785602761,
"grad_norm": 1.2870060205459595,
"learning_rate": 0.00011600315248507666,
"loss": 0.7786,
"step": 22680
},
{
"epoch": 0.9326978387706467,
"grad_norm": 1.1342302560806274,
"learning_rate": 0.00011587177740880633,
"loss": 0.7688,
"step": 22700
},
{
"epoch": 0.9335195989810173,
"grad_norm": 1.2600188255310059,
"learning_rate": 0.00011574037422483995,
"loss": 0.7672,
"step": 22720
},
{
"epoch": 0.9343413591913879,
"grad_norm": 1.1466922760009766,
"learning_rate": 0.00011560894316588243,
"loss": 0.7855,
"step": 22740
},
{
"epoch": 0.9351631194017586,
"grad_norm": 1.2696952819824219,
"learning_rate": 0.00011547748446468802,
"loss": 0.7869,
"step": 22760
},
{
"epoch": 0.9359848796121292,
"grad_norm": 1.2031900882720947,
"learning_rate": 0.00011534599835406001,
"loss": 0.784,
"step": 22780
},
{
"epoch": 0.9368066398224998,
"grad_norm": 1.1982570886611938,
"learning_rate": 0.00011521448506685022,
"loss": 0.7838,
"step": 22800
},
{
"epoch": 0.9376284000328704,
"grad_norm": 1.0865845680236816,
"learning_rate": 0.00011508294483595845,
"loss": 0.7608,
"step": 22820
},
{
"epoch": 0.938450160243241,
"grad_norm": 1.1527810096740723,
"learning_rate": 0.00011495137789433243,
"loss": 0.7637,
"step": 22840
},
{
"epoch": 0.9392719204536116,
"grad_norm": 1.3062829971313477,
"learning_rate": 0.00011481978447496704,
"loss": 0.7919,
"step": 22860
},
{
"epoch": 0.9400936806639822,
"grad_norm": 1.1722383499145508,
"learning_rate": 0.00011468816481090406,
"loss": 0.7742,
"step": 22880
},
{
"epoch": 0.9409154408743529,
"grad_norm": 1.1022741794586182,
"learning_rate": 0.00011455651913523184,
"loss": 0.7798,
"step": 22900
},
{
"epoch": 0.9417372010847235,
"grad_norm": 1.2316502332687378,
"learning_rate": 0.0001144248476810847,
"loss": 0.7957,
"step": 22920
},
{
"epoch": 0.9425589612950941,
"grad_norm": 1.0445023775100708,
"learning_rate": 0.00011429315068164269,
"loss": 0.7784,
"step": 22940
},
{
"epoch": 0.9433807215054647,
"grad_norm": 1.1832512617111206,
"learning_rate": 0.00011416142837013101,
"loss": 0.7939,
"step": 22960
},
{
"epoch": 0.9442024817158353,
"grad_norm": 1.1852409839630127,
"learning_rate": 0.00011402968097981976,
"loss": 0.7692,
"step": 22980
},
{
"epoch": 0.9450242419262059,
"grad_norm": 1.3063126802444458,
"learning_rate": 0.00011389790874402347,
"loss": 0.7647,
"step": 23000
},
{
"epoch": 0.9458460021365765,
"grad_norm": 1.3952887058258057,
"learning_rate": 0.00011376611189610056,
"loss": 0.7942,
"step": 23020
},
{
"epoch": 0.9466677623469472,
"grad_norm": 1.1620287895202637,
"learning_rate": 0.0001136342906694531,
"loss": 0.791,
"step": 23040
},
{
"epoch": 0.9474895225573178,
"grad_norm": 1.1343626976013184,
"learning_rate": 0.0001135024452975264,
"loss": 0.7983,
"step": 23060
},
{
"epoch": 0.9483112827676884,
"grad_norm": 1.3010191917419434,
"learning_rate": 0.00011337057601380841,
"loss": 0.7934,
"step": 23080
},
{
"epoch": 0.949133042978059,
"grad_norm": 1.0730324983596802,
"learning_rate": 0.0001132386830518295,
"loss": 0.7907,
"step": 23100
},
{
"epoch": 0.9499548031884296,
"grad_norm": 1.1792380809783936,
"learning_rate": 0.00011310676664516196,
"loss": 0.7756,
"step": 23120
},
{
"epoch": 0.9507765633988002,
"grad_norm": 1.037816047668457,
"learning_rate": 0.00011297482702741958,
"loss": 0.7706,
"step": 23140
},
{
"epoch": 0.9515983236091708,
"grad_norm": 1.2920358180999756,
"learning_rate": 0.00011284286443225725,
"loss": 0.7788,
"step": 23160
},
{
"epoch": 0.9524200838195415,
"grad_norm": 1.3024156093597412,
"learning_rate": 0.00011271087909337059,
"loss": 0.7726,
"step": 23180
},
{
"epoch": 0.9532418440299121,
"grad_norm": 1.0410034656524658,
"learning_rate": 0.00011257887124449549,
"loss": 0.7358,
"step": 23200
},
{
"epoch": 0.9540636042402827,
"grad_norm": 1.1379297971725464,
"learning_rate": 0.00011244684111940765,
"loss": 0.7654,
"step": 23220
},
{
"epoch": 0.9548853644506533,
"grad_norm": 1.1472039222717285,
"learning_rate": 0.00011231478895192232,
"loss": 0.7398,
"step": 23240
},
{
"epoch": 0.9557071246610239,
"grad_norm": 1.1397725343704224,
"learning_rate": 0.00011218271497589364,
"loss": 0.802,
"step": 23260
},
{
"epoch": 0.9565288848713945,
"grad_norm": 1.2153700590133667,
"learning_rate": 0.00011205061942521453,
"loss": 0.8053,
"step": 23280
},
{
"epoch": 0.9573506450817652,
"grad_norm": 1.1267844438552856,
"learning_rate": 0.00011191850253381601,
"loss": 0.7778,
"step": 23300
},
{
"epoch": 0.9581724052921358,
"grad_norm": 1.2119829654693604,
"learning_rate": 0.00011178636453566691,
"loss": 0.8014,
"step": 23320
},
{
"epoch": 0.9589941655025064,
"grad_norm": 1.1640605926513672,
"learning_rate": 0.00011165420566477351,
"loss": 0.7643,
"step": 23340
},
{
"epoch": 0.959815925712877,
"grad_norm": 1.2424917221069336,
"learning_rate": 0.0001115220261551789,
"loss": 0.7769,
"step": 23360
},
{
"epoch": 0.9606376859232476,
"grad_norm": 1.1949933767318726,
"learning_rate": 0.0001113898262409629,
"loss": 0.7647,
"step": 23380
},
{
"epoch": 0.9614594461336182,
"grad_norm": 1.0828710794448853,
"learning_rate": 0.0001112576061562414,
"loss": 0.7729,
"step": 23400
},
{
"epoch": 0.9622812063439888,
"grad_norm": 1.1788461208343506,
"learning_rate": 0.00011112536613516589,
"loss": 0.7833,
"step": 23420
},
{
"epoch": 0.9631029665543595,
"grad_norm": 1.1178921461105347,
"learning_rate": 0.00011099310641192335,
"loss": 0.7873,
"step": 23440
},
{
"epoch": 0.9639247267647301,
"grad_norm": 1.131232738494873,
"learning_rate": 0.00011086082722073556,
"loss": 0.7622,
"step": 23460
},
{
"epoch": 0.9647464869751007,
"grad_norm": 1.1513261795043945,
"learning_rate": 0.00011072852879585876,
"loss": 0.7781,
"step": 23480
},
{
"epoch": 0.9655682471854713,
"grad_norm": 1.185996413230896,
"learning_rate": 0.00011059621137158332,
"loss": 0.7987,
"step": 23500
},
{
"epoch": 0.9663900073958419,
"grad_norm": 1.2174021005630493,
"learning_rate": 0.00011046387518223314,
"loss": 0.7732,
"step": 23520
},
{
"epoch": 0.9672117676062125,
"grad_norm": 1.2955466508865356,
"learning_rate": 0.00011033152046216546,
"loss": 0.772,
"step": 23540
},
{
"epoch": 0.9680335278165831,
"grad_norm": 1.1724337339401245,
"learning_rate": 0.00011019914744577034,
"loss": 0.7811,
"step": 23560
},
{
"epoch": 0.9688552880269538,
"grad_norm": 1.1841801404953003,
"learning_rate": 0.00011006675636747017,
"loss": 0.7893,
"step": 23580
},
{
"epoch": 0.9696770482373244,
"grad_norm": 1.1488229036331177,
"learning_rate": 0.00010993434746171933,
"loss": 0.759,
"step": 23600
},
{
"epoch": 0.970498808447695,
"grad_norm": 1.2947933673858643,
"learning_rate": 0.00010980192096300389,
"loss": 0.7821,
"step": 23620
},
{
"epoch": 0.9713205686580656,
"grad_norm": 1.1167196035385132,
"learning_rate": 0.00010966947710584086,
"loss": 0.7765,
"step": 23640
},
{
"epoch": 0.9721423288684362,
"grad_norm": 1.114274024963379,
"learning_rate": 0.00010953701612477821,
"loss": 0.7563,
"step": 23660
},
{
"epoch": 0.9729640890788068,
"grad_norm": 1.1539628505706787,
"learning_rate": 0.00010940453825439411,
"loss": 0.7754,
"step": 23680
},
{
"epoch": 0.9737858492891774,
"grad_norm": 1.1876133680343628,
"learning_rate": 0.00010927204372929667,
"loss": 0.7681,
"step": 23700
},
{
"epoch": 0.974607609499548,
"grad_norm": 1.0904812812805176,
"learning_rate": 0.00010913953278412353,
"loss": 0.7726,
"step": 23720
},
{
"epoch": 0.9754293697099187,
"grad_norm": 1.1562939882278442,
"learning_rate": 0.00010900700565354131,
"loss": 0.7774,
"step": 23740
},
{
"epoch": 0.9762511299202893,
"grad_norm": 1.227735161781311,
"learning_rate": 0.0001088744625722454,
"loss": 0.7907,
"step": 23760
},
{
"epoch": 0.9770728901306599,
"grad_norm": 1.1245887279510498,
"learning_rate": 0.00010874190377495938,
"loss": 0.7882,
"step": 23780
},
{
"epoch": 0.9778946503410305,
"grad_norm": 1.1092678308486938,
"learning_rate": 0.0001086093294964347,
"loss": 0.759,
"step": 23800
},
{
"epoch": 0.9787164105514011,
"grad_norm": 1.1554052829742432,
"learning_rate": 0.00010847673997145016,
"loss": 0.7665,
"step": 23820
},
{
"epoch": 0.9795381707617717,
"grad_norm": 1.1881402730941772,
"learning_rate": 0.00010834413543481163,
"loss": 0.7899,
"step": 23840
},
{
"epoch": 0.9803599309721424,
"grad_norm": 1.209314227104187,
"learning_rate": 0.00010821151612135155,
"loss": 0.7478,
"step": 23860
},
{
"epoch": 0.981181691182513,
"grad_norm": 1.184097409248352,
"learning_rate": 0.00010807888226592848,
"loss": 0.7626,
"step": 23880
},
{
"epoch": 0.9820034513928836,
"grad_norm": 1.109991431236267,
"learning_rate": 0.00010794623410342682,
"loss": 0.7748,
"step": 23900
},
{
"epoch": 0.9828252116032542,
"grad_norm": 1.1060677766799927,
"learning_rate": 0.00010781357186875619,
"loss": 0.7873,
"step": 23920
},
{
"epoch": 0.9836469718136248,
"grad_norm": 1.1834135055541992,
"learning_rate": 0.00010768089579685126,
"loss": 0.7814,
"step": 23940
},
{
"epoch": 0.9844687320239954,
"grad_norm": 1.1708906888961792,
"learning_rate": 0.00010754820612267106,
"loss": 0.753,
"step": 23960
},
{
"epoch": 0.985290492234366,
"grad_norm": 1.2200385332107544,
"learning_rate": 0.00010741550308119885,
"loss": 0.7651,
"step": 23980
},
{
"epoch": 0.9861122524447367,
"grad_norm": 1.0666234493255615,
"learning_rate": 0.00010728278690744153,
"loss": 0.7795,
"step": 24000
},
{
"epoch": 0.9861122524447367,
"eval_loss": 1.02431058883667,
"eval_runtime": 16.5783,
"eval_samples_per_second": 158.038,
"eval_steps_per_second": 4.946,
"step": 24000
},
{
"epoch": 0.9869340126551073,
"grad_norm": 1.1617413759231567,
"learning_rate": 0.00010715005783642917,
"loss": 0.7878,
"step": 24020
},
{
"epoch": 0.9877557728654779,
"grad_norm": 1.11324942111969,
"learning_rate": 0.00010701731610321475,
"loss": 0.7969,
"step": 24040
},
{
"epoch": 0.9885775330758485,
"grad_norm": 1.1129649877548218,
"learning_rate": 0.00010688456194287368,
"loss": 0.7595,
"step": 24060
},
{
"epoch": 0.9893992932862191,
"grad_norm": 1.186141848564148,
"learning_rate": 0.00010675179559050332,
"loss": 0.8078,
"step": 24080
},
{
"epoch": 0.9902210534965897,
"grad_norm": 1.205206036567688,
"learning_rate": 0.00010661901728122272,
"loss": 0.7449,
"step": 24100
},
{
"epoch": 0.9910428137069603,
"grad_norm": 1.2314426898956299,
"learning_rate": 0.00010648622725017199,
"loss": 0.768,
"step": 24120
},
{
"epoch": 0.991864573917331,
"grad_norm": 1.1358891725540161,
"learning_rate": 0.00010635342573251209,
"loss": 0.7753,
"step": 24140
},
{
"epoch": 0.9926863341277016,
"grad_norm": 1.3078477382659912,
"learning_rate": 0.00010622061296342425,
"loss": 0.7845,
"step": 24160
},
{
"epoch": 0.9935080943380722,
"grad_norm": 1.105637550354004,
"learning_rate": 0.0001060877891781097,
"loss": 0.7741,
"step": 24180
},
{
"epoch": 0.9943298545484428,
"grad_norm": 1.1030614376068115,
"learning_rate": 0.00010595495461178912,
"loss": 0.7611,
"step": 24200
},
{
"epoch": 0.9951516147588134,
"grad_norm": 1.0990793704986572,
"learning_rate": 0.00010582210949970233,
"loss": 0.7711,
"step": 24220
},
{
"epoch": 0.995973374969184,
"grad_norm": 1.1833900213241577,
"learning_rate": 0.00010568925407710773,
"loss": 0.7684,
"step": 24240
},
{
"epoch": 0.9967951351795546,
"grad_norm": 1.0902843475341797,
"learning_rate": 0.00010555638857928209,
"loss": 0.7709,
"step": 24260
},
{
"epoch": 0.9976168953899253,
"grad_norm": 1.1372528076171875,
"learning_rate": 0.00010542351324152,
"loss": 0.7764,
"step": 24280
},
{
"epoch": 0.9984386556002959,
"grad_norm": 1.0270088911056519,
"learning_rate": 0.00010529062829913343,
"loss": 0.7638,
"step": 24300
},
{
"epoch": 0.9992604158106665,
"grad_norm": 1.0260668992996216,
"learning_rate": 0.0001051577339874514,
"loss": 0.781,
"step": 24320
},
{
"epoch": 1.000082176021037,
"grad_norm": 1.0618846416473389,
"learning_rate": 0.00010502483054181948,
"loss": 0.7616,
"step": 24340
},
{
"epoch": 1.0009039362314076,
"grad_norm": 1.0496258735656738,
"learning_rate": 0.00010489191819759945,
"loss": 0.6874,
"step": 24360
},
{
"epoch": 1.0017256964417782,
"grad_norm": 1.304571270942688,
"learning_rate": 0.00010475899719016886,
"loss": 0.6708,
"step": 24380
},
{
"epoch": 1.0025474566521488,
"grad_norm": 1.4253283739089966,
"learning_rate": 0.00010462606775492054,
"loss": 0.6789,
"step": 24400
},
{
"epoch": 1.0033692168625195,
"grad_norm": 1.1459177732467651,
"learning_rate": 0.00010449313012726234,
"loss": 0.6678,
"step": 24420
},
{
"epoch": 1.00419097707289,
"grad_norm": 1.2071423530578613,
"learning_rate": 0.00010436018454261654,
"loss": 0.6639,
"step": 24440
},
{
"epoch": 1.0050127372832607,
"grad_norm": 1.1461540460586548,
"learning_rate": 0.00010422723123641956,
"loss": 0.6722,
"step": 24460
},
{
"epoch": 1.0058344974936313,
"grad_norm": 1.2865045070648193,
"learning_rate": 0.00010409427044412141,
"loss": 0.6756,
"step": 24480
},
{
"epoch": 1.006656257704002,
"grad_norm": 1.0493274927139282,
"learning_rate": 0.00010396130240118549,
"loss": 0.677,
"step": 24500
},
{
"epoch": 1.0074780179143725,
"grad_norm": 1.171351671218872,
"learning_rate": 0.00010382832734308792,
"loss": 0.6736,
"step": 24520
},
{
"epoch": 1.0082997781247431,
"grad_norm": 1.1830470561981201,
"learning_rate": 0.00010369534550531734,
"loss": 0.6575,
"step": 24540
},
{
"epoch": 1.0091215383351138,
"grad_norm": 1.1490957736968994,
"learning_rate": 0.00010357565624934433,
"loss": 0.6628,
"step": 24560
},
{
"epoch": 1.0099432985454844,
"grad_norm": 1.0752952098846436,
"learning_rate": 0.00010344266217900869,
"loss": 0.6489,
"step": 24580
},
{
"epoch": 1.010765058755855,
"grad_norm": 1.1401771306991577,
"learning_rate": 0.00010330966201198336,
"loss": 0.6436,
"step": 24600
},
{
"epoch": 1.0115868189662256,
"grad_norm": 1.0942751169204712,
"learning_rate": 0.00010317665598380131,
"loss": 0.6908,
"step": 24620
},
{
"epoch": 1.0124085791765962,
"grad_norm": 1.23777437210083,
"learning_rate": 0.00010304364433000604,
"loss": 0.676,
"step": 24640
},
{
"epoch": 1.0132303393869668,
"grad_norm": 1.3880153894424438,
"learning_rate": 0.00010291062728615099,
"loss": 0.6838,
"step": 24660
},
{
"epoch": 1.0140520995973374,
"grad_norm": 1.090610384941101,
"learning_rate": 0.00010277760508779903,
"loss": 0.6505,
"step": 24680
},
{
"epoch": 1.014873859807708,
"grad_norm": 1.1579679250717163,
"learning_rate": 0.00010264457797052227,
"loss": 0.679,
"step": 24700
},
{
"epoch": 1.0156956200180787,
"grad_norm": 1.1469173431396484,
"learning_rate": 0.00010251154616990151,
"loss": 0.7002,
"step": 24720
},
{
"epoch": 1.0165173802284493,
"grad_norm": 1.1271486282348633,
"learning_rate": 0.00010237850992152578,
"loss": 0.6774,
"step": 24740
},
{
"epoch": 1.01733914043882,
"grad_norm": 1.2221741676330566,
"learning_rate": 0.00010224546946099209,
"loss": 0.6912,
"step": 24760
},
{
"epoch": 1.0181609006491905,
"grad_norm": 1.0773913860321045,
"learning_rate": 0.00010211242502390481,
"loss": 0.6631,
"step": 24780
},
{
"epoch": 1.0189826608595611,
"grad_norm": 1.3026174306869507,
"learning_rate": 0.0001019793768458754,
"loss": 0.6731,
"step": 24800
},
{
"epoch": 1.0198044210699317,
"grad_norm": 1.1742732524871826,
"learning_rate": 0.00010184632516252199,
"loss": 0.6562,
"step": 24820
},
{
"epoch": 1.0206261812803024,
"grad_norm": 1.1922106742858887,
"learning_rate": 0.00010171992303113935,
"loss": 0.6519,
"step": 24840
},
{
"epoch": 1.021447941490673,
"grad_norm": 1.0810050964355469,
"learning_rate": 0.0001015868651901237,
"loss": 0.6625,
"step": 24860
},
{
"epoch": 1.0222697017010436,
"grad_norm": 1.4511394500732422,
"learning_rate": 0.00010145380453889195,
"loss": 0.6955,
"step": 24880
},
{
"epoch": 1.0230914619114142,
"grad_norm": 1.1028388738632202,
"learning_rate": 0.0001013207413130843,
"loss": 0.6576,
"step": 24900
},
{
"epoch": 1.0239132221217848,
"grad_norm": 1.169084072113037,
"learning_rate": 0.0001011876757483454,
"loss": 0.6678,
"step": 24920
},
{
"epoch": 1.0247349823321554,
"grad_norm": 1.1536842584609985,
"learning_rate": 0.00010105460808032418,
"loss": 0.6477,
"step": 24940
},
{
"epoch": 1.025556742542526,
"grad_norm": 1.123879313468933,
"learning_rate": 0.00010092153854467326,
"loss": 0.679,
"step": 24960
},
{
"epoch": 1.0263785027528967,
"grad_norm": 1.019872784614563,
"learning_rate": 0.00010078846737704848,
"loss": 0.6854,
"step": 24980
},
{
"epoch": 1.0272002629632673,
"grad_norm": 1.3068866729736328,
"learning_rate": 0.00010065539481310871,
"loss": 0.6717,
"step": 25000
},
{
"epoch": 1.0280220231736379,
"grad_norm": 1.2124892473220825,
"learning_rate": 0.00010052232108851513,
"loss": 0.649,
"step": 25020
},
{
"epoch": 1.0288437833840085,
"grad_norm": 1.2036775350570679,
"learning_rate": 0.00010038924643893113,
"loss": 0.666,
"step": 25040
},
{
"epoch": 1.0296655435943791,
"grad_norm": 1.244070053100586,
"learning_rate": 0.00010025617110002165,
"loss": 0.6654,
"step": 25060
},
{
"epoch": 1.0304873038047497,
"grad_norm": 1.1506962776184082,
"learning_rate": 0.00010012309530745285,
"loss": 0.6764,
"step": 25080
},
{
"epoch": 1.0313090640151203,
"grad_norm": 1.1000773906707764,
"learning_rate": 9.999001929689177e-05,
"loss": 0.6574,
"step": 25100
},
{
"epoch": 1.032130824225491,
"grad_norm": 1.0797061920166016,
"learning_rate": 9.985694330400571e-05,
"loss": 0.6618,
"step": 25120
},
{
"epoch": 1.0329525844358616,
"grad_norm": 1.1669949293136597,
"learning_rate": 9.972386756446208e-05,
"loss": 0.6731,
"step": 25140
},
{
"epoch": 1.0337743446462322,
"grad_norm": 1.1310721635818481,
"learning_rate": 9.959079231392771e-05,
"loss": 0.6773,
"step": 25160
},
{
"epoch": 1.0345961048566028,
"grad_norm": 1.1668230295181274,
"learning_rate": 9.945771778806865e-05,
"loss": 0.6772,
"step": 25180
},
{
"epoch": 1.0354178650669734,
"grad_norm": 1.1975557804107666,
"learning_rate": 9.93246442225497e-05,
"loss": 0.6801,
"step": 25200
},
{
"epoch": 1.036239625277344,
"grad_norm": 1.1820405721664429,
"learning_rate": 9.919157185303379e-05,
"loss": 0.6581,
"step": 25220
},
{
"epoch": 1.0370613854877146,
"grad_norm": 1.1540806293487549,
"learning_rate": 9.90585009151819e-05,
"loss": 0.6732,
"step": 25240
},
{
"epoch": 1.0378831456980853,
"grad_norm": 1.064178228378296,
"learning_rate": 9.892543164465243e-05,
"loss": 0.6732,
"step": 25260
},
{
"epoch": 1.0387049059084559,
"grad_norm": 1.107135534286499,
"learning_rate": 9.879236427710082e-05,
"loss": 0.6708,
"step": 25280
},
{
"epoch": 1.0395266661188265,
"grad_norm": 1.172105312347412,
"learning_rate": 9.865929904817909e-05,
"loss": 0.6818,
"step": 25300
},
{
"epoch": 1.040348426329197,
"grad_norm": 1.1912182569503784,
"learning_rate": 9.85262361935356e-05,
"loss": 0.7184,
"step": 25320
},
{
"epoch": 1.0411701865395677,
"grad_norm": 1.1023154258728027,
"learning_rate": 9.83931759488143e-05,
"loss": 0.6638,
"step": 25340
},
{
"epoch": 1.0419919467499383,
"grad_norm": 1.076657772064209,
"learning_rate": 9.826011854965474e-05,
"loss": 0.6727,
"step": 25360
},
{
"epoch": 1.042813706960309,
"grad_norm": 1.2090641260147095,
"learning_rate": 9.812706423169129e-05,
"loss": 0.6725,
"step": 25380
},
{
"epoch": 1.0436354671706796,
"grad_norm": 1.1991325616836548,
"learning_rate": 9.799401323055292e-05,
"loss": 0.6793,
"step": 25400
},
{
"epoch": 1.0444572273810502,
"grad_norm": 1.0969122648239136,
"learning_rate": 9.786096578186273e-05,
"loss": 0.6803,
"step": 25420
},
{
"epoch": 1.0452789875914208,
"grad_norm": 1.2352911233901978,
"learning_rate": 9.772792212123748e-05,
"loss": 0.6789,
"step": 25440
},
{
"epoch": 1.0461007478017914,
"grad_norm": 1.2759590148925781,
"learning_rate": 9.759488248428725e-05,
"loss": 0.6728,
"step": 25460
},
{
"epoch": 1.046922508012162,
"grad_norm": 1.3433208465576172,
"learning_rate": 9.7461847106615e-05,
"loss": 0.6835,
"step": 25480
},
{
"epoch": 1.0477442682225326,
"grad_norm": 1.2637454271316528,
"learning_rate": 9.732881622381616e-05,
"loss": 0.6738,
"step": 25500
},
{
"epoch": 1.0485660284329033,
"grad_norm": 1.1877262592315674,
"learning_rate": 9.719579007147815e-05,
"loss": 0.6633,
"step": 25520
},
{
"epoch": 1.0493877886432739,
"grad_norm": 1.1830005645751953,
"learning_rate": 9.706276888518013e-05,
"loss": 0.6559,
"step": 25540
},
{
"epoch": 1.0502095488536445,
"grad_norm": 1.1390386819839478,
"learning_rate": 9.692975290049228e-05,
"loss": 0.6782,
"step": 25560
},
{
"epoch": 1.051031309064015,
"grad_norm": 1.1353988647460938,
"learning_rate": 9.679674235297572e-05,
"loss": 0.6781,
"step": 25580
},
{
"epoch": 1.0518530692743857,
"grad_norm": 1.1977840662002563,
"learning_rate": 9.666373747818187e-05,
"loss": 0.6697,
"step": 25600
},
{
"epoch": 1.0526748294847563,
"grad_norm": 1.0405921936035156,
"learning_rate": 9.653073851165214e-05,
"loss": 0.6715,
"step": 25620
},
{
"epoch": 1.053496589695127,
"grad_norm": 1.1129424571990967,
"learning_rate": 9.63977456889175e-05,
"loss": 0.6806,
"step": 25640
},
{
"epoch": 1.0543183499054976,
"grad_norm": 1.163406491279602,
"learning_rate": 9.626475924549792e-05,
"loss": 0.6457,
"step": 25660
},
{
"epoch": 1.0551401101158682,
"grad_norm": 1.1244803667068481,
"learning_rate": 9.613177941690219e-05,
"loss": 0.6664,
"step": 25680
},
{
"epoch": 1.0559618703262388,
"grad_norm": 1.1291669607162476,
"learning_rate": 9.599880643862737e-05,
"loss": 0.6642,
"step": 25700
},
{
"epoch": 1.0567836305366094,
"grad_norm": 1.3684684038162231,
"learning_rate": 9.586584054615836e-05,
"loss": 0.6833,
"step": 25720
},
{
"epoch": 1.05760539074698,
"grad_norm": 1.0214548110961914,
"learning_rate": 9.57328819749675e-05,
"loss": 0.6693,
"step": 25740
},
{
"epoch": 1.0584271509573506,
"grad_norm": 1.0439046621322632,
"learning_rate": 9.559993096051425e-05,
"loss": 0.6563,
"step": 25760
},
{
"epoch": 1.0592489111677212,
"grad_norm": 1.1112405061721802,
"learning_rate": 9.546698773824453e-05,
"loss": 0.6683,
"step": 25780
},
{
"epoch": 1.0600706713780919,
"grad_norm": 1.2522891759872437,
"learning_rate": 9.53340525435906e-05,
"loss": 0.656,
"step": 25800
},
{
"epoch": 1.0608924315884625,
"grad_norm": 1.133664846420288,
"learning_rate": 9.520112561197045e-05,
"loss": 0.6891,
"step": 25820
},
{
"epoch": 1.061714191798833,
"grad_norm": 1.3006486892700195,
"learning_rate": 9.506820717878745e-05,
"loss": 0.6536,
"step": 25840
},
{
"epoch": 1.0625359520092037,
"grad_norm": 1.2655730247497559,
"learning_rate": 9.493529747942996e-05,
"loss": 0.687,
"step": 25860
},
{
"epoch": 1.0633577122195743,
"grad_norm": 1.1235599517822266,
"learning_rate": 9.480239674927074e-05,
"loss": 0.665,
"step": 25880
},
{
"epoch": 1.064179472429945,
"grad_norm": 1.250252604484558,
"learning_rate": 9.46695052236668e-05,
"loss": 0.6604,
"step": 25900
},
{
"epoch": 1.0650012326403155,
"grad_norm": 1.0628433227539062,
"learning_rate": 9.453662313795879e-05,
"loss": 0.6565,
"step": 25920
},
{
"epoch": 1.0658229928506862,
"grad_norm": 1.1810961961746216,
"learning_rate": 9.440375072747064e-05,
"loss": 0.6722,
"step": 25940
},
{
"epoch": 1.0666447530610568,
"grad_norm": 1.2217936515808105,
"learning_rate": 9.427753111349993e-05,
"loss": 0.6913,
"step": 25960
},
{
"epoch": 1.0674665132714274,
"grad_norm": 1.0944212675094604,
"learning_rate": 9.414467824647579e-05,
"loss": 0.6668,
"step": 25980
},
{
"epoch": 1.068288273481798,
"grad_norm": 1.083652377128601,
"learning_rate": 9.401183574877579e-05,
"loss": 0.6713,
"step": 26000
},
{
"epoch": 1.068288273481798,
"eval_loss": 1.0167440176010132,
"eval_runtime": 16.6957,
"eval_samples_per_second": 156.927,
"eval_steps_per_second": 4.911,
"step": 26000
},
{
"epoch": 1.0691100336921686,
"grad_norm": 1.117463231086731,
"learning_rate": 9.387900385565371e-05,
"loss": 0.6435,
"step": 26020
},
{
"epoch": 1.0699317939025392,
"grad_norm": 1.027099847793579,
"learning_rate": 9.374618280234465e-05,
"loss": 0.6909,
"step": 26040
},
{
"epoch": 1.0707535541129098,
"grad_norm": 1.3012546300888062,
"learning_rate": 9.36133728240645e-05,
"loss": 0.6741,
"step": 26060
},
{
"epoch": 1.0715753143232805,
"grad_norm": 1.179627776145935,
"learning_rate": 9.348057415600942e-05,
"loss": 0.687,
"step": 26080
},
{
"epoch": 1.072397074533651,
"grad_norm": 1.1117219924926758,
"learning_rate": 9.334778703335568e-05,
"loss": 0.6616,
"step": 26100
},
{
"epoch": 1.0732188347440217,
"grad_norm": 1.1664800643920898,
"learning_rate": 9.321501169125905e-05,
"loss": 0.6648,
"step": 26120
},
{
"epoch": 1.0740405949543923,
"grad_norm": 1.1486011743545532,
"learning_rate": 9.308224836485447e-05,
"loss": 0.684,
"step": 26140
},
{
"epoch": 1.074862355164763,
"grad_norm": 1.1502150297164917,
"learning_rate": 9.294949728925554e-05,
"loss": 0.6682,
"step": 26160
},
{
"epoch": 1.0756841153751335,
"grad_norm": 1.0855724811553955,
"learning_rate": 9.281675869955419e-05,
"loss": 0.671,
"step": 26180
},
{
"epoch": 1.0765058755855041,
"grad_norm": 1.267439365386963,
"learning_rate": 9.268403283082024e-05,
"loss": 0.6935,
"step": 26200
},
{
"epoch": 1.0773276357958748,
"grad_norm": 1.19661545753479,
"learning_rate": 9.255131991810099e-05,
"loss": 0.6611,
"step": 26220
},
{
"epoch": 1.0781493960062454,
"grad_norm": 1.1992172002792358,
"learning_rate": 9.241862019642083e-05,
"loss": 0.6527,
"step": 26240
},
{
"epoch": 1.078971156216616,
"grad_norm": 1.117606520652771,
"learning_rate": 9.228593390078073e-05,
"loss": 0.6792,
"step": 26260
},
{
"epoch": 1.0797929164269866,
"grad_norm": 1.1372742652893066,
"learning_rate": 9.21532612661579e-05,
"loss": 0.6742,
"step": 26280
},
{
"epoch": 1.0806146766373572,
"grad_norm": 1.1025826930999756,
"learning_rate": 9.202060252750539e-05,
"loss": 0.6559,
"step": 26300
},
{
"epoch": 1.0814364368477278,
"grad_norm": 1.222730278968811,
"learning_rate": 9.188795791975155e-05,
"loss": 0.6711,
"step": 26320
},
{
"epoch": 1.0822581970580984,
"grad_norm": 1.1559460163116455,
"learning_rate": 9.17553276777998e-05,
"loss": 0.6432,
"step": 26340
},
{
"epoch": 1.083079957268469,
"grad_norm": 1.0200605392456055,
"learning_rate": 9.162271203652811e-05,
"loss": 0.6606,
"step": 26360
},
{
"epoch": 1.0839017174788397,
"grad_norm": 1.1216548681259155,
"learning_rate": 9.149011123078861e-05,
"loss": 0.6608,
"step": 26380
},
{
"epoch": 1.0847234776892103,
"grad_norm": 1.0914236307144165,
"learning_rate": 9.135752549540704e-05,
"loss": 0.6644,
"step": 26400
},
{
"epoch": 1.085545237899581,
"grad_norm": 1.2880839109420776,
"learning_rate": 9.122495506518254e-05,
"loss": 0.6699,
"step": 26420
},
{
"epoch": 1.0863669981099515,
"grad_norm": 1.3740962743759155,
"learning_rate": 9.10924001748872e-05,
"loss": 0.661,
"step": 26440
},
{
"epoch": 1.0871887583203221,
"grad_norm": 1.2001346349716187,
"learning_rate": 9.095986105926547e-05,
"loss": 0.6806,
"step": 26460
},
{
"epoch": 1.0880105185306927,
"grad_norm": 1.1882902383804321,
"learning_rate": 9.082733795303393e-05,
"loss": 0.6581,
"step": 26480
},
{
"epoch": 1.0888322787410634,
"grad_norm": 1.1991751194000244,
"learning_rate": 9.069483109088086e-05,
"loss": 0.6684,
"step": 26500
},
{
"epoch": 1.089654038951434,
"grad_norm": 1.2505358457565308,
"learning_rate": 9.056234070746562e-05,
"loss": 0.6719,
"step": 26520
},
{
"epoch": 1.0904757991618046,
"grad_norm": 1.1861186027526855,
"learning_rate": 9.042986703741853e-05,
"loss": 0.6723,
"step": 26540
},
{
"epoch": 1.0912975593721752,
"grad_norm": 1.2916886806488037,
"learning_rate": 9.029741031534025e-05,
"loss": 0.6649,
"step": 26560
},
{
"epoch": 1.0921193195825458,
"grad_norm": 1.1472935676574707,
"learning_rate": 9.016497077580147e-05,
"loss": 0.6717,
"step": 26580
},
{
"epoch": 1.0929410797929164,
"grad_norm": 1.1004657745361328,
"learning_rate": 9.00325486533424e-05,
"loss": 0.659,
"step": 26600
},
{
"epoch": 1.093762840003287,
"grad_norm": 1.374177098274231,
"learning_rate": 8.990014418247242e-05,
"loss": 0.663,
"step": 26620
},
{
"epoch": 1.0945846002136577,
"grad_norm": 1.1015247106552124,
"learning_rate": 8.976775759766963e-05,
"loss": 0.6565,
"step": 26640
},
{
"epoch": 1.0954063604240283,
"grad_norm": 1.4717910289764404,
"learning_rate": 8.963538913338051e-05,
"loss": 0.6628,
"step": 26660
},
{
"epoch": 1.096228120634399,
"grad_norm": 1.180732011795044,
"learning_rate": 8.950303902401942e-05,
"loss": 0.6674,
"step": 26680
},
{
"epoch": 1.0970498808447695,
"grad_norm": 1.168150782585144,
"learning_rate": 8.937070750396826e-05,
"loss": 0.686,
"step": 26700
},
{
"epoch": 1.0978716410551401,
"grad_norm": 1.3096245527267456,
"learning_rate": 8.923839480757589e-05,
"loss": 0.6577,
"step": 26720
},
{
"epoch": 1.0986934012655107,
"grad_norm": 1.1966016292572021,
"learning_rate": 8.910610116915797e-05,
"loss": 0.6614,
"step": 26740
},
{
"epoch": 1.0995151614758814,
"grad_norm": 1.2684301137924194,
"learning_rate": 8.897382682299628e-05,
"loss": 0.6696,
"step": 26760
},
{
"epoch": 1.100336921686252,
"grad_norm": 1.1399879455566406,
"learning_rate": 8.884157200333856e-05,
"loss": 0.6443,
"step": 26780
},
{
"epoch": 1.1011586818966226,
"grad_norm": 1.2215358018875122,
"learning_rate": 8.870933694439789e-05,
"loss": 0.6749,
"step": 26800
},
{
"epoch": 1.1019804421069932,
"grad_norm": 1.1253427267074585,
"learning_rate": 8.857712188035245e-05,
"loss": 0.6739,
"step": 26820
},
{
"epoch": 1.1028022023173638,
"grad_norm": 1.0373494625091553,
"learning_rate": 8.844492704534485e-05,
"loss": 0.6877,
"step": 26840
},
{
"epoch": 1.1036239625277344,
"grad_norm": 1.2346014976501465,
"learning_rate": 8.831275267348199e-05,
"loss": 0.6754,
"step": 26860
},
{
"epoch": 1.104445722738105,
"grad_norm": 0.9985619187355042,
"learning_rate": 8.818059899883456e-05,
"loss": 0.6806,
"step": 26880
},
{
"epoch": 1.1052674829484757,
"grad_norm": 1.1723840236663818,
"learning_rate": 8.804846625543646e-05,
"loss": 0.6757,
"step": 26900
},
{
"epoch": 1.1060892431588463,
"grad_norm": 1.1545828580856323,
"learning_rate": 8.791635467728471e-05,
"loss": 0.6533,
"step": 26920
},
{
"epoch": 1.1069110033692169,
"grad_norm": 1.0837960243225098,
"learning_rate": 8.778426449833865e-05,
"loss": 0.6614,
"step": 26940
},
{
"epoch": 1.1077327635795875,
"grad_norm": 1.1886368989944458,
"learning_rate": 8.765219595251986e-05,
"loss": 0.6571,
"step": 26960
},
{
"epoch": 1.1085545237899581,
"grad_norm": 1.2560697793960571,
"learning_rate": 8.753335295091204e-05,
"loss": 0.6819,
"step": 26980
},
{
"epoch": 1.1093762840003287,
"grad_norm": 1.0834345817565918,
"learning_rate": 8.74013261523514e-05,
"loss": 0.6838,
"step": 27000
},
{
"epoch": 1.1101980442106993,
"grad_norm": 1.2474608421325684,
"learning_rate": 8.72693216650723e-05,
"loss": 0.6479,
"step": 27020
},
{
"epoch": 1.11101980442107,
"grad_norm": 1.2082535028457642,
"learning_rate": 8.713733972284461e-05,
"loss": 0.6479,
"step": 27040
},
{
"epoch": 1.1118415646314406,
"grad_norm": 1.2836271524429321,
"learning_rate": 8.700538055939816e-05,
"loss": 0.6927,
"step": 27060
},
{
"epoch": 1.1126633248418112,
"grad_norm": 1.235249400138855,
"learning_rate": 8.687344440842249e-05,
"loss": 0.6672,
"step": 27080
},
{
"epoch": 1.1134850850521818,
"grad_norm": 1.1937826871871948,
"learning_rate": 8.67415315035664e-05,
"loss": 0.6775,
"step": 27100
},
{
"epoch": 1.1143068452625524,
"grad_norm": 1.210194706916809,
"learning_rate": 8.66096420784374e-05,
"loss": 0.6656,
"step": 27120
},
{
"epoch": 1.115128605472923,
"grad_norm": 1.1403838396072388,
"learning_rate": 8.647777636660159e-05,
"loss": 0.6406,
"step": 27140
},
{
"epoch": 1.1159503656832936,
"grad_norm": 1.0807080268859863,
"learning_rate": 8.634593460158293e-05,
"loss": 0.6712,
"step": 27160
},
{
"epoch": 1.1167721258936643,
"grad_norm": 1.2173619270324707,
"learning_rate": 8.621411701686309e-05,
"loss": 0.6671,
"step": 27180
},
{
"epoch": 1.1175938861040349,
"grad_norm": 1.2164521217346191,
"learning_rate": 8.608232384588086e-05,
"loss": 0.6551,
"step": 27200
},
{
"epoch": 1.1184156463144055,
"grad_norm": 1.3138072490692139,
"learning_rate": 8.595055532203177e-05,
"loss": 0.681,
"step": 27220
},
{
"epoch": 1.119237406524776,
"grad_norm": 1.1198703050613403,
"learning_rate": 8.581881167866774e-05,
"loss": 0.6533,
"step": 27240
},
{
"epoch": 1.1200591667351467,
"grad_norm": 1.1022275686264038,
"learning_rate": 8.568709314909663e-05,
"loss": 0.6441,
"step": 27260
},
{
"epoch": 1.1208809269455173,
"grad_norm": 1.2056939601898193,
"learning_rate": 8.555539996658184e-05,
"loss": 0.6768,
"step": 27280
},
{
"epoch": 1.121702687155888,
"grad_norm": 1.2110202312469482,
"learning_rate": 8.542373236434182e-05,
"loss": 0.6646,
"step": 27300
},
{
"epoch": 1.1225244473662586,
"grad_norm": 1.0746920108795166,
"learning_rate": 8.529209057554984e-05,
"loss": 0.679,
"step": 27320
},
{
"epoch": 1.1233462075766292,
"grad_norm": 1.1810733079910278,
"learning_rate": 8.516047483333325e-05,
"loss": 0.6643,
"step": 27340
},
{
"epoch": 1.1241679677869998,
"grad_norm": 1.1760369539260864,
"learning_rate": 8.502888537077345e-05,
"loss": 0.6543,
"step": 27360
},
{
"epoch": 1.1249897279973704,
"grad_norm": 1.2052829265594482,
"learning_rate": 8.489732242090527e-05,
"loss": 0.6448,
"step": 27380
},
{
"epoch": 1.125811488207741,
"grad_norm": 1.2746154069900513,
"learning_rate": 8.476578621671659e-05,
"loss": 0.6622,
"step": 27400
},
{
"epoch": 1.1266332484181116,
"grad_norm": 1.186718225479126,
"learning_rate": 8.463427699114785e-05,
"loss": 0.6626,
"step": 27420
},
{
"epoch": 1.1274550086284822,
"grad_norm": 1.299566388130188,
"learning_rate": 8.450279497709179e-05,
"loss": 0.677,
"step": 27440
},
{
"epoch": 1.1282767688388529,
"grad_norm": 1.2594420909881592,
"learning_rate": 8.43713404073929e-05,
"loss": 0.6743,
"step": 27460
},
{
"epoch": 1.1290985290492235,
"grad_norm": 1.1725876331329346,
"learning_rate": 8.423991351484716e-05,
"loss": 0.6861,
"step": 27480
},
{
"epoch": 1.129920289259594,
"grad_norm": 1.1445516347885132,
"learning_rate": 8.410851453220143e-05,
"loss": 0.6626,
"step": 27500
},
{
"epoch": 1.1307420494699647,
"grad_norm": 1.0811272859573364,
"learning_rate": 8.397714369215324e-05,
"loss": 0.6527,
"step": 27520
},
{
"epoch": 1.1315638096803353,
"grad_norm": 1.0390878915786743,
"learning_rate": 8.384580122735024e-05,
"loss": 0.6966,
"step": 27540
},
{
"epoch": 1.132385569890706,
"grad_norm": 1.1482422351837158,
"learning_rate": 8.371448737038976e-05,
"loss": 0.6674,
"step": 27560
},
{
"epoch": 1.1332073301010765,
"grad_norm": 1.1507283449172974,
"learning_rate": 8.35832023538186e-05,
"loss": 0.6776,
"step": 27580
},
{
"epoch": 1.1340290903114472,
"grad_norm": 1.141131043434143,
"learning_rate": 8.345194641013236e-05,
"loss": 0.6824,
"step": 27600
},
{
"epoch": 1.1348508505218178,
"grad_norm": 1.1057684421539307,
"learning_rate": 8.332071977177526e-05,
"loss": 0.687,
"step": 27620
},
{
"epoch": 1.1356726107321884,
"grad_norm": 1.4246220588684082,
"learning_rate": 8.318952267113958e-05,
"loss": 0.6624,
"step": 27640
},
{
"epoch": 1.136494370942559,
"grad_norm": 0.9534235000610352,
"learning_rate": 8.305835534056524e-05,
"loss": 0.668,
"step": 27660
},
{
"epoch": 1.1373161311529296,
"grad_norm": 1.2411669492721558,
"learning_rate": 8.29272180123395e-05,
"loss": 0.6585,
"step": 27680
},
{
"epoch": 1.1381378913633002,
"grad_norm": 1.137723684310913,
"learning_rate": 8.279611091869651e-05,
"loss": 0.6684,
"step": 27700
},
{
"epoch": 1.1389596515736709,
"grad_norm": 1.1483657360076904,
"learning_rate": 8.26650342918168e-05,
"loss": 0.668,
"step": 27720
},
{
"epoch": 1.1397814117840415,
"grad_norm": 1.1523141860961914,
"learning_rate": 8.253398836382702e-05,
"loss": 0.6957,
"step": 27740
},
{
"epoch": 1.140603171994412,
"grad_norm": 1.1631639003753662,
"learning_rate": 8.240297336679942e-05,
"loss": 0.66,
"step": 27760
},
{
"epoch": 1.1414249322047827,
"grad_norm": 1.2860993146896362,
"learning_rate": 8.227198953275142e-05,
"loss": 0.685,
"step": 27780
},
{
"epoch": 1.1422466924151533,
"grad_norm": 1.236307144165039,
"learning_rate": 8.214103709364535e-05,
"loss": 0.663,
"step": 27800
},
{
"epoch": 1.143068452625524,
"grad_norm": 1.1525200605392456,
"learning_rate": 8.201011628138789e-05,
"loss": 0.6581,
"step": 27820
},
{
"epoch": 1.1438902128358945,
"grad_norm": 1.1676980257034302,
"learning_rate": 8.187922732782976e-05,
"loss": 0.6768,
"step": 27840
},
{
"epoch": 1.1447119730462652,
"grad_norm": 1.0874487161636353,
"learning_rate": 8.174837046476518e-05,
"loss": 0.6336,
"step": 27860
},
{
"epoch": 1.1455337332566358,
"grad_norm": 0.9278498291969299,
"learning_rate": 8.16175459239316e-05,
"loss": 0.6767,
"step": 27880
},
{
"epoch": 1.1463554934670064,
"grad_norm": 1.039865255355835,
"learning_rate": 8.148675393700918e-05,
"loss": 0.6686,
"step": 27900
},
{
"epoch": 1.147177253677377,
"grad_norm": 1.0158859491348267,
"learning_rate": 8.135599473562048e-05,
"loss": 0.685,
"step": 27920
},
{
"epoch": 1.1479990138877476,
"grad_norm": 1.2587593793869019,
"learning_rate": 8.122526855132997e-05,
"loss": 0.6606,
"step": 27940
},
{
"epoch": 1.1488207740981182,
"grad_norm": 1.2706636190414429,
"learning_rate": 8.109457561564373e-05,
"loss": 0.6761,
"step": 27960
},
{
"epoch": 1.1496425343084888,
"grad_norm": 1.0990588665008545,
"learning_rate": 8.09639161600088e-05,
"loss": 0.6512,
"step": 27980
},
{
"epoch": 1.1504642945188595,
"grad_norm": 1.1541229486465454,
"learning_rate": 8.084635146662451e-05,
"loss": 0.663,
"step": 28000
},
{
"epoch": 1.1504642945188595,
"eval_loss": 0.9906555414199829,
"eval_runtime": 16.5563,
"eval_samples_per_second": 158.248,
"eval_steps_per_second": 4.953,
"step": 28000
},
{
"epoch": 1.15128605472923,
"grad_norm": 1.1189225912094116,
"learning_rate": 8.071575626051133e-05,
"loss": 0.6602,
"step": 28020
},
{
"epoch": 1.1521078149396007,
"grad_norm": 1.2261697053909302,
"learning_rate": 8.058519520530936e-05,
"loss": 0.6556,
"step": 28040
},
{
"epoch": 1.1529295751499713,
"grad_norm": 1.10615074634552,
"learning_rate": 8.045466853223223e-05,
"loss": 0.6632,
"step": 28060
},
{
"epoch": 1.153751335360342,
"grad_norm": 1.2373408079147339,
"learning_rate": 8.032417647243263e-05,
"loss": 0.6697,
"step": 28080
},
{
"epoch": 1.1545730955707125,
"grad_norm": 1.282421588897705,
"learning_rate": 8.019371925700199e-05,
"loss": 0.6827,
"step": 28100
},
{
"epoch": 1.1553948557810831,
"grad_norm": 1.3164217472076416,
"learning_rate": 8.006329711696991e-05,
"loss": 0.6604,
"step": 28120
},
{
"epoch": 1.1562166159914538,
"grad_norm": 1.0808019638061523,
"learning_rate": 7.993291028330409e-05,
"loss": 0.6437,
"step": 28140
},
{
"epoch": 1.1570383762018244,
"grad_norm": 1.116471767425537,
"learning_rate": 7.980255898690942e-05,
"loss": 0.6599,
"step": 28160
},
{
"epoch": 1.157860136412195,
"grad_norm": 1.13624906539917,
"learning_rate": 7.967224345862811e-05,
"loss": 0.6707,
"step": 28180
},
{
"epoch": 1.1586818966225656,
"grad_norm": 1.1284352540969849,
"learning_rate": 7.954196392923891e-05,
"loss": 0.6627,
"step": 28200
},
{
"epoch": 1.1595036568329362,
"grad_norm": 0.9575105905532837,
"learning_rate": 7.941172062945683e-05,
"loss": 0.669,
"step": 28220
},
{
"epoch": 1.1603254170433068,
"grad_norm": 1.4857319593429565,
"learning_rate": 7.928151378993277e-05,
"loss": 0.684,
"step": 28240
},
{
"epoch": 1.1611471772536774,
"grad_norm": 1.1151981353759766,
"learning_rate": 7.915134364125295e-05,
"loss": 0.6515,
"step": 28260
},
{
"epoch": 1.161968937464048,
"grad_norm": 1.0837492942810059,
"learning_rate": 7.902121041393876e-05,
"loss": 0.6808,
"step": 28280
},
{
"epoch": 1.1627906976744187,
"grad_norm": 1.3104796409606934,
"learning_rate": 7.889111433844603e-05,
"loss": 0.6816,
"step": 28300
},
{
"epoch": 1.1636124578847893,
"grad_norm": 1.266835331916809,
"learning_rate": 7.876105564516498e-05,
"loss": 0.6494,
"step": 28320
},
{
"epoch": 1.16443421809516,
"grad_norm": 1.1202675104141235,
"learning_rate": 7.863103456441951e-05,
"loss": 0.6517,
"step": 28340
},
{
"epoch": 1.1652559783055305,
"grad_norm": 1.1493626832962036,
"learning_rate": 7.850105132646699e-05,
"loss": 0.6545,
"step": 28360
},
{
"epoch": 1.1660777385159011,
"grad_norm": 1.259341835975647,
"learning_rate": 7.837110616149767e-05,
"loss": 0.6606,
"step": 28380
},
{
"epoch": 1.1668994987262717,
"grad_norm": 1.210664987564087,
"learning_rate": 7.824119929963444e-05,
"loss": 0.645,
"step": 28400
},
{
"epoch": 1.1677212589366424,
"grad_norm": 1.1816967725753784,
"learning_rate": 7.81113309709324e-05,
"loss": 0.6413,
"step": 28420
},
{
"epoch": 1.168543019147013,
"grad_norm": 1.0502477884292603,
"learning_rate": 7.79815014053783e-05,
"loss": 0.6552,
"step": 28440
},
{
"epoch": 1.1693647793573836,
"grad_norm": 1.1239842176437378,
"learning_rate": 7.785171083289039e-05,
"loss": 0.6106,
"step": 28460
},
{
"epoch": 1.1701865395677542,
"grad_norm": 1.163809061050415,
"learning_rate": 7.772195948331769e-05,
"loss": 0.6831,
"step": 28480
},
{
"epoch": 1.1710082997781248,
"grad_norm": 1.0698950290679932,
"learning_rate": 7.759873224073584e-05,
"loss": 0.6741,
"step": 28500
},
{
"epoch": 1.1718300599884954,
"grad_norm": 1.140785574913025,
"learning_rate": 7.7469058036688e-05,
"loss": 0.6761,
"step": 28520
},
{
"epoch": 1.172651820198866,
"grad_norm": 1.134072184562683,
"learning_rate": 7.733942373320407e-05,
"loss": 0.6785,
"step": 28540
},
{
"epoch": 1.1734735804092367,
"grad_norm": 1.0939021110534668,
"learning_rate": 7.720982955985641e-05,
"loss": 0.6478,
"step": 28560
},
{
"epoch": 1.1742953406196073,
"grad_norm": 1.268872618675232,
"learning_rate": 7.708027574614631e-05,
"loss": 0.6623,
"step": 28580
},
{
"epoch": 1.175117100829978,
"grad_norm": 1.1508769989013672,
"learning_rate": 7.695076252150361e-05,
"loss": 0.6663,
"step": 28600
},
{
"epoch": 1.1759388610403485,
"grad_norm": 1.2125996351242065,
"learning_rate": 7.682129011528627e-05,
"loss": 0.6473,
"step": 28620
},
{
"epoch": 1.1767606212507191,
"grad_norm": 1.1959916353225708,
"learning_rate": 7.669185875677998e-05,
"loss": 0.6421,
"step": 28640
},
{
"epoch": 1.1775823814610897,
"grad_norm": 1.1845420598983765,
"learning_rate": 7.656246867519772e-05,
"loss": 0.6756,
"step": 28660
},
{
"epoch": 1.1784041416714603,
"grad_norm": 1.0986628532409668,
"learning_rate": 7.643312009967928e-05,
"loss": 0.6591,
"step": 28680
},
{
"epoch": 1.179225901881831,
"grad_norm": 1.0492193698883057,
"learning_rate": 7.630381325929113e-05,
"loss": 0.6567,
"step": 28700
},
{
"epoch": 1.1800476620922016,
"grad_norm": 1.1858165264129639,
"learning_rate": 7.617454838302567e-05,
"loss": 0.6644,
"step": 28720
},
{
"epoch": 1.1808694223025722,
"grad_norm": 1.107640266418457,
"learning_rate": 7.604532569980105e-05,
"loss": 0.6514,
"step": 28740
},
{
"epoch": 1.1816911825129428,
"grad_norm": 1.2059592008590698,
"learning_rate": 7.591614543846075e-05,
"loss": 0.6607,
"step": 28760
},
{
"epoch": 1.1825129427233134,
"grad_norm": 1.0413446426391602,
"learning_rate": 7.578700782777299e-05,
"loss": 0.6722,
"step": 28780
},
{
"epoch": 1.183334702933684,
"grad_norm": 1.18887197971344,
"learning_rate": 7.565791309643058e-05,
"loss": 0.6796,
"step": 28800
},
{
"epoch": 1.1841564631440546,
"grad_norm": 1.1347589492797852,
"learning_rate": 7.552886147305034e-05,
"loss": 0.6652,
"step": 28820
},
{
"epoch": 1.1849782233544253,
"grad_norm": 1.0669459104537964,
"learning_rate": 7.53998531861728e-05,
"loss": 0.6557,
"step": 28840
},
{
"epoch": 1.1857999835647959,
"grad_norm": 1.1914751529693604,
"learning_rate": 7.527088846426164e-05,
"loss": 0.6631,
"step": 28860
},
{
"epoch": 1.1866217437751665,
"grad_norm": 1.1654757261276245,
"learning_rate": 7.514196753570354e-05,
"loss": 0.6392,
"step": 28880
},
{
"epoch": 1.187443503985537,
"grad_norm": 1.1120076179504395,
"learning_rate": 7.501309062880745e-05,
"loss": 0.6515,
"step": 28900
},
{
"epoch": 1.1882652641959077,
"grad_norm": 1.3329730033874512,
"learning_rate": 7.488425797180449e-05,
"loss": 0.6495,
"step": 28920
},
{
"epoch": 1.1890870244062783,
"grad_norm": 1.0386924743652344,
"learning_rate": 7.475546979284738e-05,
"loss": 0.6633,
"step": 28940
},
{
"epoch": 1.189908784616649,
"grad_norm": 1.130223035812378,
"learning_rate": 7.46267263200101e-05,
"loss": 0.6608,
"step": 28960
},
{
"epoch": 1.1907305448270196,
"grad_norm": 1.205684781074524,
"learning_rate": 7.44980277812874e-05,
"loss": 0.6873,
"step": 28980
},
{
"epoch": 1.1915523050373902,
"grad_norm": 1.1943809986114502,
"learning_rate": 7.436937440459448e-05,
"loss": 0.6384,
"step": 29000
},
{
"epoch": 1.1923740652477608,
"grad_norm": 1.1930698156356812,
"learning_rate": 7.424076641776657e-05,
"loss": 0.6562,
"step": 29020
},
{
"epoch": 1.1931958254581314,
"grad_norm": 1.1172430515289307,
"learning_rate": 7.411220404855852e-05,
"loss": 0.6548,
"step": 29040
},
{
"epoch": 1.194017585668502,
"grad_norm": 1.2052860260009766,
"learning_rate": 7.398368752464438e-05,
"loss": 0.6368,
"step": 29060
},
{
"epoch": 1.1948393458788726,
"grad_norm": 1.2752981185913086,
"learning_rate": 7.385521707361705e-05,
"loss": 0.6425,
"step": 29080
},
{
"epoch": 1.1956611060892433,
"grad_norm": 1.2396671772003174,
"learning_rate": 7.372679292298781e-05,
"loss": 0.6536,
"step": 29100
},
{
"epoch": 1.1964828662996139,
"grad_norm": 1.0870732069015503,
"learning_rate": 7.359841530018589e-05,
"loss": 0.6926,
"step": 29120
},
{
"epoch": 1.1973046265099845,
"grad_norm": 1.238871455192566,
"learning_rate": 7.347008443255825e-05,
"loss": 0.6733,
"step": 29140
},
{
"epoch": 1.198126386720355,
"grad_norm": 0.9936127662658691,
"learning_rate": 7.334180054736892e-05,
"loss": 0.6675,
"step": 29160
},
{
"epoch": 1.1989481469307257,
"grad_norm": 1.1829917430877686,
"learning_rate": 7.321356387179881e-05,
"loss": 0.6501,
"step": 29180
},
{
"epoch": 1.1997699071410963,
"grad_norm": 1.218711256980896,
"learning_rate": 7.308537463294525e-05,
"loss": 0.6366,
"step": 29200
},
{
"epoch": 1.2005916673514667,
"grad_norm": 1.1491694450378418,
"learning_rate": 7.29572330578214e-05,
"loss": 0.6563,
"step": 29220
},
{
"epoch": 1.2014134275618376,
"grad_norm": 1.1249312162399292,
"learning_rate": 7.28291393733562e-05,
"loss": 0.6522,
"step": 29240
},
{
"epoch": 1.202235187772208,
"grad_norm": 1.1913214921951294,
"learning_rate": 7.270109380639374e-05,
"loss": 0.6663,
"step": 29260
},
{
"epoch": 1.2030569479825788,
"grad_norm": 1.059322714805603,
"learning_rate": 7.257309658369278e-05,
"loss": 0.6719,
"step": 29280
},
{
"epoch": 1.2038787081929492,
"grad_norm": 1.2595094442367554,
"learning_rate": 7.244514793192658e-05,
"loss": 0.6652,
"step": 29300
},
{
"epoch": 1.20470046840332,
"grad_norm": 1.1310940980911255,
"learning_rate": 7.231724807768243e-05,
"loss": 0.6286,
"step": 29320
},
{
"epoch": 1.2055222286136904,
"grad_norm": 1.1567174196243286,
"learning_rate": 7.2189397247461e-05,
"loss": 0.6545,
"step": 29340
},
{
"epoch": 1.2063439888240612,
"grad_norm": 1.2142544984817505,
"learning_rate": 7.206159566767633e-05,
"loss": 0.6717,
"step": 29360
},
{
"epoch": 1.2071657490344316,
"grad_norm": 1.1306164264678955,
"learning_rate": 7.193384356465518e-05,
"loss": 0.6589,
"step": 29380
},
{
"epoch": 1.2079875092448025,
"grad_norm": 1.1609306335449219,
"learning_rate": 7.180614116463671e-05,
"loss": 0.6393,
"step": 29400
},
{
"epoch": 1.2088092694551729,
"grad_norm": 1.0231740474700928,
"learning_rate": 7.167848869377201e-05,
"loss": 0.6623,
"step": 29420
},
{
"epoch": 1.2096310296655437,
"grad_norm": 1.1379834413528442,
"learning_rate": 7.15508863781238e-05,
"loss": 0.6451,
"step": 29440
},
{
"epoch": 1.210452789875914,
"grad_norm": 1.1678614616394043,
"learning_rate": 7.142333444366593e-05,
"loss": 0.6275,
"step": 29460
},
{
"epoch": 1.211274550086285,
"grad_norm": 1.2142267227172852,
"learning_rate": 7.129583311628307e-05,
"loss": 0.6486,
"step": 29480
},
{
"epoch": 1.2120963102966553,
"grad_norm": 1.1522984504699707,
"learning_rate": 7.116838262177025e-05,
"loss": 0.6692,
"step": 29500
},
{
"epoch": 1.2129180705070262,
"grad_norm": 1.2733930349349976,
"learning_rate": 7.104098318583256e-05,
"loss": 0.6467,
"step": 29520
},
{
"epoch": 1.2137398307173966,
"grad_norm": 1.1900440454483032,
"learning_rate": 7.09136350340845e-05,
"loss": 0.6534,
"step": 29540
},
{
"epoch": 1.2145615909277674,
"grad_norm": 1.1477757692337036,
"learning_rate": 7.078633839204985e-05,
"loss": 0.6601,
"step": 29560
},
{
"epoch": 1.2153833511381378,
"grad_norm": 1.284244418144226,
"learning_rate": 7.065909348516122e-05,
"loss": 0.6659,
"step": 29580
},
{
"epoch": 1.2162051113485086,
"grad_norm": 1.0393390655517578,
"learning_rate": 7.05319005387595e-05,
"loss": 0.6428,
"step": 29600
},
{
"epoch": 1.217026871558879,
"grad_norm": 1.0445849895477295,
"learning_rate": 7.040475977809362e-05,
"loss": 0.6619,
"step": 29620
},
{
"epoch": 1.2178486317692498,
"grad_norm": 1.2404284477233887,
"learning_rate": 7.027767142832012e-05,
"loss": 0.6424,
"step": 29640
},
{
"epoch": 1.2186703919796202,
"grad_norm": 1.1194926500320435,
"learning_rate": 7.015063571450262e-05,
"loss": 0.6492,
"step": 29660
},
{
"epoch": 1.219492152189991,
"grad_norm": 1.1228058338165283,
"learning_rate": 7.00236528616116e-05,
"loss": 0.6569,
"step": 29680
},
{
"epoch": 1.2203139124003615,
"grad_norm": 1.106655240058899,
"learning_rate": 6.989672309452398e-05,
"loss": 0.6606,
"step": 29700
},
{
"epoch": 1.2211356726107323,
"grad_norm": 1.1909033060073853,
"learning_rate": 6.976984663802252e-05,
"loss": 0.6867,
"step": 29720
},
{
"epoch": 1.2219574328211027,
"grad_norm": 1.1197348833084106,
"learning_rate": 6.964302371679578e-05,
"loss": 0.65,
"step": 29740
},
{
"epoch": 1.2227791930314735,
"grad_norm": 1.0467857122421265,
"learning_rate": 6.951625455543724e-05,
"loss": 0.6434,
"step": 29760
},
{
"epoch": 1.223600953241844,
"grad_norm": 1.1986229419708252,
"learning_rate": 6.938953937844541e-05,
"loss": 0.6278,
"step": 29780
},
{
"epoch": 1.2244227134522148,
"grad_norm": 1.1179389953613281,
"learning_rate": 6.926287841022312e-05,
"loss": 0.6584,
"step": 29800
},
{
"epoch": 1.2252444736625852,
"grad_norm": 1.0440237522125244,
"learning_rate": 6.913627187507716e-05,
"loss": 0.6305,
"step": 29820
},
{
"epoch": 1.226066233872956,
"grad_norm": 1.2614185810089111,
"learning_rate": 6.9009719997218e-05,
"loss": 0.6582,
"step": 29840
},
{
"epoch": 1.2268879940833264,
"grad_norm": 1.0907703638076782,
"learning_rate": 6.888322300075927e-05,
"loss": 0.655,
"step": 29860
},
{
"epoch": 1.2277097542936972,
"grad_norm": 1.0902374982833862,
"learning_rate": 6.875678110971738e-05,
"loss": 0.6338,
"step": 29880
},
{
"epoch": 1.2285315145040676,
"grad_norm": 1.2563419342041016,
"learning_rate": 6.863039454801119e-05,
"loss": 0.6591,
"step": 29900
},
{
"epoch": 1.2293532747144384,
"grad_norm": 1.1033470630645752,
"learning_rate": 6.850406353946158e-05,
"loss": 0.654,
"step": 29920
},
{
"epoch": 1.2301750349248088,
"grad_norm": 1.1465051174163818,
"learning_rate": 6.837778830779105e-05,
"loss": 0.6621,
"step": 29940
},
{
"epoch": 1.2309967951351797,
"grad_norm": 1.1070022583007812,
"learning_rate": 6.825156907662336e-05,
"loss": 0.6426,
"step": 29960
},
{
"epoch": 1.23181855534555,
"grad_norm": 1.1090333461761475,
"learning_rate": 6.812540606948296e-05,
"loss": 0.6625,
"step": 29980
},
{
"epoch": 1.232640315555921,
"grad_norm": 1.1961784362792969,
"learning_rate": 6.799929950979487e-05,
"loss": 0.6424,
"step": 30000
},
{
"epoch": 1.232640315555921,
"eval_loss": 0.9679434895515442,
"eval_runtime": 16.6198,
"eval_samples_per_second": 157.644,
"eval_steps_per_second": 4.934,
"step": 30000
},
{
"epoch": 1.2334620757662913,
"grad_norm": 1.2960805892944336,
"learning_rate": 6.787324962088411e-05,
"loss": 0.6821,
"step": 30020
},
{
"epoch": 1.234283835976662,
"grad_norm": 1.1521825790405273,
"learning_rate": 6.77472566259753e-05,
"loss": 0.6593,
"step": 30040
},
{
"epoch": 1.2351055961870325,
"grad_norm": 1.1858956813812256,
"learning_rate": 6.762132074819236e-05,
"loss": 0.662,
"step": 30060
},
{
"epoch": 1.2359273563974031,
"grad_norm": 1.1205803155899048,
"learning_rate": 6.749544221055808e-05,
"loss": 0.6815,
"step": 30080
},
{
"epoch": 1.2367491166077738,
"grad_norm": 0.9257388710975647,
"learning_rate": 6.736962123599355e-05,
"loss": 0.6531,
"step": 30100
},
{
"epoch": 1.2375708768181444,
"grad_norm": 0.9369202256202698,
"learning_rate": 6.724385804731811e-05,
"loss": 0.6305,
"step": 30120
},
{
"epoch": 1.238392637028515,
"grad_norm": 1.1999167203903198,
"learning_rate": 6.71181528672487e-05,
"loss": 0.6468,
"step": 30140
},
{
"epoch": 1.2392143972388856,
"grad_norm": 1.1772797107696533,
"learning_rate": 6.699250591839946e-05,
"loss": 0.654,
"step": 30160
},
{
"epoch": 1.2400361574492562,
"grad_norm": 1.1733454465866089,
"learning_rate": 6.68669174232815e-05,
"loss": 0.6439,
"step": 30180
},
{
"epoch": 1.2408579176596268,
"grad_norm": 1.0485635995864868,
"learning_rate": 6.674138760430236e-05,
"loss": 0.6505,
"step": 30200
},
{
"epoch": 1.2416796778699974,
"grad_norm": 1.1465091705322266,
"learning_rate": 6.66159166837657e-05,
"loss": 0.6591,
"step": 30220
},
{
"epoch": 1.242501438080368,
"grad_norm": 1.1369998455047607,
"learning_rate": 6.649050488387086e-05,
"loss": 0.6399,
"step": 30240
},
{
"epoch": 1.2433231982907387,
"grad_norm": 1.2065318822860718,
"learning_rate": 6.636515242671247e-05,
"loss": 0.6695,
"step": 30260
},
{
"epoch": 1.2441449585011093,
"grad_norm": 1.2178568840026855,
"learning_rate": 6.62398595342801e-05,
"loss": 0.6448,
"step": 30280
},
{
"epoch": 1.24496671871148,
"grad_norm": 1.1589614152908325,
"learning_rate": 6.611462642845782e-05,
"loss": 0.6308,
"step": 30300
},
{
"epoch": 1.2457884789218505,
"grad_norm": 1.136525273323059,
"learning_rate": 6.598945333102384e-05,
"loss": 0.6525,
"step": 30320
},
{
"epoch": 1.2466102391322211,
"grad_norm": 1.2322630882263184,
"learning_rate": 6.586434046365002e-05,
"loss": 0.639,
"step": 30340
},
{
"epoch": 1.2474319993425917,
"grad_norm": 1.171217679977417,
"learning_rate": 6.573928804790165e-05,
"loss": 0.6624,
"step": 30360
},
{
"epoch": 1.2482537595529624,
"grad_norm": 1.0731596946716309,
"learning_rate": 6.561429630523694e-05,
"loss": 0.658,
"step": 30380
},
{
"epoch": 1.249075519763333,
"grad_norm": 0.9198722839355469,
"learning_rate": 6.548936545700665e-05,
"loss": 0.656,
"step": 30400
},
{
"epoch": 1.2498972799737036,
"grad_norm": 1.2230961322784424,
"learning_rate": 6.53644957244537e-05,
"loss": 0.6415,
"step": 30420
},
{
"epoch": 1.2507190401840742,
"grad_norm": 1.3008650541305542,
"learning_rate": 6.52396873287127e-05,
"loss": 0.649,
"step": 30440
},
{
"epoch": 1.2515408003944448,
"grad_norm": 1.1299681663513184,
"learning_rate": 6.511494049080982e-05,
"loss": 0.6632,
"step": 30460
},
{
"epoch": 1.2523625606048154,
"grad_norm": 1.0429800748825073,
"learning_rate": 6.499025543166205e-05,
"loss": 0.6435,
"step": 30480
},
{
"epoch": 1.253184320815186,
"grad_norm": 1.313433051109314,
"learning_rate": 6.486563237207704e-05,
"loss": 0.6573,
"step": 30500
},
{
"epoch": 1.2540060810255567,
"grad_norm": 1.0738648176193237,
"learning_rate": 6.47472980935819e-05,
"loss": 0.6508,
"step": 30520
},
{
"epoch": 1.2548278412359273,
"grad_norm": 1.1511154174804688,
"learning_rate": 6.462279656782608e-05,
"loss": 0.6244,
"step": 30540
},
{
"epoch": 1.255649601446298,
"grad_norm": 1.1229740381240845,
"learning_rate": 6.450457814479982e-05,
"loss": 0.6303,
"step": 30560
},
{
"epoch": 1.2564713616566685,
"grad_norm": 1.1822351217269897,
"learning_rate": 6.438019899125807e-05,
"loss": 0.6296,
"step": 30580
},
{
"epoch": 1.2572931218670391,
"grad_norm": 1.127118706703186,
"learning_rate": 6.425588291764203e-05,
"loss": 0.6664,
"step": 30600
},
{
"epoch": 1.2581148820774097,
"grad_norm": 1.275978684425354,
"learning_rate": 6.413163014410595e-05,
"loss": 0.651,
"step": 30620
},
{
"epoch": 1.2589366422877803,
"grad_norm": 1.1102421283721924,
"learning_rate": 6.400744089069191e-05,
"loss": 0.6459,
"step": 30640
},
{
"epoch": 1.259758402498151,
"grad_norm": 1.2421194314956665,
"learning_rate": 6.388331537732954e-05,
"loss": 0.6464,
"step": 30660
},
{
"epoch": 1.2605801627085216,
"grad_norm": 1.248716950416565,
"learning_rate": 6.37592538238356e-05,
"loss": 0.6495,
"step": 30680
},
{
"epoch": 1.2614019229188922,
"grad_norm": 1.144371509552002,
"learning_rate": 6.363525644991348e-05,
"loss": 0.6405,
"step": 30700
},
{
"epoch": 1.2622236831292628,
"grad_norm": 1.2534958124160767,
"learning_rate": 6.351132347515303e-05,
"loss": 0.6581,
"step": 30720
},
{
"epoch": 1.2630454433396334,
"grad_norm": 1.2671653032302856,
"learning_rate": 6.338745511902997e-05,
"loss": 0.6547,
"step": 30740
},
{
"epoch": 1.263867203550004,
"grad_norm": 1.1555167436599731,
"learning_rate": 6.326365160090566e-05,
"loss": 0.6461,
"step": 30760
},
{
"epoch": 1.2646889637603747,
"grad_norm": 1.1524670124053955,
"learning_rate": 6.313991314002663e-05,
"loss": 0.6622,
"step": 30780
},
{
"epoch": 1.2655107239707453,
"grad_norm": 1.204883337020874,
"learning_rate": 6.301623995552409e-05,
"loss": 0.6624,
"step": 30800
},
{
"epoch": 1.2663324841811159,
"grad_norm": 1.0530204772949219,
"learning_rate": 6.289263226641375e-05,
"loss": 0.6655,
"step": 30820
},
{
"epoch": 1.2671542443914865,
"grad_norm": 1.1731172800064087,
"learning_rate": 6.276909029159536e-05,
"loss": 0.6524,
"step": 30840
},
{
"epoch": 1.267976004601857,
"grad_norm": 1.1701477766036987,
"learning_rate": 6.26456142498522e-05,
"loss": 0.6265,
"step": 30860
},
{
"epoch": 1.2687977648122277,
"grad_norm": 1.3820420503616333,
"learning_rate": 6.252220435985088e-05,
"loss": 0.6387,
"step": 30880
},
{
"epoch": 1.2696195250225983,
"grad_norm": 1.1477141380310059,
"learning_rate": 6.239886084014081e-05,
"loss": 0.6521,
"step": 30900
},
{
"epoch": 1.270441285232969,
"grad_norm": 1.2501583099365234,
"learning_rate": 6.22755839091538e-05,
"loss": 0.6452,
"step": 30920
},
{
"epoch": 1.2712630454433396,
"grad_norm": 1.110160231590271,
"learning_rate": 6.215237378520388e-05,
"loss": 0.6287,
"step": 30940
},
{
"epoch": 1.2720848056537102,
"grad_norm": 1.2295660972595215,
"learning_rate": 6.202923068648665e-05,
"loss": 0.6611,
"step": 30960
},
{
"epoch": 1.2729065658640808,
"grad_norm": 1.2019611597061157,
"learning_rate": 6.190615483107911e-05,
"loss": 0.6363,
"step": 30980
},
{
"epoch": 1.2737283260744514,
"grad_norm": 1.2087301015853882,
"learning_rate": 6.17831464369391e-05,
"loss": 0.6613,
"step": 31000
},
{
"epoch": 1.274550086284822,
"grad_norm": 1.179447889328003,
"learning_rate": 6.166020572190501e-05,
"loss": 0.6497,
"step": 31020
},
{
"epoch": 1.2753718464951926,
"grad_norm": 1.183194637298584,
"learning_rate": 6.153733290369536e-05,
"loss": 0.6464,
"step": 31040
},
{
"epoch": 1.2761936067055633,
"grad_norm": 1.2545372247695923,
"learning_rate": 6.14145281999085e-05,
"loss": 0.6517,
"step": 31060
},
{
"epoch": 1.2770153669159339,
"grad_norm": 1.143701434135437,
"learning_rate": 6.129179182802208e-05,
"loss": 0.629,
"step": 31080
},
{
"epoch": 1.2778371271263045,
"grad_norm": 1.2404268980026245,
"learning_rate": 6.116912400539277e-05,
"loss": 0.6209,
"step": 31100
},
{
"epoch": 1.278658887336675,
"grad_norm": 1.0369129180908203,
"learning_rate": 6.104652494925587e-05,
"loss": 0.637,
"step": 31120
},
{
"epoch": 1.2794806475470457,
"grad_norm": 1.1535693407058716,
"learning_rate": 6.092399487672482e-05,
"loss": 0.6406,
"step": 31140
},
{
"epoch": 1.2803024077574163,
"grad_norm": 1.143570899963379,
"learning_rate": 6.0801534004791005e-05,
"loss": 0.6631,
"step": 31160
},
{
"epoch": 1.281124167967787,
"grad_norm": 1.2489653825759888,
"learning_rate": 6.067914255032314e-05,
"loss": 0.6433,
"step": 31180
},
{
"epoch": 1.2819459281781576,
"grad_norm": 1.0466420650482178,
"learning_rate": 6.05568207300671e-05,
"loss": 0.6404,
"step": 31200
},
{
"epoch": 1.2827676883885282,
"grad_norm": 1.2191749811172485,
"learning_rate": 6.043456876064546e-05,
"loss": 0.6197,
"step": 31220
},
{
"epoch": 1.2835894485988988,
"grad_norm": 1.1350699663162231,
"learning_rate": 6.0312386858556956e-05,
"loss": 0.6605,
"step": 31240
},
{
"epoch": 1.2844112088092694,
"grad_norm": 1.1058688163757324,
"learning_rate": 6.0190275240176386e-05,
"loss": 0.6577,
"step": 31260
},
{
"epoch": 1.28523296901964,
"grad_norm": 1.0823215246200562,
"learning_rate": 6.006823412175404e-05,
"loss": 0.6389,
"step": 31280
},
{
"epoch": 1.2860547292300106,
"grad_norm": 1.1849805116653442,
"learning_rate": 5.9946263719415295e-05,
"loss": 0.665,
"step": 31300
},
{
"epoch": 1.2868764894403812,
"grad_norm": 1.1973915100097656,
"learning_rate": 5.982436424916037e-05,
"loss": 0.6612,
"step": 31320
},
{
"epoch": 1.2876982496507519,
"grad_norm": 1.1198099851608276,
"learning_rate": 5.970253592686389e-05,
"loss": 0.6371,
"step": 31340
},
{
"epoch": 1.2885200098611225,
"grad_norm": 0.9331278204917908,
"learning_rate": 5.9580778968274354e-05,
"loss": 0.6204,
"step": 31360
},
{
"epoch": 1.289341770071493,
"grad_norm": 1.0500894784927368,
"learning_rate": 5.9459093589014e-05,
"loss": 0.6481,
"step": 31380
},
{
"epoch": 1.2901635302818637,
"grad_norm": 1.1697520017623901,
"learning_rate": 5.9337480004578285e-05,
"loss": 0.6694,
"step": 31400
},
{
"epoch": 1.2909852904922343,
"grad_norm": 1.1045079231262207,
"learning_rate": 5.921593843033548e-05,
"loss": 0.6437,
"step": 31420
},
{
"epoch": 1.291807050702605,
"grad_norm": 1.0866926908493042,
"learning_rate": 5.9094469081526396e-05,
"loss": 0.6278,
"step": 31440
},
{
"epoch": 1.2926288109129755,
"grad_norm": 1.1703931093215942,
"learning_rate": 5.8973072173263865e-05,
"loss": 0.6523,
"step": 31460
},
{
"epoch": 1.2934505711233462,
"grad_norm": 1.2150297164916992,
"learning_rate": 5.885174792053245e-05,
"loss": 0.648,
"step": 31480
},
{
"epoch": 1.2942723313337168,
"grad_norm": 1.2517179250717163,
"learning_rate": 5.8730496538188084e-05,
"loss": 0.6439,
"step": 31500
},
{
"epoch": 1.2950940915440874,
"grad_norm": 1.0839895009994507,
"learning_rate": 5.8609318240957635e-05,
"loss": 0.6365,
"step": 31520
},
{
"epoch": 1.295915851754458,
"grad_norm": 1.081815481185913,
"learning_rate": 5.848821324343854e-05,
"loss": 0.6467,
"step": 31540
},
{
"epoch": 1.2967376119648286,
"grad_norm": 1.0636696815490723,
"learning_rate": 5.836718176009848e-05,
"loss": 0.6506,
"step": 31560
},
{
"epoch": 1.2975593721751992,
"grad_norm": 1.1643255949020386,
"learning_rate": 5.825227013865635e-05,
"loss": 0.6489,
"step": 31580
},
{
"epoch": 1.2983811323855698,
"grad_norm": 1.1235414743423462,
"learning_rate": 5.813742524085972e-05,
"loss": 0.652,
"step": 31600
},
{
"epoch": 1.2992028925959405,
"grad_norm": 1.1215312480926514,
"learning_rate": 5.801660816024896e-05,
"loss": 0.6475,
"step": 31620
},
{
"epoch": 1.300024652806311,
"grad_norm": 1.1635875701904297,
"learning_rate": 5.7895865428993504e-05,
"loss": 0.6325,
"step": 31640
},
{
"epoch": 1.3008464130166817,
"grad_norm": 1.1717442274093628,
"learning_rate": 5.777519726091938e-05,
"loss": 0.6301,
"step": 31660
},
{
"epoch": 1.3016681732270523,
"grad_norm": 1.0808964967727661,
"learning_rate": 5.765460386972068e-05,
"loss": 0.636,
"step": 31680
},
{
"epoch": 1.302489933437423,
"grad_norm": 1.1285815238952637,
"learning_rate": 5.7534085468958965e-05,
"loss": 0.6405,
"step": 31700
},
{
"epoch": 1.3033116936477935,
"grad_norm": 1.0786865949630737,
"learning_rate": 5.7413642272063164e-05,
"loss": 0.6418,
"step": 31720
},
{
"epoch": 1.3041334538581641,
"grad_norm": 1.243355631828308,
"learning_rate": 5.729327449232873e-05,
"loss": 0.6563,
"step": 31740
},
{
"epoch": 1.3049552140685348,
"grad_norm": 1.1835846900939941,
"learning_rate": 5.717298234291786e-05,
"loss": 0.6447,
"step": 31760
},
{
"epoch": 1.3057769742789054,
"grad_norm": 1.239445686340332,
"learning_rate": 5.705276603685868e-05,
"loss": 0.6445,
"step": 31780
},
{
"epoch": 1.306598734489276,
"grad_norm": 1.0167465209960938,
"learning_rate": 5.693262578704492e-05,
"loss": 0.6587,
"step": 31800
},
{
"epoch": 1.3074204946996466,
"grad_norm": 1.2314865589141846,
"learning_rate": 5.6812561806235855e-05,
"loss": 0.6464,
"step": 31820
},
{
"epoch": 1.3082422549100172,
"grad_norm": 1.1035876274108887,
"learning_rate": 5.6692574307055476e-05,
"loss": 0.6549,
"step": 31840
},
{
"epoch": 1.3090640151203878,
"grad_norm": 1.2363945245742798,
"learning_rate": 5.6572663501992416e-05,
"loss": 0.6621,
"step": 31860
},
{
"epoch": 1.3098857753307585,
"grad_norm": 0.9777959585189819,
"learning_rate": 5.645282960339944e-05,
"loss": 0.6054,
"step": 31880
},
{
"epoch": 1.310707535541129,
"grad_norm": 1.1316789388656616,
"learning_rate": 5.633307282349325e-05,
"loss": 0.6553,
"step": 31900
},
{
"epoch": 1.3115292957514997,
"grad_norm": 1.0634005069732666,
"learning_rate": 5.6213393374353814e-05,
"loss": 0.619,
"step": 31920
},
{
"epoch": 1.3123510559618703,
"grad_norm": 1.0504931211471558,
"learning_rate": 5.609379146792426e-05,
"loss": 0.6504,
"step": 31940
},
{
"epoch": 1.313172816172241,
"grad_norm": 1.1487746238708496,
"learning_rate": 5.597426731601034e-05,
"loss": 0.6555,
"step": 31960
},
{
"epoch": 1.3139945763826115,
"grad_norm": 1.0530478954315186,
"learning_rate": 5.585482113028009e-05,
"loss": 0.6315,
"step": 31980
},
{
"epoch": 1.3148163365929821,
"grad_norm": 1.161407709121704,
"learning_rate": 5.5735453122263595e-05,
"loss": 0.6467,
"step": 32000
},
{
"epoch": 1.3148163365929821,
"eval_loss": 0.9416676163673401,
"eval_runtime": 16.767,
"eval_samples_per_second": 156.259,
"eval_steps_per_second": 4.891,
"step": 32000
},
{
"epoch": 1.3156380968033528,
"grad_norm": 1.104887843132019,
"learning_rate": 5.5616163503352314e-05,
"loss": 0.6367,
"step": 32020
},
{
"epoch": 1.3164598570137234,
"grad_norm": 0.9899182319641113,
"learning_rate": 5.5496952484799114e-05,
"loss": 0.6521,
"step": 32040
},
{
"epoch": 1.317281617224094,
"grad_norm": 1.3830389976501465,
"learning_rate": 5.537782027771736e-05,
"loss": 0.6269,
"step": 32060
},
{
"epoch": 1.3181033774344646,
"grad_norm": 1.158103108406067,
"learning_rate": 5.5258767093081165e-05,
"loss": 0.6405,
"step": 32080
},
{
"epoch": 1.3189251376448352,
"grad_norm": 1.2154991626739502,
"learning_rate": 5.513979314172449e-05,
"loss": 0.626,
"step": 32100
},
{
"epoch": 1.3197468978552058,
"grad_norm": 1.2084523439407349,
"learning_rate": 5.502089863434101e-05,
"loss": 0.6438,
"step": 32120
},
{
"epoch": 1.3205686580655764,
"grad_norm": 1.1323741674423218,
"learning_rate": 5.490208378148385e-05,
"loss": 0.6553,
"step": 32140
},
{
"epoch": 1.321390418275947,
"grad_norm": 1.1183146238327026,
"learning_rate": 5.478334879356488e-05,
"loss": 0.6435,
"step": 32160
},
{
"epoch": 1.3222121784863177,
"grad_norm": 1.035343885421753,
"learning_rate": 5.466469388085467e-05,
"loss": 0.6335,
"step": 32180
},
{
"epoch": 1.3230339386966883,
"grad_norm": 0.9998947381973267,
"learning_rate": 5.454611925348191e-05,
"loss": 0.6473,
"step": 32200
},
{
"epoch": 1.323855698907059,
"grad_norm": 1.1686418056488037,
"learning_rate": 5.442762512143311e-05,
"loss": 0.6328,
"step": 32220
},
{
"epoch": 1.3246774591174295,
"grad_norm": 1.1200451850891113,
"learning_rate": 5.4309211694552334e-05,
"loss": 0.6295,
"step": 32240
},
{
"epoch": 1.3254992193278001,
"grad_norm": 1.2187868356704712,
"learning_rate": 5.41908791825406e-05,
"loss": 0.6223,
"step": 32260
},
{
"epoch": 1.3263209795381707,
"grad_norm": 1.0789508819580078,
"learning_rate": 5.4072627794955697e-05,
"loss": 0.6311,
"step": 32280
},
{
"epoch": 1.3271427397485414,
"grad_norm": 1.2329927682876587,
"learning_rate": 5.395445774121166e-05,
"loss": 0.6376,
"step": 32300
},
{
"epoch": 1.327964499958912,
"grad_norm": 1.2252501249313354,
"learning_rate": 5.3836369230578665e-05,
"loss": 0.6175,
"step": 32320
},
{
"epoch": 1.3287862601692826,
"grad_norm": 1.0544497966766357,
"learning_rate": 5.371836247218232e-05,
"loss": 0.6348,
"step": 32340
},
{
"epoch": 1.3296080203796532,
"grad_norm": 1.1359783411026,
"learning_rate": 5.360043767500348e-05,
"loss": 0.6333,
"step": 32360
},
{
"epoch": 1.3304297805900238,
"grad_norm": 1.1277779340744019,
"learning_rate": 5.3482595047878004e-05,
"loss": 0.64,
"step": 32380
},
{
"epoch": 1.3312515408003944,
"grad_norm": 1.1540424823760986,
"learning_rate": 5.3364834799495934e-05,
"loss": 0.6403,
"step": 32400
},
{
"epoch": 1.332073301010765,
"grad_norm": 1.007522702217102,
"learning_rate": 5.324715713840174e-05,
"loss": 0.6374,
"step": 32420
},
{
"epoch": 1.3328950612211357,
"grad_norm": 1.1693965196609497,
"learning_rate": 5.3129562272993437e-05,
"loss": 0.6387,
"step": 32440
},
{
"epoch": 1.3337168214315063,
"grad_norm": 1.1591569185256958,
"learning_rate": 5.301205041152253e-05,
"loss": 0.6613,
"step": 32460
},
{
"epoch": 1.3345385816418769,
"grad_norm": 1.1624069213867188,
"learning_rate": 5.2900491215067996e-05,
"loss": 0.6349,
"step": 32480
},
{
"epoch": 1.3353603418522475,
"grad_norm": 1.181348204612732,
"learning_rate": 5.2783141809701195e-05,
"loss": 0.6565,
"step": 32500
},
{
"epoch": 1.3361821020626181,
"grad_norm": 1.1376749277114868,
"learning_rate": 5.266587602175571e-05,
"loss": 0.642,
"step": 32520
},
{
"epoch": 1.3370038622729887,
"grad_norm": 1.161271572113037,
"learning_rate": 5.25486940589003e-05,
"loss": 0.6378,
"step": 32540
},
{
"epoch": 1.3378256224833593,
"grad_norm": 1.0550984144210815,
"learning_rate": 5.243159612865513e-05,
"loss": 0.6456,
"step": 32560
},
{
"epoch": 1.33864738269373,
"grad_norm": 1.299521803855896,
"learning_rate": 5.2314582438391666e-05,
"loss": 0.6495,
"step": 32580
},
{
"epoch": 1.3394691429041006,
"grad_norm": 1.115898847579956,
"learning_rate": 5.2197653195332094e-05,
"loss": 0.6439,
"step": 32600
},
{
"epoch": 1.3402909031144712,
"grad_norm": 1.1047664880752563,
"learning_rate": 5.208080860654916e-05,
"loss": 0.6428,
"step": 32620
},
{
"epoch": 1.3411126633248418,
"grad_norm": 1.0742267370224,
"learning_rate": 5.196404887896562e-05,
"loss": 0.633,
"step": 32640
},
{
"epoch": 1.3419344235352124,
"grad_norm": 1.079347014427185,
"learning_rate": 5.18473742193539e-05,
"loss": 0.628,
"step": 32660
},
{
"epoch": 1.342756183745583,
"grad_norm": 1.0724397897720337,
"learning_rate": 5.1730784834336e-05,
"loss": 0.6421,
"step": 32680
},
{
"epoch": 1.3435779439559536,
"grad_norm": 1.193382740020752,
"learning_rate": 5.161428093038255e-05,
"loss": 0.6263,
"step": 32700
},
{
"epoch": 1.3443997041663243,
"grad_norm": 1.219397783279419,
"learning_rate": 5.149786271381314e-05,
"loss": 0.637,
"step": 32720
},
{
"epoch": 1.3452214643766949,
"grad_norm": 1.2231544256210327,
"learning_rate": 5.1381530390795365e-05,
"loss": 0.6491,
"step": 32740
},
{
"epoch": 1.3460432245870655,
"grad_norm": 1.1197011470794678,
"learning_rate": 5.1265284167344906e-05,
"loss": 0.6405,
"step": 32760
},
{
"epoch": 1.346864984797436,
"grad_norm": 1.1939942836761475,
"learning_rate": 5.114912424932485e-05,
"loss": 0.6389,
"step": 32780
},
{
"epoch": 1.3476867450078067,
"grad_norm": 1.029039740562439,
"learning_rate": 5.103305084244545e-05,
"loss": 0.6434,
"step": 32800
},
{
"epoch": 1.3485085052181773,
"grad_norm": 1.1776632070541382,
"learning_rate": 5.0917064152263804e-05,
"loss": 0.6287,
"step": 32820
},
{
"epoch": 1.349330265428548,
"grad_norm": 1.1028509140014648,
"learning_rate": 5.080116438418334e-05,
"loss": 0.6407,
"step": 32840
},
{
"epoch": 1.3501520256389186,
"grad_norm": 1.1051464080810547,
"learning_rate": 5.068535174345373e-05,
"loss": 0.6248,
"step": 32860
},
{
"epoch": 1.3509737858492892,
"grad_norm": 1.0466769933700562,
"learning_rate": 5.056962643517014e-05,
"loss": 0.6292,
"step": 32880
},
{
"epoch": 1.3517955460596598,
"grad_norm": 1.222561001777649,
"learning_rate": 5.045398866427331e-05,
"loss": 0.6223,
"step": 32900
},
{
"epoch": 1.3526173062700304,
"grad_norm": 1.2215903997421265,
"learning_rate": 5.03384386355487e-05,
"loss": 0.6381,
"step": 32920
},
{
"epoch": 1.353439066480401,
"grad_norm": 1.1650848388671875,
"learning_rate": 5.02229765536266e-05,
"loss": 0.6543,
"step": 32940
},
{
"epoch": 1.3542608266907716,
"grad_norm": 1.1978867053985596,
"learning_rate": 5.010760262298145e-05,
"loss": 0.6255,
"step": 32960
},
{
"epoch": 1.3550825869011422,
"grad_norm": 1.2988346815109253,
"learning_rate": 4.999231704793156e-05,
"loss": 0.6388,
"step": 32980
},
{
"epoch": 1.3559043471115129,
"grad_norm": 0.9702379107475281,
"learning_rate": 4.987712003263892e-05,
"loss": 0.6343,
"step": 33000
},
{
"epoch": 1.3567261073218835,
"grad_norm": 1.037627100944519,
"learning_rate": 4.976201178110843e-05,
"loss": 0.6351,
"step": 33020
},
{
"epoch": 1.357547867532254,
"grad_norm": 1.2792913913726807,
"learning_rate": 4.964699249718805e-05,
"loss": 0.6324,
"step": 33040
},
{
"epoch": 1.3583696277426247,
"grad_norm": 1.1782094240188599,
"learning_rate": 4.953206238456804e-05,
"loss": 0.6144,
"step": 33060
},
{
"epoch": 1.3591913879529953,
"grad_norm": 1.1542799472808838,
"learning_rate": 4.941722164678074e-05,
"loss": 0.6236,
"step": 33080
},
{
"epoch": 1.360013148163366,
"grad_norm": 1.2403596639633179,
"learning_rate": 4.930247048720035e-05,
"loss": 0.6325,
"step": 33100
},
{
"epoch": 1.3608349083737366,
"grad_norm": 1.1392772197723389,
"learning_rate": 4.918780910904229e-05,
"loss": 0.6363,
"step": 33120
},
{
"epoch": 1.3616566685841072,
"grad_norm": 1.0998027324676514,
"learning_rate": 4.907323771536304e-05,
"loss": 0.6461,
"step": 33140
},
{
"epoch": 1.3624784287944778,
"grad_norm": 1.139052391052246,
"learning_rate": 4.895875650905967e-05,
"loss": 0.6196,
"step": 33160
},
{
"epoch": 1.3633001890048484,
"grad_norm": 1.0922917127609253,
"learning_rate": 4.884436569286968e-05,
"loss": 0.6418,
"step": 33180
},
{
"epoch": 1.364121949215219,
"grad_norm": 1.2731437683105469,
"learning_rate": 4.8730065469370345e-05,
"loss": 0.6584,
"step": 33200
},
{
"epoch": 1.3649437094255896,
"grad_norm": 1.07999849319458,
"learning_rate": 4.861585604097857e-05,
"loss": 0.6337,
"step": 33220
},
{
"epoch": 1.3657654696359602,
"grad_norm": 1.270757794380188,
"learning_rate": 4.8501737609950456e-05,
"loss": 0.6189,
"step": 33240
},
{
"epoch": 1.3665872298463309,
"grad_norm": 1.1716457605361938,
"learning_rate": 4.8387710378380925e-05,
"loss": 0.6443,
"step": 33260
},
{
"epoch": 1.3674089900567015,
"grad_norm": 1.2344413995742798,
"learning_rate": 4.827377454820351e-05,
"loss": 0.648,
"step": 33280
},
{
"epoch": 1.368230750267072,
"grad_norm": 1.134440541267395,
"learning_rate": 4.815993032118972e-05,
"loss": 0.6276,
"step": 33300
},
{
"epoch": 1.3690525104774427,
"grad_norm": 1.1542905569076538,
"learning_rate": 4.804617789894898e-05,
"loss": 0.632,
"step": 33320
},
{
"epoch": 1.3698742706878133,
"grad_norm": 1.0746182203292847,
"learning_rate": 4.7932517482928044e-05,
"loss": 0.6425,
"step": 33340
},
{
"epoch": 1.370696030898184,
"grad_norm": 1.2313367128372192,
"learning_rate": 4.7818949274410755e-05,
"loss": 0.6359,
"step": 33360
},
{
"epoch": 1.3715177911085545,
"grad_norm": 1.1803609132766724,
"learning_rate": 4.770547347451767e-05,
"loss": 0.6255,
"step": 33380
},
{
"epoch": 1.3723395513189252,
"grad_norm": 1.082554578781128,
"learning_rate": 4.759209028420567e-05,
"loss": 0.6194,
"step": 33400
},
{
"epoch": 1.3731613115292958,
"grad_norm": 1.1176568269729614,
"learning_rate": 4.747879990426772e-05,
"loss": 0.626,
"step": 33420
},
{
"epoch": 1.3739830717396664,
"grad_norm": 1.2282779216766357,
"learning_rate": 4.736560253533233e-05,
"loss": 0.6334,
"step": 33440
},
{
"epoch": 1.374804831950037,
"grad_norm": 1.1532173156738281,
"learning_rate": 4.725249837786333e-05,
"loss": 0.6206,
"step": 33460
},
{
"epoch": 1.3756265921604076,
"grad_norm": 1.1878107786178589,
"learning_rate": 4.713948763215943e-05,
"loss": 0.6585,
"step": 33480
},
{
"epoch": 1.3764483523707782,
"grad_norm": 1.0733743906021118,
"learning_rate": 4.7026570498354036e-05,
"loss": 0.6409,
"step": 33500
},
{
"epoch": 1.3772701125811488,
"grad_norm": 1.0726110935211182,
"learning_rate": 4.691374717641468e-05,
"loss": 0.6188,
"step": 33520
},
{
"epoch": 1.3780918727915195,
"grad_norm": 1.0967559814453125,
"learning_rate": 4.6801017866142716e-05,
"loss": 0.624,
"step": 33540
},
{
"epoch": 1.37891363300189,
"grad_norm": 1.1548309326171875,
"learning_rate": 4.668838276717321e-05,
"loss": 0.6432,
"step": 33560
},
{
"epoch": 1.3797353932122607,
"grad_norm": 1.1636457443237305,
"learning_rate": 4.6575842078974096e-05,
"loss": 0.6232,
"step": 33580
},
{
"epoch": 1.3805571534226313,
"grad_norm": 1.2843453884124756,
"learning_rate": 4.6463396000846385e-05,
"loss": 0.6349,
"step": 33600
},
{
"epoch": 1.381378913633002,
"grad_norm": 1.131871223449707,
"learning_rate": 4.635104473192334e-05,
"loss": 0.6477,
"step": 33620
},
{
"epoch": 1.3822006738433725,
"grad_norm": 1.2385400533676147,
"learning_rate": 4.62387884711705e-05,
"loss": 0.6233,
"step": 33640
},
{
"epoch": 1.3830224340537431,
"grad_norm": 1.1033365726470947,
"learning_rate": 4.612662741738501e-05,
"loss": 0.6398,
"step": 33660
},
{
"epoch": 1.3838441942641138,
"grad_norm": 1.232216477394104,
"learning_rate": 4.6014561769195476e-05,
"loss": 0.6356,
"step": 33680
},
{
"epoch": 1.3846659544744844,
"grad_norm": 1.1851789951324463,
"learning_rate": 4.5902591725061516e-05,
"loss": 0.5995,
"step": 33700
},
{
"epoch": 1.385487714684855,
"grad_norm": 1.0367991924285889,
"learning_rate": 4.5796308916997966e-05,
"loss": 0.6456,
"step": 33720
},
{
"epoch": 1.3863094748952256,
"grad_norm": 1.2613193988800049,
"learning_rate": 4.568452587095045e-05,
"loss": 0.5976,
"step": 33740
},
{
"epoch": 1.3871312351055962,
"grad_norm": 1.056667685508728,
"learning_rate": 4.557283901342667e-05,
"loss": 0.6341,
"step": 33760
},
{
"epoch": 1.3879529953159668,
"grad_norm": 0.9964428544044495,
"learning_rate": 4.546124854221549e-05,
"loss": 0.6187,
"step": 33780
},
{
"epoch": 1.3887747555263374,
"grad_norm": 1.2256003618240356,
"learning_rate": 4.5349754654934994e-05,
"loss": 0.6263,
"step": 33800
},
{
"epoch": 1.389596515736708,
"grad_norm": 1.1462249755859375,
"learning_rate": 4.523835754903235e-05,
"loss": 0.6133,
"step": 33820
},
{
"epoch": 1.3904182759470787,
"grad_norm": 1.1194911003112793,
"learning_rate": 4.512705742178317e-05,
"loss": 0.6437,
"step": 33840
},
{
"epoch": 1.3912400361574493,
"grad_norm": 1.1602319478988647,
"learning_rate": 4.501585447029154e-05,
"loss": 0.6323,
"step": 33860
},
{
"epoch": 1.39206179636782,
"grad_norm": 1.1044458150863647,
"learning_rate": 4.490474889148918e-05,
"loss": 0.6317,
"step": 33880
},
{
"epoch": 1.3928835565781905,
"grad_norm": 1.1019541025161743,
"learning_rate": 4.479374088213561e-05,
"loss": 0.6154,
"step": 33900
},
{
"epoch": 1.3937053167885611,
"grad_norm": 1.163619875907898,
"learning_rate": 4.468283063881745e-05,
"loss": 0.64,
"step": 33920
},
{
"epoch": 1.3945270769989317,
"grad_norm": 1.2417570352554321,
"learning_rate": 4.4572018357948163e-05,
"loss": 0.6219,
"step": 33940
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.0506731271743774,
"learning_rate": 4.446130423576788e-05,
"loss": 0.6288,
"step": 33960
},
{
"epoch": 1.396170597419673,
"grad_norm": 1.1237679719924927,
"learning_rate": 4.4350688468342625e-05,
"loss": 0.6271,
"step": 33980
},
{
"epoch": 1.3969923576300436,
"grad_norm": 1.254115343093872,
"learning_rate": 4.424017125156454e-05,
"loss": 0.616,
"step": 34000
},
{
"epoch": 1.3969923576300436,
"eval_loss": 0.924524188041687,
"eval_runtime": 16.5191,
"eval_samples_per_second": 158.604,
"eval_steps_per_second": 4.964,
"step": 34000
},
{
"epoch": 1.3978141178404142,
"grad_norm": 1.0411611795425415,
"learning_rate": 4.412975278115104e-05,
"loss": 0.6529,
"step": 34020
},
{
"epoch": 1.3986358780507848,
"grad_norm": 1.1783146858215332,
"learning_rate": 4.401943325264478e-05,
"loss": 0.6594,
"step": 34040
},
{
"epoch": 1.3994576382611554,
"grad_norm": 1.1332125663757324,
"learning_rate": 4.390921286141314e-05,
"loss": 0.6194,
"step": 34060
},
{
"epoch": 1.400279398471526,
"grad_norm": 1.1602998971939087,
"learning_rate": 4.3799091802647954e-05,
"loss": 0.6178,
"step": 34080
},
{
"epoch": 1.4011011586818967,
"grad_norm": 1.1415718793869019,
"learning_rate": 4.368907027136512e-05,
"loss": 0.6115,
"step": 34100
},
{
"epoch": 1.4019229188922673,
"grad_norm": 1.0940213203430176,
"learning_rate": 4.3579148462404273e-05,
"loss": 0.6163,
"step": 34120
},
{
"epoch": 1.402744679102638,
"grad_norm": 1.0750993490219116,
"learning_rate": 4.346932657042855e-05,
"loss": 0.6368,
"step": 34140
},
{
"epoch": 1.4035664393130085,
"grad_norm": 1.1632193326950073,
"learning_rate": 4.335960478992399e-05,
"loss": 0.6357,
"step": 34160
},
{
"epoch": 1.4043881995233791,
"grad_norm": 0.9799935221672058,
"learning_rate": 4.324998331519954e-05,
"loss": 0.6241,
"step": 34180
},
{
"epoch": 1.4052099597337497,
"grad_norm": 1.2228275537490845,
"learning_rate": 4.314046234038624e-05,
"loss": 0.6131,
"step": 34200
},
{
"epoch": 1.4060317199441204,
"grad_norm": 1.1572198867797852,
"learning_rate": 4.3031042059437423e-05,
"loss": 0.6143,
"step": 34220
},
{
"epoch": 1.406853480154491,
"grad_norm": 1.0190701484680176,
"learning_rate": 4.292172266612794e-05,
"loss": 0.6292,
"step": 34240
},
{
"epoch": 1.4076752403648616,
"grad_norm": 1.032221794128418,
"learning_rate": 4.2812504354053986e-05,
"loss": 0.6137,
"step": 34260
},
{
"epoch": 1.4084970005752322,
"grad_norm": 1.2286864519119263,
"learning_rate": 4.270338731663285e-05,
"loss": 0.6144,
"step": 34280
},
{
"epoch": 1.4093187607856028,
"grad_norm": 1.0505238771438599,
"learning_rate": 4.259437174710239e-05,
"loss": 0.632,
"step": 34300
},
{
"epoch": 1.4101405209959734,
"grad_norm": 1.101257562637329,
"learning_rate": 4.248545783852077e-05,
"loss": 0.6191,
"step": 34320
},
{
"epoch": 1.410962281206344,
"grad_norm": 1.1456806659698486,
"learning_rate": 4.237664578376611e-05,
"loss": 0.6128,
"step": 34340
},
{
"epoch": 1.4117840414167147,
"grad_norm": 1.186767339706421,
"learning_rate": 4.226793577553626e-05,
"loss": 0.6076,
"step": 34360
},
{
"epoch": 1.4126058016270853,
"grad_norm": 1.081678867340088,
"learning_rate": 4.215932800634823e-05,
"loss": 0.5981,
"step": 34380
},
{
"epoch": 1.4134275618374559,
"grad_norm": 1.1703848838806152,
"learning_rate": 4.205082266853803e-05,
"loss": 0.6327,
"step": 34400
},
{
"epoch": 1.4142493220478265,
"grad_norm": 1.0651689767837524,
"learning_rate": 4.194241995426025e-05,
"loss": 0.6534,
"step": 34420
},
{
"epoch": 1.415071082258197,
"grad_norm": 0.9564984440803528,
"learning_rate": 4.183412005548771e-05,
"loss": 0.6247,
"step": 34440
},
{
"epoch": 1.4158928424685677,
"grad_norm": 1.0257432460784912,
"learning_rate": 4.172592316401129e-05,
"loss": 0.6221,
"step": 34460
},
{
"epoch": 1.4167146026789383,
"grad_norm": 1.0298326015472412,
"learning_rate": 4.1617829471439286e-05,
"loss": 0.6157,
"step": 34480
},
{
"epoch": 1.417536362889309,
"grad_norm": 1.0720484256744385,
"learning_rate": 4.150983916919735e-05,
"loss": 0.6086,
"step": 34500
},
{
"epoch": 1.4183581230996796,
"grad_norm": 1.0548421144485474,
"learning_rate": 4.140195244852797e-05,
"loss": 0.6133,
"step": 34520
},
{
"epoch": 1.4191798833100502,
"grad_norm": 1.1134650707244873,
"learning_rate": 4.1294169500490244e-05,
"loss": 0.613,
"step": 34540
},
{
"epoch": 1.4200016435204208,
"grad_norm": 1.135111689567566,
"learning_rate": 4.1186490515959456e-05,
"loss": 0.6121,
"step": 34560
},
{
"epoch": 1.4208234037307914,
"grad_norm": 1.080965280532837,
"learning_rate": 4.107891568562675e-05,
"loss": 0.6202,
"step": 34580
},
{
"epoch": 1.421645163941162,
"grad_norm": 1.1884208917617798,
"learning_rate": 4.097144519999898e-05,
"loss": 0.64,
"step": 34600
},
{
"epoch": 1.4224669241515326,
"grad_norm": 1.1158133745193481,
"learning_rate": 4.086407924939803e-05,
"loss": 0.6285,
"step": 34620
},
{
"epoch": 1.4232886843619033,
"grad_norm": 1.0862083435058594,
"learning_rate": 4.0756818023960765e-05,
"loss": 0.6368,
"step": 34640
},
{
"epoch": 1.4241104445722739,
"grad_norm": 1.0684436559677124,
"learning_rate": 4.0649661713638544e-05,
"loss": 0.6247,
"step": 34660
},
{
"epoch": 1.4249322047826445,
"grad_norm": 1.1276649236679077,
"learning_rate": 4.054261050819691e-05,
"loss": 0.6464,
"step": 34680
},
{
"epoch": 1.425753964993015,
"grad_norm": 1.162343144416809,
"learning_rate": 4.043566459721537e-05,
"loss": 0.6193,
"step": 34700
},
{
"epoch": 1.4265757252033857,
"grad_norm": 0.9801756143569946,
"learning_rate": 4.033416368328017e-05,
"loss": 0.6304,
"step": 34720
},
{
"epoch": 1.4273974854137563,
"grad_norm": 1.1881067752838135,
"learning_rate": 4.022742364106714e-05,
"loss": 0.6233,
"step": 34740
},
{
"epoch": 1.428219245624127,
"grad_norm": 1.0229750871658325,
"learning_rate": 4.012078945148589e-05,
"loss": 0.6281,
"step": 34760
},
{
"epoch": 1.4290410058344976,
"grad_norm": 1.2104301452636719,
"learning_rate": 4.001426130337733e-05,
"loss": 0.616,
"step": 34780
},
{
"epoch": 1.4298627660448682,
"grad_norm": 1.0440176725387573,
"learning_rate": 3.990783938539456e-05,
"loss": 0.6205,
"step": 34800
},
{
"epoch": 1.4306845262552388,
"grad_norm": 1.0981454849243164,
"learning_rate": 3.98068371306235e-05,
"loss": 0.6387,
"step": 34820
},
{
"epoch": 1.4315062864656094,
"grad_norm": 1.1962534189224243,
"learning_rate": 3.970062290328703e-05,
"loss": 0.6029,
"step": 34840
},
{
"epoch": 1.43232804667598,
"grad_norm": 1.0792949199676514,
"learning_rate": 3.959451546150584e-05,
"loss": 0.6446,
"step": 34860
},
{
"epoch": 1.4331498068863506,
"grad_norm": 1.2667758464813232,
"learning_rate": 3.948851499318811e-05,
"loss": 0.6233,
"step": 34880
},
{
"epoch": 1.4339715670967212,
"grad_norm": 0.9692990779876709,
"learning_rate": 3.9382621686052454e-05,
"loss": 0.6101,
"step": 34900
},
{
"epoch": 1.4347933273070919,
"grad_norm": 1.0378152132034302,
"learning_rate": 3.927683572762778e-05,
"loss": 0.6223,
"step": 34920
},
{
"epoch": 1.4356150875174625,
"grad_norm": 1.0101780891418457,
"learning_rate": 3.9171157305252884e-05,
"loss": 0.6109,
"step": 34940
},
{
"epoch": 1.436436847727833,
"grad_norm": 1.1455594301223755,
"learning_rate": 3.9065586606076064e-05,
"loss": 0.6373,
"step": 34960
},
{
"epoch": 1.4372586079382037,
"grad_norm": 0.9811462759971619,
"learning_rate": 3.8960123817055e-05,
"loss": 0.6111,
"step": 34980
},
{
"epoch": 1.4380803681485743,
"grad_norm": 1.0883426666259766,
"learning_rate": 3.8854769124956104e-05,
"loss": 0.6136,
"step": 35000
},
{
"epoch": 1.438902128358945,
"grad_norm": 1.2005125284194946,
"learning_rate": 3.874952271635444e-05,
"loss": 0.624,
"step": 35020
},
{
"epoch": 1.4397238885693155,
"grad_norm": 1.1082879304885864,
"learning_rate": 3.864438477763327e-05,
"loss": 0.6224,
"step": 35040
},
{
"epoch": 1.4405456487796862,
"grad_norm": 1.1191489696502686,
"learning_rate": 3.8539355494983865e-05,
"loss": 0.6174,
"step": 35060
},
{
"epoch": 1.4413674089900568,
"grad_norm": 1.131309986114502,
"learning_rate": 3.843443505440494e-05,
"loss": 0.6212,
"step": 35080
},
{
"epoch": 1.4421891692004274,
"grad_norm": 1.216101884841919,
"learning_rate": 3.832962364170251e-05,
"loss": 0.6228,
"step": 35100
},
{
"epoch": 1.443010929410798,
"grad_norm": 1.043578863143921,
"learning_rate": 3.82249214424896e-05,
"loss": 0.6225,
"step": 35120
},
{
"epoch": 1.4438326896211686,
"grad_norm": 1.0260852575302124,
"learning_rate": 3.812032864218563e-05,
"loss": 0.6249,
"step": 35140
},
{
"epoch": 1.4446544498315392,
"grad_norm": 1.1600069999694824,
"learning_rate": 3.8015845426016494e-05,
"loss": 0.6072,
"step": 35160
},
{
"epoch": 1.4454762100419098,
"grad_norm": 1.0437159538269043,
"learning_rate": 3.7911471979013845e-05,
"loss": 0.619,
"step": 35180
},
{
"epoch": 1.4462979702522805,
"grad_norm": 1.074245572090149,
"learning_rate": 3.78072084860151e-05,
"loss": 0.6135,
"step": 35200
},
{
"epoch": 1.447119730462651,
"grad_norm": 1.021990180015564,
"learning_rate": 3.7703055131662854e-05,
"loss": 0.614,
"step": 35220
},
{
"epoch": 1.4479414906730217,
"grad_norm": 0.9545276165008545,
"learning_rate": 3.759901210040466e-05,
"loss": 0.5856,
"step": 35240
},
{
"epoch": 1.4487632508833923,
"grad_norm": 1.1442620754241943,
"learning_rate": 3.749507957649274e-05,
"loss": 0.6096,
"step": 35260
},
{
"epoch": 1.449585011093763,
"grad_norm": 1.0844465494155884,
"learning_rate": 3.7391257743983554e-05,
"loss": 0.6105,
"step": 35280
},
{
"epoch": 1.4504067713041335,
"grad_norm": 1.13474702835083,
"learning_rate": 3.728754678673762e-05,
"loss": 0.6214,
"step": 35300
},
{
"epoch": 1.4512285315145041,
"grad_norm": 1.0782667398452759,
"learning_rate": 3.7183946888419066e-05,
"loss": 0.6417,
"step": 35320
},
{
"epoch": 1.4520502917248748,
"grad_norm": 1.2489984035491943,
"learning_rate": 3.708045823249531e-05,
"loss": 0.6105,
"step": 35340
},
{
"epoch": 1.4528720519352452,
"grad_norm": 1.113853096961975,
"learning_rate": 3.69770810022368e-05,
"loss": 0.6243,
"step": 35360
},
{
"epoch": 1.453693812145616,
"grad_norm": 1.1300607919692993,
"learning_rate": 3.6873815380716624e-05,
"loss": 0.6248,
"step": 35380
},
{
"epoch": 1.4545155723559864,
"grad_norm": 1.0935343503952026,
"learning_rate": 3.6770661550810316e-05,
"loss": 0.6188,
"step": 35400
},
{
"epoch": 1.4553373325663572,
"grad_norm": 1.2234795093536377,
"learning_rate": 3.6667619695195285e-05,
"loss": 0.6157,
"step": 35420
},
{
"epoch": 1.4561590927767276,
"grad_norm": 1.050308346748352,
"learning_rate": 3.656468999635085e-05,
"loss": 0.6089,
"step": 35440
},
{
"epoch": 1.4569808529870985,
"grad_norm": 1.116862416267395,
"learning_rate": 3.646187263655745e-05,
"loss": 0.6126,
"step": 35460
},
{
"epoch": 1.4578026131974688,
"grad_norm": 1.2579872608184814,
"learning_rate": 3.6359167797896795e-05,
"loss": 0.6189,
"step": 35480
},
{
"epoch": 1.4586243734078397,
"grad_norm": 1.2237604856491089,
"learning_rate": 3.625657566225124e-05,
"loss": 0.6248,
"step": 35500
},
{
"epoch": 1.45944613361821,
"grad_norm": 1.2082823514938354,
"learning_rate": 3.615409641130351e-05,
"loss": 0.6277,
"step": 35520
},
{
"epoch": 1.460267893828581,
"grad_norm": 1.17978835105896,
"learning_rate": 3.605173022653653e-05,
"loss": 0.6119,
"step": 35540
},
{
"epoch": 1.4610896540389513,
"grad_norm": 1.1109099388122559,
"learning_rate": 3.5949477289232914e-05,
"loss": 0.6391,
"step": 35560
},
{
"epoch": 1.4619114142493221,
"grad_norm": 1.109529733657837,
"learning_rate": 3.5847337780474744e-05,
"loss": 0.6211,
"step": 35580
},
{
"epoch": 1.4627331744596925,
"grad_norm": 1.159828543663025,
"learning_rate": 3.5745311881143196e-05,
"loss": 0.6182,
"step": 35600
},
{
"epoch": 1.4635549346700634,
"grad_norm": 1.047654390335083,
"learning_rate": 3.564339977191834e-05,
"loss": 0.6166,
"step": 35620
},
{
"epoch": 1.4643766948804338,
"grad_norm": 1.009979248046875,
"learning_rate": 3.554160163327864e-05,
"loss": 0.6,
"step": 35640
},
{
"epoch": 1.4651984550908046,
"grad_norm": 1.0264675617218018,
"learning_rate": 3.543991764550079e-05,
"loss": 0.6166,
"step": 35660
},
{
"epoch": 1.466020215301175,
"grad_norm": 1.2139075994491577,
"learning_rate": 3.533834798865927e-05,
"loss": 0.6369,
"step": 35680
},
{
"epoch": 1.4668419755115458,
"grad_norm": 1.1369953155517578,
"learning_rate": 3.523689284262611e-05,
"loss": 0.6183,
"step": 35700
},
{
"epoch": 1.4676637357219162,
"grad_norm": 1.1860294342041016,
"learning_rate": 3.5135552387070636e-05,
"loss": 0.6152,
"step": 35720
},
{
"epoch": 1.468485495932287,
"grad_norm": 1.0591294765472412,
"learning_rate": 3.503432680145892e-05,
"loss": 0.6152,
"step": 35740
},
{
"epoch": 1.4693072561426574,
"grad_norm": 1.112328290939331,
"learning_rate": 3.493321626505375e-05,
"loss": 0.6106,
"step": 35760
},
{
"epoch": 1.4701290163530283,
"grad_norm": 1.0694029331207275,
"learning_rate": 3.483222095691406e-05,
"loss": 0.609,
"step": 35780
},
{
"epoch": 1.4709507765633987,
"grad_norm": 1.0409561395645142,
"learning_rate": 3.4731341055894785e-05,
"loss": 0.6265,
"step": 35800
},
{
"epoch": 1.4717725367737695,
"grad_norm": 1.130654215812683,
"learning_rate": 3.463057674064646e-05,
"loss": 0.6058,
"step": 35820
},
{
"epoch": 1.47259429698414,
"grad_norm": 1.0501320362091064,
"learning_rate": 3.4534957865013894e-05,
"loss": 0.588,
"step": 35840
},
{
"epoch": 1.4734160571945107,
"grad_norm": 1.1128648519515991,
"learning_rate": 3.443441945508704e-05,
"loss": 0.6212,
"step": 35860
},
{
"epoch": 1.4742378174048811,
"grad_norm": 1.1255168914794922,
"learning_rate": 3.433399715675645e-05,
"loss": 0.6282,
"step": 35880
},
{
"epoch": 1.475059577615252,
"grad_norm": 1.3319727182388306,
"learning_rate": 3.4238703683689755e-05,
"loss": 0.6392,
"step": 35900
},
{
"epoch": 1.4758813378256224,
"grad_norm": 1.1387158632278442,
"learning_rate": 3.414351531414453e-05,
"loss": 0.6058,
"step": 35920
},
{
"epoch": 1.4767030980359932,
"grad_norm": 1.0003697872161865,
"learning_rate": 3.404343074439131e-05,
"loss": 0.6186,
"step": 35940
},
{
"epoch": 1.4775248582463636,
"grad_norm": 1.2400519847869873,
"learning_rate": 3.3943462978645225e-05,
"loss": 0.6088,
"step": 35960
},
{
"epoch": 1.4783466184567344,
"grad_norm": 1.048429250717163,
"learning_rate": 3.384361219394153e-05,
"loss": 0.6101,
"step": 35980
},
{
"epoch": 1.4791683786671048,
"grad_norm": 1.1433545351028442,
"learning_rate": 3.374387856710828e-05,
"loss": 0.6407,
"step": 36000
},
{
"epoch": 1.4791683786671048,
"eval_loss": 0.9071117639541626,
"eval_runtime": 16.6266,
"eval_samples_per_second": 157.579,
"eval_steps_per_second": 4.932,
"step": 36000
},
{
"epoch": 1.4799901388774757,
"grad_norm": 1.257107138633728,
"learning_rate": 3.3644262274766024e-05,
"loss": 0.6129,
"step": 36020
},
{
"epoch": 1.480811899087846,
"grad_norm": 1.07807457447052,
"learning_rate": 3.354476349332756e-05,
"loss": 0.619,
"step": 36040
},
{
"epoch": 1.4816336592982169,
"grad_norm": 1.1075770854949951,
"learning_rate": 3.344538239899754e-05,
"loss": 0.6148,
"step": 36060
},
{
"epoch": 1.4824554195085873,
"grad_norm": 1.2136929035186768,
"learning_rate": 3.3346119167772285e-05,
"loss": 0.6148,
"step": 36080
},
{
"epoch": 1.4832771797189581,
"grad_norm": 1.0873780250549316,
"learning_rate": 3.3246973975439274e-05,
"loss": 0.6213,
"step": 36100
},
{
"epoch": 1.4840989399293285,
"grad_norm": 1.09003746509552,
"learning_rate": 3.314794699757713e-05,
"loss": 0.6244,
"step": 36120
},
{
"epoch": 1.4849207001396993,
"grad_norm": 1.0032758712768555,
"learning_rate": 3.3049038409554855e-05,
"loss": 0.6265,
"step": 36140
},
{
"epoch": 1.4857424603500697,
"grad_norm": 1.10032057762146,
"learning_rate": 3.295024838653205e-05,
"loss": 0.6375,
"step": 36160
},
{
"epoch": 1.4865642205604406,
"grad_norm": 1.0320249795913696,
"learning_rate": 3.2851577103458196e-05,
"loss": 0.6101,
"step": 36180
},
{
"epoch": 1.487385980770811,
"grad_norm": 1.2705104351043701,
"learning_rate": 3.2753024735072534e-05,
"loss": 0.6337,
"step": 36200
},
{
"epoch": 1.4882077409811818,
"grad_norm": 1.1513526439666748,
"learning_rate": 3.2654591455903774e-05,
"loss": 0.6179,
"step": 36220
},
{
"epoch": 1.4890295011915522,
"grad_norm": 1.231684923171997,
"learning_rate": 3.2556277440269636e-05,
"loss": 0.619,
"step": 36240
},
{
"epoch": 1.489851261401923,
"grad_norm": 1.1558480262756348,
"learning_rate": 3.2458082862276685e-05,
"loss": 0.609,
"step": 36260
},
{
"epoch": 1.4906730216122934,
"grad_norm": 1.005265235900879,
"learning_rate": 3.236000789581992e-05,
"loss": 0.5977,
"step": 36280
},
{
"epoch": 1.4914947818226643,
"grad_norm": 1.1663181781768799,
"learning_rate": 3.2262052714582635e-05,
"loss": 0.6104,
"step": 36300
},
{
"epoch": 1.4923165420330347,
"grad_norm": 1.2102375030517578,
"learning_rate": 3.216421749203586e-05,
"loss": 0.6201,
"step": 36320
},
{
"epoch": 1.4931383022434055,
"grad_norm": 1.1391383409500122,
"learning_rate": 3.206650240143827e-05,
"loss": 0.6063,
"step": 36340
},
{
"epoch": 1.4939600624537759,
"grad_norm": 1.1457056999206543,
"learning_rate": 3.1968907615835756e-05,
"loss": 0.6111,
"step": 36360
},
{
"epoch": 1.4947818226641467,
"grad_norm": 1.105281114578247,
"learning_rate": 3.187143330806114e-05,
"loss": 0.6129,
"step": 36380
},
{
"epoch": 1.4956035828745171,
"grad_norm": 1.1415950059890747,
"learning_rate": 3.177407965073398e-05,
"loss": 0.6049,
"step": 36400
},
{
"epoch": 1.496425343084888,
"grad_norm": 1.0731744766235352,
"learning_rate": 3.1676846816260044e-05,
"loss": 0.6257,
"step": 36420
},
{
"epoch": 1.4972471032952583,
"grad_norm": 1.0916352272033691,
"learning_rate": 3.1579734976831265e-05,
"loss": 0.6219,
"step": 36440
},
{
"epoch": 1.4980688635056292,
"grad_norm": 1.1694599390029907,
"learning_rate": 3.14827443044252e-05,
"loss": 0.6068,
"step": 36460
},
{
"epoch": 1.4988906237159996,
"grad_norm": 1.0736804008483887,
"learning_rate": 3.1385874970804874e-05,
"loss": 0.6128,
"step": 36480
},
{
"epoch": 1.4997123839263704,
"grad_norm": 1.2031406164169312,
"learning_rate": 3.12891271475184e-05,
"loss": 0.6196,
"step": 36500
},
{
"epoch": 1.5005341441367408,
"grad_norm": 1.1104577779769897,
"learning_rate": 3.119250100589872e-05,
"loss": 0.6187,
"step": 36520
},
{
"epoch": 1.5013559043471116,
"grad_norm": 1.2821518182754517,
"learning_rate": 3.109599671706335e-05,
"loss": 0.6149,
"step": 36540
},
{
"epoch": 1.502177664557482,
"grad_norm": 1.0993192195892334,
"learning_rate": 3.09996144519139e-05,
"loss": 0.5835,
"step": 36560
},
{
"epoch": 1.5029994247678529,
"grad_norm": 1.0320968627929688,
"learning_rate": 3.090335438113597e-05,
"loss": 0.6023,
"step": 36580
},
{
"epoch": 1.5038211849782233,
"grad_norm": 0.9988365769386292,
"learning_rate": 3.08072166751987e-05,
"loss": 0.6205,
"step": 36600
},
{
"epoch": 1.504642945188594,
"grad_norm": 1.0576531887054443,
"learning_rate": 3.071120150435462e-05,
"loss": 0.6087,
"step": 36620
},
{
"epoch": 1.5054647053989645,
"grad_norm": 1.0885626077651978,
"learning_rate": 3.0615309038639186e-05,
"loss": 0.6034,
"step": 36640
},
{
"epoch": 1.5062864656093353,
"grad_norm": 1.0492252111434937,
"learning_rate": 3.051953944787054e-05,
"loss": 0.5865,
"step": 36660
},
{
"epoch": 1.5071082258197057,
"grad_norm": 1.0457483530044556,
"learning_rate": 3.0423892901649344e-05,
"loss": 0.6097,
"step": 36680
},
{
"epoch": 1.5079299860300766,
"grad_norm": 1.0300101041793823,
"learning_rate": 3.032836956935814e-05,
"loss": 0.5837,
"step": 36700
},
{
"epoch": 1.508751746240447,
"grad_norm": 1.1736280918121338,
"learning_rate": 3.023296962016151e-05,
"loss": 0.5926,
"step": 36720
},
{
"epoch": 1.5095735064508178,
"grad_norm": 1.0206154584884644,
"learning_rate": 3.0137693223005335e-05,
"loss": 0.6162,
"step": 36740
},
{
"epoch": 1.5103952666611882,
"grad_norm": 1.1228256225585938,
"learning_rate": 3.004254054661686e-05,
"loss": 0.5999,
"step": 36760
},
{
"epoch": 1.511217026871559,
"grad_norm": 1.0171713829040527,
"learning_rate": 2.994751175950411e-05,
"loss": 0.6092,
"step": 36780
},
{
"epoch": 1.5120387870819294,
"grad_norm": 1.017462968826294,
"learning_rate": 2.985260702995575e-05,
"loss": 0.6201,
"step": 36800
},
{
"epoch": 1.5128605472923002,
"grad_norm": 1.2459691762924194,
"learning_rate": 2.9757826526040755e-05,
"loss": 0.6189,
"step": 36820
},
{
"epoch": 1.5136823075026706,
"grad_norm": 1.027414321899414,
"learning_rate": 2.9663170415608078e-05,
"loss": 0.6104,
"step": 36840
},
{
"epoch": 1.5145040677130415,
"grad_norm": 1.0714311599731445,
"learning_rate": 2.9568638866286458e-05,
"loss": 0.6201,
"step": 36860
},
{
"epoch": 1.5153258279234119,
"grad_norm": 1.1437265872955322,
"learning_rate": 2.9474232045483952e-05,
"loss": 0.6102,
"step": 36880
},
{
"epoch": 1.5161475881337827,
"grad_norm": 1.2031759023666382,
"learning_rate": 2.93799501203878e-05,
"loss": 0.5923,
"step": 36900
},
{
"epoch": 1.516969348344153,
"grad_norm": 0.992753267288208,
"learning_rate": 2.928579325796401e-05,
"loss": 0.6025,
"step": 36920
},
{
"epoch": 1.517791108554524,
"grad_norm": 1.1709444522857666,
"learning_rate": 2.9191761624957115e-05,
"loss": 0.6125,
"step": 36940
},
{
"epoch": 1.5186128687648943,
"grad_norm": 1.1847578287124634,
"learning_rate": 2.909785538788995e-05,
"loss": 0.6141,
"step": 36960
},
{
"epoch": 1.5194346289752652,
"grad_norm": 1.032343864440918,
"learning_rate": 2.900407471306319e-05,
"loss": 0.6172,
"step": 36980
},
{
"epoch": 1.5202563891856355,
"grad_norm": 1.2214970588684082,
"learning_rate": 2.8910419766555275e-05,
"loss": 0.618,
"step": 37000
},
{
"epoch": 1.5210781493960064,
"grad_norm": 1.1395217180252075,
"learning_rate": 2.881689071422179e-05,
"loss": 0.6024,
"step": 37020
},
{
"epoch": 1.5218999096063768,
"grad_norm": 1.1765543222427368,
"learning_rate": 2.8723487721695562e-05,
"loss": 0.6021,
"step": 37040
},
{
"epoch": 1.5227216698167476,
"grad_norm": 1.1648917198181152,
"learning_rate": 2.8630210954386082e-05,
"loss": 0.6181,
"step": 37060
},
{
"epoch": 1.523543430027118,
"grad_norm": 1.1894433498382568,
"learning_rate": 2.853706057747929e-05,
"loss": 0.6272,
"step": 37080
},
{
"epoch": 1.5243651902374888,
"grad_norm": 1.1555765867233276,
"learning_rate": 2.84440367559374e-05,
"loss": 0.6187,
"step": 37100
},
{
"epoch": 1.5251869504478592,
"grad_norm": 1.1109102964401245,
"learning_rate": 2.8351139654498405e-05,
"loss": 0.611,
"step": 37120
},
{
"epoch": 1.52600871065823,
"grad_norm": 1.1215410232543945,
"learning_rate": 2.8258369437675926e-05,
"loss": 0.5798,
"step": 37140
},
{
"epoch": 1.5268304708686005,
"grad_norm": 1.1160521507263184,
"learning_rate": 2.816572626975884e-05,
"loss": 0.6054,
"step": 37160
},
{
"epoch": 1.5276522310789713,
"grad_norm": 1.0489914417266846,
"learning_rate": 2.8073210314811126e-05,
"loss": 0.6054,
"step": 37180
},
{
"epoch": 1.5284739912893417,
"grad_norm": 1.0901767015457153,
"learning_rate": 2.798082173667139e-05,
"loss": 0.6074,
"step": 37200
},
{
"epoch": 1.5292957514997125,
"grad_norm": 1.1401610374450684,
"learning_rate": 2.7888560698952702e-05,
"loss": 0.6087,
"step": 37220
},
{
"epoch": 1.530117511710083,
"grad_norm": 1.136517882347107,
"learning_rate": 2.7796427365042243e-05,
"loss": 0.6071,
"step": 37240
},
{
"epoch": 1.5309392719204538,
"grad_norm": 1.1060153245925903,
"learning_rate": 2.770442189810103e-05,
"loss": 0.6142,
"step": 37260
},
{
"epoch": 1.5317610321308242,
"grad_norm": 1.029160737991333,
"learning_rate": 2.7612544461063727e-05,
"loss": 0.6187,
"step": 37280
},
{
"epoch": 1.532582792341195,
"grad_norm": 1.1750712394714355,
"learning_rate": 2.752079521663814e-05,
"loss": 0.6107,
"step": 37300
},
{
"epoch": 1.5334045525515654,
"grad_norm": 1.0686962604522705,
"learning_rate": 2.7429174327305186e-05,
"loss": 0.6217,
"step": 37320
},
{
"epoch": 1.5342263127619362,
"grad_norm": 1.0369669198989868,
"learning_rate": 2.7337681955318363e-05,
"loss": 0.6108,
"step": 37340
},
{
"epoch": 1.5350480729723066,
"grad_norm": 1.0347490310668945,
"learning_rate": 2.724631826270362e-05,
"loss": 0.6055,
"step": 37360
},
{
"epoch": 1.5358698331826774,
"grad_norm": 1.0429108142852783,
"learning_rate": 2.715508341125904e-05,
"loss": 0.6004,
"step": 37380
},
{
"epoch": 1.5366915933930478,
"grad_norm": 1.1569420099258423,
"learning_rate": 2.7063977562554476e-05,
"loss": 0.613,
"step": 37400
},
{
"epoch": 1.5375133536034187,
"grad_norm": 1.1140472888946533,
"learning_rate": 2.6973000877931443e-05,
"loss": 0.6267,
"step": 37420
},
{
"epoch": 1.538335113813789,
"grad_norm": 1.0750665664672852,
"learning_rate": 2.6882153518502616e-05,
"loss": 0.6119,
"step": 37440
},
{
"epoch": 1.53915687402416,
"grad_norm": 1.0947927236557007,
"learning_rate": 2.6791435645151675e-05,
"loss": 0.5913,
"step": 37460
},
{
"epoch": 1.5399786342345303,
"grad_norm": 1.1905947923660278,
"learning_rate": 2.670084741853296e-05,
"loss": 0.6153,
"step": 37480
},
{
"epoch": 1.5408003944449011,
"grad_norm": 1.0356147289276123,
"learning_rate": 2.661038899907129e-05,
"loss": 0.6068,
"step": 37500
},
{
"epoch": 1.5416221546552715,
"grad_norm": 1.0834916830062866,
"learning_rate": 2.6520060546961566e-05,
"loss": 0.6017,
"step": 37520
},
{
"epoch": 1.5424439148656424,
"grad_norm": 1.0963350534439087,
"learning_rate": 2.6429862222168467e-05,
"loss": 0.6211,
"step": 37540
},
{
"epoch": 1.5432656750760128,
"grad_norm": 1.0668399333953857,
"learning_rate": 2.6339794184426393e-05,
"loss": 0.6182,
"step": 37560
},
{
"epoch": 1.5440874352863836,
"grad_norm": 1.169129490852356,
"learning_rate": 2.6249856593238763e-05,
"loss": 0.6078,
"step": 37580
},
{
"epoch": 1.544909195496754,
"grad_norm": 1.1952544450759888,
"learning_rate": 2.6160049607878234e-05,
"loss": 0.6056,
"step": 37600
},
{
"epoch": 1.5457309557071248,
"grad_norm": 1.1417872905731201,
"learning_rate": 2.6070373387386005e-05,
"loss": 0.6069,
"step": 37620
},
{
"epoch": 1.5465527159174952,
"grad_norm": 1.152288556098938,
"learning_rate": 2.5980828090571817e-05,
"loss": 0.6084,
"step": 37640
},
{
"epoch": 1.547374476127866,
"grad_norm": 1.1022766828536987,
"learning_rate": 2.589141387601346e-05,
"loss": 0.6145,
"step": 37660
},
{
"epoch": 1.5481962363382364,
"grad_norm": 1.0760823488235474,
"learning_rate": 2.580213090205663e-05,
"loss": 0.5979,
"step": 37680
},
{
"epoch": 1.5490179965486073,
"grad_norm": 1.1265369653701782,
"learning_rate": 2.5712979326814613e-05,
"loss": 0.6069,
"step": 37700
},
{
"epoch": 1.5498397567589777,
"grad_norm": 1.0889846086502075,
"learning_rate": 2.5623959308167945e-05,
"loss": 0.6006,
"step": 37720
},
{
"epoch": 1.5506615169693485,
"grad_norm": 1.110885739326477,
"learning_rate": 2.553507100376428e-05,
"loss": 0.6128,
"step": 37740
},
{
"epoch": 1.551483277179719,
"grad_norm": 0.9926326870918274,
"learning_rate": 2.5446314571017936e-05,
"loss": 0.6145,
"step": 37760
},
{
"epoch": 1.5523050373900897,
"grad_norm": 0.9480810165405273,
"learning_rate": 2.535769016710975e-05,
"loss": 0.6213,
"step": 37780
},
{
"epoch": 1.5531267976004601,
"grad_norm": 1.1244728565216064,
"learning_rate": 2.5269197948986678e-05,
"loss": 0.5935,
"step": 37800
},
{
"epoch": 1.553948557810831,
"grad_norm": 1.1508769989013672,
"learning_rate": 2.5180838073361624e-05,
"loss": 0.6349,
"step": 37820
},
{
"epoch": 1.5547703180212014,
"grad_norm": 1.137568473815918,
"learning_rate": 2.509261069671318e-05,
"loss": 0.6057,
"step": 37840
},
{
"epoch": 1.555592078231572,
"grad_norm": 1.1233346462249756,
"learning_rate": 2.5004515975285183e-05,
"loss": 0.5844,
"step": 37860
},
{
"epoch": 1.5564138384419426,
"grad_norm": 1.188909649848938,
"learning_rate": 2.491655406508667e-05,
"loss": 0.6043,
"step": 37880
},
{
"epoch": 1.5572355986523132,
"grad_norm": 1.0557928085327148,
"learning_rate": 2.4828725121891328e-05,
"loss": 0.5953,
"step": 37900
},
{
"epoch": 1.5580573588626838,
"grad_norm": 1.2361866235733032,
"learning_rate": 2.4745410928211422e-05,
"loss": 0.6316,
"step": 37920
},
{
"epoch": 1.5588791190730544,
"grad_norm": 1.1780842542648315,
"learning_rate": 2.4662217010784527e-05,
"loss": 0.614,
"step": 37940
},
{
"epoch": 1.559700879283425,
"grad_norm": 1.0414899587631226,
"learning_rate": 2.4574774550623027e-05,
"loss": 0.5905,
"step": 37960
},
{
"epoch": 1.5605226394937957,
"grad_norm": 1.1471487283706665,
"learning_rate": 2.448746566272997e-05,
"loss": 0.5866,
"step": 37980
},
{
"epoch": 1.5613443997041663,
"grad_norm": 1.048036813735962,
"learning_rate": 2.4400290501722623e-05,
"loss": 0.6068,
"step": 38000
},
{
"epoch": 1.5613443997041663,
"eval_loss": 0.8905351758003235,
"eval_runtime": 16.5754,
"eval_samples_per_second": 158.066,
"eval_steps_per_second": 4.947,
"step": 38000
},
{
"epoch": 1.562166159914537,
"grad_norm": 1.0737829208374023,
"learning_rate": 2.431324922198156e-05,
"loss": 0.6073,
"step": 38020
},
{
"epoch": 1.5629879201249075,
"grad_norm": 1.0627774000167847,
"learning_rate": 2.4226341977650145e-05,
"loss": 0.6057,
"step": 38040
},
{
"epoch": 1.5638096803352781,
"grad_norm": 1.09597647190094,
"learning_rate": 2.4139568922634427e-05,
"loss": 0.6003,
"step": 38060
},
{
"epoch": 1.5646314405456487,
"grad_norm": 1.0973150730133057,
"learning_rate": 2.40529302106028e-05,
"loss": 0.6181,
"step": 38080
},
{
"epoch": 1.5654532007560193,
"grad_norm": 1.163824200630188,
"learning_rate": 2.396642599498573e-05,
"loss": 0.5985,
"step": 38100
},
{
"epoch": 1.56627496096639,
"grad_norm": 1.2081292867660522,
"learning_rate": 2.3880056428975572e-05,
"loss": 0.5934,
"step": 38120
},
{
"epoch": 1.5670967211767606,
"grad_norm": 1.1347095966339111,
"learning_rate": 2.379382166552614e-05,
"loss": 0.6004,
"step": 38140
},
{
"epoch": 1.5679184813871312,
"grad_norm": 1.0734606981277466,
"learning_rate": 2.3707721857352628e-05,
"loss": 0.6167,
"step": 38160
},
{
"epoch": 1.5687402415975018,
"grad_norm": 1.0315816402435303,
"learning_rate": 2.362175715693106e-05,
"loss": 0.5854,
"step": 38180
},
{
"epoch": 1.5695620018078724,
"grad_norm": 1.1294316053390503,
"learning_rate": 2.3535927716498397e-05,
"loss": 0.5945,
"step": 38200
},
{
"epoch": 1.570383762018243,
"grad_norm": 1.1472307443618774,
"learning_rate": 2.3450233688051936e-05,
"loss": 0.5976,
"step": 38220
},
{
"epoch": 1.5712055222286136,
"grad_norm": 1.253547191619873,
"learning_rate": 2.3364675223349186e-05,
"loss": 0.6084,
"step": 38240
},
{
"epoch": 1.5720272824389843,
"grad_norm": 1.0203603506088257,
"learning_rate": 2.3279252473907674e-05,
"loss": 0.5904,
"step": 38260
},
{
"epoch": 1.5728490426493549,
"grad_norm": 1.0900869369506836,
"learning_rate": 2.3193965591004408e-05,
"loss": 0.6092,
"step": 38280
},
{
"epoch": 1.5736708028597255,
"grad_norm": 1.003138542175293,
"learning_rate": 2.3108814725675975e-05,
"loss": 0.5982,
"step": 38300
},
{
"epoch": 1.574492563070096,
"grad_norm": 1.138856291770935,
"learning_rate": 2.3023800028717956e-05,
"loss": 0.6178,
"step": 38320
},
{
"epoch": 1.5753143232804667,
"grad_norm": 1.1773872375488281,
"learning_rate": 2.2943162329690658e-05,
"loss": 0.6051,
"step": 38340
},
{
"epoch": 1.5761360834908373,
"grad_norm": 1.2317793369293213,
"learning_rate": 2.2858413593867434e-05,
"loss": 0.6073,
"step": 38360
},
{
"epoch": 1.576957843701208,
"grad_norm": 1.0834593772888184,
"learning_rate": 2.2773801469855805e-05,
"loss": 0.6154,
"step": 38380
},
{
"epoch": 1.5777796039115786,
"grad_norm": 1.1221275329589844,
"learning_rate": 2.2689326107497267e-05,
"loss": 0.5776,
"step": 38400
},
{
"epoch": 1.5786013641219492,
"grad_norm": 1.0579743385314941,
"learning_rate": 2.260498765639125e-05,
"loss": 0.5986,
"step": 38420
},
{
"epoch": 1.5794231243323198,
"grad_norm": 1.0688302516937256,
"learning_rate": 2.252078626589462e-05,
"loss": 0.5839,
"step": 38440
},
{
"epoch": 1.5802448845426904,
"grad_norm": 1.1622627973556519,
"learning_rate": 2.2436722085121565e-05,
"loss": 0.6065,
"step": 38460
},
{
"epoch": 1.581066644753061,
"grad_norm": 1.1227208375930786,
"learning_rate": 2.2352795262943272e-05,
"loss": 0.6048,
"step": 38480
},
{
"epoch": 1.5818884049634316,
"grad_norm": 0.9703273177146912,
"learning_rate": 2.2269005947987664e-05,
"loss": 0.6037,
"step": 38500
},
{
"epoch": 1.5827101651738023,
"grad_norm": 1.0502504110336304,
"learning_rate": 2.2185354288639216e-05,
"loss": 0.6096,
"step": 38520
},
{
"epoch": 1.5835319253841729,
"grad_norm": 1.1714296340942383,
"learning_rate": 2.210184043303852e-05,
"loss": 0.612,
"step": 38540
},
{
"epoch": 1.5843536855945435,
"grad_norm": 1.051988124847412,
"learning_rate": 2.2018464529082282e-05,
"loss": 0.6334,
"step": 38560
},
{
"epoch": 1.585175445804914,
"grad_norm": 1.1384596824645996,
"learning_rate": 2.1935226724422686e-05,
"loss": 0.6027,
"step": 38580
},
{
"epoch": 1.5859972060152847,
"grad_norm": 1.1457267999649048,
"learning_rate": 2.1852127166467572e-05,
"loss": 0.5929,
"step": 38600
},
{
"epoch": 1.5868189662256553,
"grad_norm": 1.1787493228912354,
"learning_rate": 2.1769166002379826e-05,
"loss": 0.5897,
"step": 38620
},
{
"epoch": 1.587640726436026,
"grad_norm": 1.1157082319259644,
"learning_rate": 2.1686343379077246e-05,
"loss": 0.5987,
"step": 38640
},
{
"epoch": 1.5884624866463966,
"grad_norm": 1.0821255445480347,
"learning_rate": 2.1603659443232394e-05,
"loss": 0.6027,
"step": 38660
},
{
"epoch": 1.5892842468567672,
"grad_norm": 1.0201191902160645,
"learning_rate": 2.152111434127212e-05,
"loss": 0.6001,
"step": 38680
},
{
"epoch": 1.5901060070671378,
"grad_norm": 1.0591297149658203,
"learning_rate": 2.1438708219377444e-05,
"loss": 0.6132,
"step": 38700
},
{
"epoch": 1.5909277672775084,
"grad_norm": 1.0407557487487793,
"learning_rate": 2.1356441223483246e-05,
"loss": 0.5904,
"step": 38720
},
{
"epoch": 1.591749527487879,
"grad_norm": 1.0210407972335815,
"learning_rate": 2.12743134992781e-05,
"loss": 0.5876,
"step": 38740
},
{
"epoch": 1.5925712876982496,
"grad_norm": 1.0307002067565918,
"learning_rate": 2.1192325192203843e-05,
"loss": 0.6178,
"step": 38760
},
{
"epoch": 1.5933930479086202,
"grad_norm": 0.9952294826507568,
"learning_rate": 2.1110476447455453e-05,
"loss": 0.59,
"step": 38780
},
{
"epoch": 1.5942148081189909,
"grad_norm": 1.145140528678894,
"learning_rate": 2.1028767409980776e-05,
"loss": 0.5751,
"step": 38800
},
{
"epoch": 1.5950365683293615,
"grad_norm": 1.0947434902191162,
"learning_rate": 2.094719822448019e-05,
"loss": 0.5838,
"step": 38820
},
{
"epoch": 1.595858328539732,
"grad_norm": 1.2791152000427246,
"learning_rate": 2.086576903540649e-05,
"loss": 0.6055,
"step": 38840
},
{
"epoch": 1.5966800887501027,
"grad_norm": 1.142967700958252,
"learning_rate": 2.0784479986964467e-05,
"loss": 0.5962,
"step": 38860
},
{
"epoch": 1.5975018489604733,
"grad_norm": 1.1335628032684326,
"learning_rate": 2.070333122311081e-05,
"loss": 0.6056,
"step": 38880
},
{
"epoch": 1.598323609170844,
"grad_norm": 1.2280479669570923,
"learning_rate": 2.0622322887553703e-05,
"loss": 0.6012,
"step": 38900
},
{
"epoch": 1.5991453693812145,
"grad_norm": 1.0319156646728516,
"learning_rate": 2.0541455123752686e-05,
"loss": 0.5985,
"step": 38920
},
{
"epoch": 1.5999671295915852,
"grad_norm": 0.9951415061950684,
"learning_rate": 2.046072807491832e-05,
"loss": 0.594,
"step": 38940
},
{
"epoch": 1.6007888898019558,
"grad_norm": 1.0593833923339844,
"learning_rate": 2.0380141884012004e-05,
"loss": 0.5987,
"step": 38960
},
{
"epoch": 1.6016106500123264,
"grad_norm": 1.0992417335510254,
"learning_rate": 2.0299696693745697e-05,
"loss": 0.5815,
"step": 38980
},
{
"epoch": 1.602432410222697,
"grad_norm": 0.9815024137496948,
"learning_rate": 2.0219392646581638e-05,
"loss": 0.5727,
"step": 39000
},
{
"epoch": 1.6032541704330676,
"grad_norm": 1.1512722969055176,
"learning_rate": 2.013922988473209e-05,
"loss": 0.6208,
"step": 39020
},
{
"epoch": 1.6040759306434382,
"grad_norm": 1.1306536197662354,
"learning_rate": 2.0059208550159125e-05,
"loss": 0.6162,
"step": 39040
},
{
"epoch": 1.6048976908538088,
"grad_norm": 1.1175142526626587,
"learning_rate": 1.9979328784574415e-05,
"loss": 0.5983,
"step": 39060
},
{
"epoch": 1.6057194510641795,
"grad_norm": 1.3080164194107056,
"learning_rate": 1.9899590729438856e-05,
"loss": 0.6112,
"step": 39080
},
{
"epoch": 1.60654121127455,
"grad_norm": 1.130448579788208,
"learning_rate": 1.981999452596236e-05,
"loss": 0.5753,
"step": 39100
},
{
"epoch": 1.6073629714849207,
"grad_norm": 1.0560057163238525,
"learning_rate": 1.9740540315103772e-05,
"loss": 0.593,
"step": 39120
},
{
"epoch": 1.6081847316952913,
"grad_norm": 1.0870985984802246,
"learning_rate": 1.9661228237570272e-05,
"loss": 0.606,
"step": 39140
},
{
"epoch": 1.609006491905662,
"grad_norm": 1.1902962923049927,
"learning_rate": 1.9582058433817528e-05,
"loss": 0.6184,
"step": 39160
},
{
"epoch": 1.6098282521160325,
"grad_norm": 1.0945310592651367,
"learning_rate": 1.9503031044049136e-05,
"loss": 0.5869,
"step": 39180
},
{
"epoch": 1.6106500123264031,
"grad_norm": 1.018189549446106,
"learning_rate": 1.942414620821651e-05,
"loss": 0.605,
"step": 39200
},
{
"epoch": 1.6114717725367738,
"grad_norm": 1.0882275104522705,
"learning_rate": 1.934540406601867e-05,
"loss": 0.604,
"step": 39220
},
{
"epoch": 1.6122935327471444,
"grad_norm": 1.091739296913147,
"learning_rate": 1.9266804756901812e-05,
"loss": 0.588,
"step": 39240
},
{
"epoch": 1.613115292957515,
"grad_norm": 1.0217018127441406,
"learning_rate": 1.918834842005933e-05,
"loss": 0.5712,
"step": 39260
},
{
"epoch": 1.6139370531678856,
"grad_norm": 1.0229742527008057,
"learning_rate": 1.9110035194431298e-05,
"loss": 0.5956,
"step": 39280
},
{
"epoch": 1.6147588133782562,
"grad_norm": 1.082470417022705,
"learning_rate": 1.903186521870448e-05,
"loss": 0.5962,
"step": 39300
},
{
"epoch": 1.6155805735886268,
"grad_norm": 1.0316046476364136,
"learning_rate": 1.895383863131185e-05,
"loss": 0.6197,
"step": 39320
},
{
"epoch": 1.6164023337989974,
"grad_norm": 1.150221347808838,
"learning_rate": 1.887595557043248e-05,
"loss": 0.6,
"step": 39340
},
{
"epoch": 1.617224094009368,
"grad_norm": 1.0211883783340454,
"learning_rate": 1.879821617399129e-05,
"loss": 0.5957,
"step": 39360
},
{
"epoch": 1.6180458542197387,
"grad_norm": 1.1843931674957275,
"learning_rate": 1.8720620579658733e-05,
"loss": 0.5956,
"step": 39380
},
{
"epoch": 1.6188676144301093,
"grad_norm": 1.0743961334228516,
"learning_rate": 1.8643168924850695e-05,
"loss": 0.6038,
"step": 39400
},
{
"epoch": 1.61968937464048,
"grad_norm": 0.9900259971618652,
"learning_rate": 1.8565861346728032e-05,
"loss": 0.5963,
"step": 39420
},
{
"epoch": 1.6205111348508505,
"grad_norm": 1.045684814453125,
"learning_rate": 1.848869798219659e-05,
"loss": 0.6107,
"step": 39440
},
{
"epoch": 1.6213328950612211,
"grad_norm": 1.1444482803344727,
"learning_rate": 1.8411678967906655e-05,
"loss": 0.6047,
"step": 39460
},
{
"epoch": 1.6221546552715917,
"grad_norm": 1.0717675685882568,
"learning_rate": 1.833480444025304e-05,
"loss": 0.6126,
"step": 39480
},
{
"epoch": 1.6229764154819624,
"grad_norm": 1.04216468334198,
"learning_rate": 1.8258074535374604e-05,
"loss": 0.584,
"step": 39500
},
{
"epoch": 1.623798175692333,
"grad_norm": 1.0672125816345215,
"learning_rate": 1.818148938915406e-05,
"loss": 0.5991,
"step": 39520
},
{
"epoch": 1.6246199359027036,
"grad_norm": 1.1428139209747314,
"learning_rate": 1.81050491372179e-05,
"loss": 0.5991,
"step": 39540
},
{
"epoch": 1.6254416961130742,
"grad_norm": 1.1530945301055908,
"learning_rate": 1.80287539149358e-05,
"loss": 0.6207,
"step": 39560
},
{
"epoch": 1.6262634563234448,
"grad_norm": 1.0400162935256958,
"learning_rate": 1.7952603857420837e-05,
"loss": 0.6112,
"step": 39580
},
{
"epoch": 1.6270852165338154,
"grad_norm": 1.172669768333435,
"learning_rate": 1.7876599099528822e-05,
"loss": 0.5991,
"step": 39600
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.9719765782356262,
"learning_rate": 1.78007397758584e-05,
"loss": 0.6059,
"step": 39620
},
{
"epoch": 1.6287287369545567,
"grad_norm": 1.0943204164505005,
"learning_rate": 1.7725026020750547e-05,
"loss": 0.5868,
"step": 39640
},
{
"epoch": 1.6295504971649273,
"grad_norm": 1.281966209411621,
"learning_rate": 1.764945796828852e-05,
"loss": 0.6005,
"step": 39660
},
{
"epoch": 1.630372257375298,
"grad_norm": 1.1267539262771606,
"learning_rate": 1.7574035752297503e-05,
"loss": 0.5725,
"step": 39680
},
{
"epoch": 1.6311940175856685,
"grad_norm": 1.1680996417999268,
"learning_rate": 1.749875950634442e-05,
"loss": 0.5967,
"step": 39700
},
{
"epoch": 1.6320157777960391,
"grad_norm": 1.0391075611114502,
"learning_rate": 1.742362936373776e-05,
"loss": 0.5903,
"step": 39720
},
{
"epoch": 1.6328375380064097,
"grad_norm": 1.166467547416687,
"learning_rate": 1.734864545752716e-05,
"loss": 0.6019,
"step": 39740
},
{
"epoch": 1.6336592982167804,
"grad_norm": 1.027796983718872,
"learning_rate": 1.7273807920503436e-05,
"loss": 0.5935,
"step": 39760
},
{
"epoch": 1.634481058427151,
"grad_norm": 1.0871942043304443,
"learning_rate": 1.7199116885197995e-05,
"loss": 0.6119,
"step": 39780
},
{
"epoch": 1.6353028186375216,
"grad_norm": 1.0691869258880615,
"learning_rate": 1.7124572483882996e-05,
"loss": 0.6061,
"step": 39800
},
{
"epoch": 1.6361245788478922,
"grad_norm": 1.0936367511749268,
"learning_rate": 1.70501748485708e-05,
"loss": 0.5716,
"step": 39820
},
{
"epoch": 1.6369463390582628,
"grad_norm": 1.1092150211334229,
"learning_rate": 1.6975924111013873e-05,
"loss": 0.5975,
"step": 39840
},
{
"epoch": 1.6377680992686334,
"grad_norm": 0.9624285697937012,
"learning_rate": 1.6901820402704606e-05,
"loss": 0.6096,
"step": 39860
},
{
"epoch": 1.638589859479004,
"grad_norm": 1.1558884382247925,
"learning_rate": 1.6827863854874938e-05,
"loss": 0.6157,
"step": 39880
},
{
"epoch": 1.6394116196893747,
"grad_norm": 1.3336347341537476,
"learning_rate": 1.6754054598496215e-05,
"loss": 0.601,
"step": 39900
},
{
"epoch": 1.6402333798997453,
"grad_norm": 0.9931703209877014,
"learning_rate": 1.668039276427894e-05,
"loss": 0.5828,
"step": 39920
},
{
"epoch": 1.6410551401101159,
"grad_norm": 1.170390248298645,
"learning_rate": 1.6606878482672582e-05,
"loss": 0.5879,
"step": 39940
},
{
"epoch": 1.6418769003204865,
"grad_norm": 1.0788556337356567,
"learning_rate": 1.653351188386526e-05,
"loss": 0.603,
"step": 39960
},
{
"epoch": 1.6426986605308571,
"grad_norm": 1.1531950235366821,
"learning_rate": 1.6460293097783574e-05,
"loss": 0.6071,
"step": 39980
},
{
"epoch": 1.6435204207412277,
"grad_norm": 1.1545748710632324,
"learning_rate": 1.638722225409236e-05,
"loss": 0.5967,
"step": 40000
},
{
"epoch": 1.6435204207412277,
"eval_loss": 0.8823444247245789,
"eval_runtime": 16.6502,
"eval_samples_per_second": 157.355,
"eval_steps_per_second": 4.925,
"step": 40000
},
{
"epoch": 1.6443421809515983,
"grad_norm": 1.1601914167404175,
"learning_rate": 1.6314299482194418e-05,
"loss": 0.6032,
"step": 40020
},
{
"epoch": 1.645163941161969,
"grad_norm": 1.0907021760940552,
"learning_rate": 1.624152491123043e-05,
"loss": 0.6075,
"step": 40040
},
{
"epoch": 1.6459857013723396,
"grad_norm": 1.2451189756393433,
"learning_rate": 1.6168898670078537e-05,
"loss": 0.5927,
"step": 40060
},
{
"epoch": 1.6468074615827102,
"grad_norm": 1.2181775569915771,
"learning_rate": 1.609642088735418e-05,
"loss": 0.5866,
"step": 40080
},
{
"epoch": 1.6476292217930808,
"grad_norm": 1.0169905424118042,
"learning_rate": 1.6024091691410013e-05,
"loss": 0.5901,
"step": 40100
},
{
"epoch": 1.6484509820034514,
"grad_norm": 1.0728631019592285,
"learning_rate": 1.595191121033538e-05,
"loss": 0.5929,
"step": 40120
},
{
"epoch": 1.649272742213822,
"grad_norm": 1.216049313545227,
"learning_rate": 1.5879879571956436e-05,
"loss": 0.5836,
"step": 40140
},
{
"epoch": 1.6500945024241926,
"grad_norm": 1.093613862991333,
"learning_rate": 1.5807996903835608e-05,
"loss": 0.5816,
"step": 40160
},
{
"epoch": 1.6509162626345633,
"grad_norm": 1.1138787269592285,
"learning_rate": 1.5736263333271618e-05,
"loss": 0.5954,
"step": 40180
},
{
"epoch": 1.6517380228449339,
"grad_norm": 1.0472607612609863,
"learning_rate": 1.5664678987299085e-05,
"loss": 0.6054,
"step": 40200
},
{
"epoch": 1.6525597830553045,
"grad_norm": 1.0426794290542603,
"learning_rate": 1.5593243992688356e-05,
"loss": 0.5902,
"step": 40220
},
{
"epoch": 1.653381543265675,
"grad_norm": 1.1743807792663574,
"learning_rate": 1.552195847594533e-05,
"loss": 0.5822,
"step": 40240
},
{
"epoch": 1.6542033034760457,
"grad_norm": 1.111167550086975,
"learning_rate": 1.5450822563311128e-05,
"loss": 0.628,
"step": 40260
},
{
"epoch": 1.6550250636864163,
"grad_norm": 1.124616265296936,
"learning_rate": 1.5379836380762035e-05,
"loss": 0.5867,
"step": 40280
},
{
"epoch": 1.655846823896787,
"grad_norm": 1.0396867990493774,
"learning_rate": 1.530900005400906e-05,
"loss": 0.6029,
"step": 40300
},
{
"epoch": 1.6566685841071576,
"grad_norm": 1.176413893699646,
"learning_rate": 1.5238313708497964e-05,
"loss": 0.6044,
"step": 40320
},
{
"epoch": 1.6574903443175282,
"grad_norm": 1.1538265943527222,
"learning_rate": 1.5167777469408728e-05,
"loss": 0.591,
"step": 40340
},
{
"epoch": 1.6583121045278988,
"grad_norm": 1.1776963472366333,
"learning_rate": 1.5097391461655663e-05,
"loss": 0.5988,
"step": 40360
},
{
"epoch": 1.6591338647382694,
"grad_norm": 1.0726710557937622,
"learning_rate": 1.5027155809886962e-05,
"loss": 0.6071,
"step": 40380
},
{
"epoch": 1.65995562494864,
"grad_norm": 1.0493417978286743,
"learning_rate": 1.4957070638484515e-05,
"loss": 0.5879,
"step": 40400
},
{
"epoch": 1.6607773851590106,
"grad_norm": 1.1617422103881836,
"learning_rate": 1.4887136071563856e-05,
"loss": 0.5913,
"step": 40420
},
{
"epoch": 1.6615991453693812,
"grad_norm": 1.0822564363479614,
"learning_rate": 1.4817352232973626e-05,
"loss": 0.601,
"step": 40440
},
{
"epoch": 1.6624209055797519,
"grad_norm": 1.1444182395935059,
"learning_rate": 1.4747719246295676e-05,
"loss": 0.602,
"step": 40460
},
{
"epoch": 1.6632426657901225,
"grad_norm": 1.0876108407974243,
"learning_rate": 1.4678237234844649e-05,
"loss": 0.6045,
"step": 40480
},
{
"epoch": 1.664064426000493,
"grad_norm": 1.1832387447357178,
"learning_rate": 1.460890632166787e-05,
"loss": 0.5967,
"step": 40500
},
{
"epoch": 1.6648861862108637,
"grad_norm": 0.9234125018119812,
"learning_rate": 1.4543182020758783e-05,
"loss": 0.5895,
"step": 40520
},
{
"epoch": 1.6657079464212343,
"grad_norm": 1.1049689054489136,
"learning_rate": 1.4474146102117969e-05,
"loss": 0.6013,
"step": 40540
},
{
"epoch": 1.666529706631605,
"grad_norm": 1.0653325319290161,
"learning_rate": 1.4405261643181056e-05,
"loss": 0.6025,
"step": 40560
},
{
"epoch": 1.6673514668419755,
"grad_norm": 1.2282037734985352,
"learning_rate": 1.4336528765937151e-05,
"loss": 0.614,
"step": 40580
},
{
"epoch": 1.6681732270523462,
"grad_norm": 1.1362701654434204,
"learning_rate": 1.4267947592106845e-05,
"loss": 0.6083,
"step": 40600
},
{
"epoch": 1.6689949872627168,
"grad_norm": 1.0828361511230469,
"learning_rate": 1.4199518243142196e-05,
"loss": 0.5823,
"step": 40620
},
{
"epoch": 1.6698167474730874,
"grad_norm": 1.042107105255127,
"learning_rate": 1.4131240840226201e-05,
"loss": 0.5848,
"step": 40640
},
{
"epoch": 1.670638507683458,
"grad_norm": 1.00413978099823,
"learning_rate": 1.4063115504272973e-05,
"loss": 0.6082,
"step": 40660
},
{
"epoch": 1.6714602678938286,
"grad_norm": 1.0120972394943237,
"learning_rate": 1.3995142355927216e-05,
"loss": 0.582,
"step": 40680
},
{
"epoch": 1.6722820281041992,
"grad_norm": 1.2198662757873535,
"learning_rate": 1.3927321515564107e-05,
"loss": 0.5908,
"step": 40700
},
{
"epoch": 1.6731037883145699,
"grad_norm": 1.1000231504440308,
"learning_rate": 1.3859653103289205e-05,
"loss": 0.585,
"step": 40720
},
{
"epoch": 1.6739255485249405,
"grad_norm": 1.0395424365997314,
"learning_rate": 1.3792137238937975e-05,
"loss": 0.5767,
"step": 40740
},
{
"epoch": 1.674747308735311,
"grad_norm": 1.1138056516647339,
"learning_rate": 1.3724774042075882e-05,
"loss": 0.5843,
"step": 40760
},
{
"epoch": 1.6755690689456817,
"grad_norm": 1.0291800498962402,
"learning_rate": 1.3657563631997928e-05,
"loss": 0.583,
"step": 40780
},
{
"epoch": 1.6763908291560523,
"grad_norm": 1.1663140058517456,
"learning_rate": 1.3590506127728575e-05,
"loss": 0.5913,
"step": 40800
},
{
"epoch": 1.677212589366423,
"grad_norm": 1.0701870918273926,
"learning_rate": 1.3523601648021533e-05,
"loss": 0.5924,
"step": 40820
},
{
"epoch": 1.6780343495767935,
"grad_norm": 1.1024507284164429,
"learning_rate": 1.345685031135947e-05,
"loss": 0.5786,
"step": 40840
},
{
"epoch": 1.6788561097871642,
"grad_norm": 0.9765409231185913,
"learning_rate": 1.3393578497947468e-05,
"loss": 0.5825,
"step": 40860
},
{
"epoch": 1.6796778699975348,
"grad_norm": 1.2072242498397827,
"learning_rate": 1.332712612998166e-05,
"loss": 0.6016,
"step": 40880
},
{
"epoch": 1.6804996302079054,
"grad_norm": 1.1122468709945679,
"learning_rate": 1.326082725300385e-05,
"loss": 0.6234,
"step": 40900
},
{
"epoch": 1.681321390418276,
"grad_norm": 1.0995004177093506,
"learning_rate": 1.319468198442424e-05,
"loss": 0.5913,
"step": 40920
},
{
"epoch": 1.6821431506286466,
"grad_norm": 1.0409519672393799,
"learning_rate": 1.3128690441380998e-05,
"loss": 0.5948,
"step": 40940
},
{
"epoch": 1.6829649108390172,
"grad_norm": 1.1031830310821533,
"learning_rate": 1.3062852740740072e-05,
"loss": 0.5988,
"step": 40960
},
{
"epoch": 1.6837866710493878,
"grad_norm": 1.1113379001617432,
"learning_rate": 1.2997168999094978e-05,
"loss": 0.5822,
"step": 40980
},
{
"epoch": 1.6846084312597585,
"grad_norm": 1.0048646926879883,
"learning_rate": 1.2931639332766543e-05,
"loss": 0.5817,
"step": 41000
},
{
"epoch": 1.685430191470129,
"grad_norm": 1.159192681312561,
"learning_rate": 1.2866263857802818e-05,
"loss": 0.5832,
"step": 41020
},
{
"epoch": 1.6862519516804997,
"grad_norm": 1.1091160774230957,
"learning_rate": 1.280104268997865e-05,
"loss": 0.5858,
"step": 41040
},
{
"epoch": 1.6870737118908703,
"grad_norm": 1.1325902938842773,
"learning_rate": 1.2735975944795775e-05,
"loss": 0.5843,
"step": 41060
},
{
"epoch": 1.687895472101241,
"grad_norm": 1.0565228462219238,
"learning_rate": 1.267106373748237e-05,
"loss": 0.5852,
"step": 41080
},
{
"epoch": 1.6887172323116115,
"grad_norm": 1.1818876266479492,
"learning_rate": 1.2606306182992933e-05,
"loss": 0.588,
"step": 41100
},
{
"epoch": 1.6895389925219821,
"grad_norm": 0.8950326442718506,
"learning_rate": 1.2541703396008142e-05,
"loss": 0.5963,
"step": 41120
},
{
"epoch": 1.6903607527323525,
"grad_norm": 1.1267063617706299,
"learning_rate": 1.2477255490934559e-05,
"loss": 0.5758,
"step": 41140
},
{
"epoch": 1.6911825129427234,
"grad_norm": 0.9466457962989807,
"learning_rate": 1.241296258190444e-05,
"loss": 0.5963,
"step": 41160
},
{
"epoch": 1.6920042731530938,
"grad_norm": 1.0484158992767334,
"learning_rate": 1.2348824782775581e-05,
"loss": 0.586,
"step": 41180
},
{
"epoch": 1.6928260333634646,
"grad_norm": 1.1975510120391846,
"learning_rate": 1.2284842207131109e-05,
"loss": 0.5775,
"step": 41200
},
{
"epoch": 1.693647793573835,
"grad_norm": 1.1231242418289185,
"learning_rate": 1.2221014968279233e-05,
"loss": 0.5915,
"step": 41220
},
{
"epoch": 1.6944695537842058,
"grad_norm": 1.1098995208740234,
"learning_rate": 1.2157343179253079e-05,
"loss": 0.5886,
"step": 41240
},
{
"epoch": 1.6952913139945762,
"grad_norm": 1.053568720817566,
"learning_rate": 1.2093826952810471e-05,
"loss": 0.5961,
"step": 41260
},
{
"epoch": 1.696113074204947,
"grad_norm": 1.1454240083694458,
"learning_rate": 1.2030466401433748e-05,
"loss": 0.5888,
"step": 41280
},
{
"epoch": 1.6969348344153175,
"grad_norm": 1.0990582704544067,
"learning_rate": 1.1967261637329607e-05,
"loss": 0.5945,
"step": 41300
},
{
"epoch": 1.6977565946256883,
"grad_norm": 1.2519744634628296,
"learning_rate": 1.190421277242878e-05,
"loss": 0.5782,
"step": 41320
},
{
"epoch": 1.6985783548360587,
"grad_norm": 0.9864106178283691,
"learning_rate": 1.1841319918385996e-05,
"loss": 0.5856,
"step": 41340
},
{
"epoch": 1.6994001150464295,
"grad_norm": 1.0756564140319824,
"learning_rate": 1.1778583186579628e-05,
"loss": 0.5893,
"step": 41360
},
{
"epoch": 1.7002218752568,
"grad_norm": 1.163355827331543,
"learning_rate": 1.1716002688111616e-05,
"loss": 0.6051,
"step": 41380
},
{
"epoch": 1.7010436354671707,
"grad_norm": 1.259600281715393,
"learning_rate": 1.1653578533807186e-05,
"loss": 0.6031,
"step": 41400
},
{
"epoch": 1.7018653956775411,
"grad_norm": 1.1272526979446411,
"learning_rate": 1.1591310834214709e-05,
"loss": 0.584,
"step": 41420
},
{
"epoch": 1.702687155887912,
"grad_norm": 1.1468937397003174,
"learning_rate": 1.152919969960552e-05,
"loss": 0.6068,
"step": 41440
},
{
"epoch": 1.7035089160982824,
"grad_norm": 1.1517606973648071,
"learning_rate": 1.1467245239973633e-05,
"loss": 0.5757,
"step": 41460
},
{
"epoch": 1.7043306763086532,
"grad_norm": 1.0870920419692993,
"learning_rate": 1.1405447565035631e-05,
"loss": 0.6043,
"step": 41480
},
{
"epoch": 1.7051524365190236,
"grad_norm": 1.0770379304885864,
"learning_rate": 1.1343806784230426e-05,
"loss": 0.5905,
"step": 41500
},
{
"epoch": 1.7059741967293944,
"grad_norm": 1.0060986280441284,
"learning_rate": 1.128232300671912e-05,
"loss": 0.6047,
"step": 41520
},
{
"epoch": 1.7067959569397648,
"grad_norm": 1.0191991329193115,
"learning_rate": 1.1220996341384748e-05,
"loss": 0.5647,
"step": 41540
},
{
"epoch": 1.7076177171501357,
"grad_norm": 1.191707730293274,
"learning_rate": 1.1159826896832082e-05,
"loss": 0.5875,
"step": 41560
},
{
"epoch": 1.708439477360506,
"grad_norm": 1.2851048707962036,
"learning_rate": 1.1098814781387568e-05,
"loss": 0.5908,
"step": 41580
},
{
"epoch": 1.709261237570877,
"grad_norm": 1.121020793914795,
"learning_rate": 1.1037960103098877e-05,
"loss": 0.6084,
"step": 41600
},
{
"epoch": 1.7100829977812473,
"grad_norm": 1.1154911518096924,
"learning_rate": 1.0977262969735014e-05,
"loss": 0.5814,
"step": 41620
},
{
"epoch": 1.7109047579916181,
"grad_norm": 1.1276777982711792,
"learning_rate": 1.091672348878594e-05,
"loss": 0.5853,
"step": 41640
},
{
"epoch": 1.7117265182019885,
"grad_norm": 1.080946922302246,
"learning_rate": 1.0856341767462364e-05,
"loss": 0.605,
"step": 41660
},
{
"epoch": 1.7125482784123593,
"grad_norm": 1.1074481010437012,
"learning_rate": 1.0796117912695736e-05,
"loss": 0.5711,
"step": 41680
},
{
"epoch": 1.7133700386227297,
"grad_norm": 1.1239150762557983,
"learning_rate": 1.07360520311378e-05,
"loss": 0.5841,
"step": 41700
},
{
"epoch": 1.7141917988331006,
"grad_norm": 1.0257426500320435,
"learning_rate": 1.0676144229160655e-05,
"loss": 0.5723,
"step": 41720
},
{
"epoch": 1.715013559043471,
"grad_norm": 1.100321888923645,
"learning_rate": 1.0616394612856361e-05,
"loss": 0.5999,
"step": 41740
},
{
"epoch": 1.7158353192538418,
"grad_norm": 1.0664868354797363,
"learning_rate": 1.0556803288036954e-05,
"loss": 0.6029,
"step": 41760
},
{
"epoch": 1.7166570794642122,
"grad_norm": 1.017471194267273,
"learning_rate": 1.0497370360234037e-05,
"loss": 0.5844,
"step": 41780
},
{
"epoch": 1.717478839674583,
"grad_norm": 1.0937682390213013,
"learning_rate": 1.0438095934698766e-05,
"loss": 0.5844,
"step": 41800
},
{
"epoch": 1.7183005998849534,
"grad_norm": 1.0297489166259766,
"learning_rate": 1.037898011640157e-05,
"loss": 0.5882,
"step": 41820
},
{
"epoch": 1.7191223600953243,
"grad_norm": 1.0966808795928955,
"learning_rate": 1.032002301003202e-05,
"loss": 0.5827,
"step": 41840
},
{
"epoch": 1.7199441203056947,
"grad_norm": 1.1811338663101196,
"learning_rate": 1.026122471999863e-05,
"loss": 0.6009,
"step": 41860
},
{
"epoch": 1.7207658805160655,
"grad_norm": 1.0118420124053955,
"learning_rate": 1.0202585350428606e-05,
"loss": 0.5921,
"step": 41880
},
{
"epoch": 1.7215876407264359,
"grad_norm": 1.0821563005447388,
"learning_rate": 1.0144105005167836e-05,
"loss": 0.5956,
"step": 41900
},
{
"epoch": 1.7224094009368067,
"grad_norm": 1.0998246669769287,
"learning_rate": 1.0085783787780412e-05,
"loss": 0.5851,
"step": 41920
},
{
"epoch": 1.7232311611471771,
"grad_norm": 1.2186909914016724,
"learning_rate": 1.0027621801548792e-05,
"loss": 0.6043,
"step": 41940
},
{
"epoch": 1.724052921357548,
"grad_norm": 1.1629652976989746,
"learning_rate": 9.972515496304035e-06,
"loss": 0.5626,
"step": 41960
},
{
"epoch": 1.7248746815679183,
"grad_norm": 1.0134592056274414,
"learning_rate": 9.914664306824105e-06,
"loss": 0.5605,
"step": 41980
},
{
"epoch": 1.7256964417782892,
"grad_norm": 1.134350061416626,
"learning_rate": 9.856972651539343e-06,
"loss": 0.5788,
"step": 42000
},
{
"epoch": 1.7256964417782892,
"eval_loss": 0.8728025555610657,
"eval_runtime": 16.6626,
"eval_samples_per_second": 157.238,
"eval_steps_per_second": 4.921,
"step": 42000
},
{
"epoch": 1.7265182019886596,
"grad_norm": 1.1034976243972778,
"learning_rate": 9.799440632617284e-06,
"loss": 0.583,
"step": 42020
},
{
"epoch": 1.7273399621990304,
"grad_norm": 1.0975291728973389,
"learning_rate": 9.742068351942668e-06,
"loss": 0.5831,
"step": 42040
},
{
"epoch": 1.7281617224094008,
"grad_norm": 1.1249350309371948,
"learning_rate": 9.68485591111744e-06,
"loss": 0.5758,
"step": 42060
},
{
"epoch": 1.7289834826197716,
"grad_norm": 1.0432801246643066,
"learning_rate": 9.6278034114604e-06,
"loss": 0.5898,
"step": 42080
},
{
"epoch": 1.729805242830142,
"grad_norm": 1.0415185689926147,
"learning_rate": 9.570910954007128e-06,
"loss": 0.5781,
"step": 42100
},
{
"epoch": 1.7306270030405129,
"grad_norm": 1.0248029232025146,
"learning_rate": 9.517011450287305e-06,
"loss": 0.588,
"step": 42120
},
{
"epoch": 1.7314487632508833,
"grad_norm": 1.0104538202285767,
"learning_rate": 9.460431364661492e-06,
"loss": 0.5787,
"step": 42140
},
{
"epoch": 1.732270523461254,
"grad_norm": 1.0787935256958008,
"learning_rate": 9.404011617642439e-06,
"loss": 0.6012,
"step": 42160
},
{
"epoch": 1.7330922836716245,
"grad_norm": 1.10727059841156,
"learning_rate": 9.347752309145241e-06,
"loss": 0.5939,
"step": 42180
},
{
"epoch": 1.7339140438819953,
"grad_norm": 1.0951191186904907,
"learning_rate": 9.291653538800727e-06,
"loss": 0.5707,
"step": 42200
},
{
"epoch": 1.7347358040923657,
"grad_norm": 0.9968528151512146,
"learning_rate": 9.235715405955558e-06,
"loss": 0.6074,
"step": 42220
},
{
"epoch": 1.7355575643027366,
"grad_norm": 1.2177423238754272,
"learning_rate": 9.17993800967183e-06,
"loss": 0.5915,
"step": 42240
},
{
"epoch": 1.736379324513107,
"grad_norm": 1.041279673576355,
"learning_rate": 9.124321448727014e-06,
"loss": 0.5841,
"step": 42260
},
{
"epoch": 1.7372010847234778,
"grad_norm": 1.0093954801559448,
"learning_rate": 9.068865821613803e-06,
"loss": 0.5966,
"step": 42280
},
{
"epoch": 1.7380228449338482,
"grad_norm": 0.9137701392173767,
"learning_rate": 9.013571226539773e-06,
"loss": 0.5792,
"step": 42300
},
{
"epoch": 1.738844605144219,
"grad_norm": 1.1452417373657227,
"learning_rate": 8.958437761427452e-06,
"loss": 0.5968,
"step": 42320
},
{
"epoch": 1.7396663653545894,
"grad_norm": 1.0660525560379028,
"learning_rate": 8.903465523913957e-06,
"loss": 0.5883,
"step": 42340
},
{
"epoch": 1.7404881255649602,
"grad_norm": 1.2415095567703247,
"learning_rate": 8.848654611350849e-06,
"loss": 0.5766,
"step": 42360
},
{
"epoch": 1.7413098857753306,
"grad_norm": 1.097631812095642,
"learning_rate": 8.794005120804082e-06,
"loss": 0.5838,
"step": 42380
},
{
"epoch": 1.7421316459857015,
"grad_norm": 0.9962956309318542,
"learning_rate": 8.739517149053689e-06,
"loss": 0.5873,
"step": 42400
},
{
"epoch": 1.7429534061960719,
"grad_norm": 1.1835882663726807,
"learning_rate": 8.685190792593656e-06,
"loss": 0.5863,
"step": 42420
},
{
"epoch": 1.7437751664064427,
"grad_norm": 1.101272702217102,
"learning_rate": 8.631026147631772e-06,
"loss": 0.5901,
"step": 42440
},
{
"epoch": 1.744596926616813,
"grad_norm": 1.1990203857421875,
"learning_rate": 8.577023310089483e-06,
"loss": 0.6065,
"step": 42460
},
{
"epoch": 1.745418686827184,
"grad_norm": 1.1126364469528198,
"learning_rate": 8.523182375601635e-06,
"loss": 0.5755,
"step": 42480
},
{
"epoch": 1.7462404470375543,
"grad_norm": 1.124306082725525,
"learning_rate": 8.469503439516402e-06,
"loss": 0.6036,
"step": 42500
},
{
"epoch": 1.7470622072479252,
"grad_norm": 1.1404842138290405,
"learning_rate": 8.41598659689502e-06,
"loss": 0.5996,
"step": 42520
},
{
"epoch": 1.7478839674582956,
"grad_norm": 1.0869922637939453,
"learning_rate": 8.36263194251169e-06,
"loss": 0.5756,
"step": 42540
},
{
"epoch": 1.7487057276686664,
"grad_norm": 1.0685384273529053,
"learning_rate": 8.309439570853439e-06,
"loss": 0.5936,
"step": 42560
},
{
"epoch": 1.7495274878790368,
"grad_norm": 1.1454116106033325,
"learning_rate": 8.256409576119827e-06,
"loss": 0.5871,
"step": 42580
},
{
"epoch": 1.7503492480894076,
"grad_norm": 1.119585633277893,
"learning_rate": 8.203542052222924e-06,
"loss": 0.5948,
"step": 42600
},
{
"epoch": 1.751171008299778,
"grad_norm": 1.0509191751480103,
"learning_rate": 8.150837092787034e-06,
"loss": 0.5856,
"step": 42620
},
{
"epoch": 1.7519927685101488,
"grad_norm": 0.9895453453063965,
"learning_rate": 8.098294791148565e-06,
"loss": 0.5877,
"step": 42640
},
{
"epoch": 1.7528145287205192,
"grad_norm": 1.1008808612823486,
"learning_rate": 8.045915240355917e-06,
"loss": 0.59,
"step": 42660
},
{
"epoch": 1.75363628893089,
"grad_norm": 0.9279462695121765,
"learning_rate": 7.993698533169192e-06,
"loss": 0.5911,
"step": 42680
},
{
"epoch": 1.7544580491412605,
"grad_norm": 1.1394389867782593,
"learning_rate": 7.941644762060229e-06,
"loss": 0.5756,
"step": 42700
},
{
"epoch": 1.7552798093516313,
"grad_norm": 1.0450705289840698,
"learning_rate": 7.889754019212203e-06,
"loss": 0.6016,
"step": 42720
},
{
"epoch": 1.7561015695620017,
"grad_norm": 1.2323449850082397,
"learning_rate": 7.838026396519638e-06,
"loss": 0.5808,
"step": 42740
},
{
"epoch": 1.7569233297723725,
"grad_norm": 1.1465022563934326,
"learning_rate": 7.786461985588156e-06,
"loss": 0.5987,
"step": 42760
},
{
"epoch": 1.757745089982743,
"grad_norm": 0.9561547636985779,
"learning_rate": 7.73506087773439e-06,
"loss": 0.563,
"step": 42780
},
{
"epoch": 1.7585668501931138,
"grad_norm": 1.154842734336853,
"learning_rate": 7.683823163985737e-06,
"loss": 0.5682,
"step": 42800
},
{
"epoch": 1.7593886104034842,
"grad_norm": 1.1190966367721558,
"learning_rate": 7.632748935080213e-06,
"loss": 0.5896,
"step": 42820
},
{
"epoch": 1.760210370613855,
"grad_norm": 1.064261794090271,
"learning_rate": 7.581838281466414e-06,
"loss": 0.5778,
"step": 42840
},
{
"epoch": 1.7610321308242254,
"grad_norm": 1.095628261566162,
"learning_rate": 7.531091293303094e-06,
"loss": 0.5657,
"step": 42860
},
{
"epoch": 1.7618538910345962,
"grad_norm": 1.1514066457748413,
"learning_rate": 7.480508060459346e-06,
"loss": 0.579,
"step": 42880
},
{
"epoch": 1.7626756512449666,
"grad_norm": 1.1395295858383179,
"learning_rate": 7.430088672514124e-06,
"loss": 0.6007,
"step": 42900
},
{
"epoch": 1.7634974114553374,
"grad_norm": 1.2230483293533325,
"learning_rate": 7.379833218756338e-06,
"loss": 0.5996,
"step": 42920
},
{
"epoch": 1.7643191716657078,
"grad_norm": 1.0743821859359741,
"learning_rate": 7.329741788184485e-06,
"loss": 0.5863,
"step": 42940
},
{
"epoch": 1.7651409318760787,
"grad_norm": 1.1711527109146118,
"learning_rate": 7.279814469506652e-06,
"loss": 0.5864,
"step": 42960
},
{
"epoch": 1.765962692086449,
"grad_norm": 1.1039119958877563,
"learning_rate": 7.230051351140266e-06,
"loss": 0.5763,
"step": 42980
},
{
"epoch": 1.76678445229682,
"grad_norm": 1.0589841604232788,
"learning_rate": 7.180452521211978e-06,
"loss": 0.5857,
"step": 43000
},
{
"epoch": 1.7676062125071903,
"grad_norm": 1.1599675416946411,
"learning_rate": 7.131018067557516e-06,
"loss": 0.5798,
"step": 43020
},
{
"epoch": 1.7684279727175611,
"grad_norm": 1.10663640499115,
"learning_rate": 7.081748077721462e-06,
"loss": 0.5749,
"step": 43040
},
{
"epoch": 1.7692497329279315,
"grad_norm": 1.0838004350662231,
"learning_rate": 7.032642638957232e-06,
"loss": 0.5767,
"step": 43060
},
{
"epoch": 1.7700714931383024,
"grad_norm": 1.0529030561447144,
"learning_rate": 6.983701838226708e-06,
"loss": 0.6105,
"step": 43080
},
{
"epoch": 1.7708932533486728,
"grad_norm": 1.0537713766098022,
"learning_rate": 6.934925762200328e-06,
"loss": 0.5857,
"step": 43100
},
{
"epoch": 1.7717150135590436,
"grad_norm": 1.105526328086853,
"learning_rate": 6.886314497256752e-06,
"loss": 0.5677,
"step": 43120
},
{
"epoch": 1.772536773769414,
"grad_norm": 1.0443006753921509,
"learning_rate": 6.837868129482772e-06,
"loss": 0.5791,
"step": 43140
},
{
"epoch": 1.7733585339797848,
"grad_norm": 1.0910414457321167,
"learning_rate": 6.789586744673226e-06,
"loss": 0.5779,
"step": 43160
},
{
"epoch": 1.7741802941901552,
"grad_norm": 1.2543234825134277,
"learning_rate": 6.741470428330676e-06,
"loss": 0.5898,
"step": 43180
},
{
"epoch": 1.775002054400526,
"grad_norm": 0.9940236806869507,
"learning_rate": 6.693519265665449e-06,
"loss": 0.5807,
"step": 43200
},
{
"epoch": 1.7758238146108964,
"grad_norm": 1.129135012626648,
"learning_rate": 6.645733341595339e-06,
"loss": 0.5931,
"step": 43220
},
{
"epoch": 1.7766455748212673,
"grad_norm": 1.0876851081848145,
"learning_rate": 6.598112740745544e-06,
"loss": 0.587,
"step": 43240
},
{
"epoch": 1.7774673350316377,
"grad_norm": 1.1217765808105469,
"learning_rate": 6.550657547448513e-06,
"loss": 0.5664,
"step": 43260
},
{
"epoch": 1.7782890952420085,
"grad_norm": 1.006568431854248,
"learning_rate": 6.503367845743702e-06,
"loss": 0.572,
"step": 43280
},
{
"epoch": 1.779110855452379,
"grad_norm": 1.0781569480895996,
"learning_rate": 6.456243719377553e-06,
"loss": 0.5824,
"step": 43300
},
{
"epoch": 1.7799326156627497,
"grad_norm": 1.0513370037078857,
"learning_rate": 6.4116292395006935e-06,
"loss": 0.5924,
"step": 43320
},
{
"epoch": 1.7807543758731201,
"grad_norm": 1.1264588832855225,
"learning_rate": 6.364828224809993e-06,
"loss": 0.609,
"step": 43340
},
{
"epoch": 1.781576136083491,
"grad_norm": 1.1192883253097534,
"learning_rate": 6.318193030800956e-06,
"loss": 0.5814,
"step": 43360
},
{
"epoch": 1.7823978962938614,
"grad_norm": 1.1564245223999023,
"learning_rate": 6.271723740060908e-06,
"loss": 0.5825,
"step": 43380
},
{
"epoch": 1.7832196565042322,
"grad_norm": 1.1175339221954346,
"learning_rate": 6.227731656718094e-06,
"loss": 0.6158,
"step": 43400
},
{
"epoch": 1.7840414167146026,
"grad_norm": 1.1350170373916626,
"learning_rate": 6.1815861137816456e-06,
"loss": 0.584,
"step": 43420
},
{
"epoch": 1.7848631769249734,
"grad_norm": 1.0391989946365356,
"learning_rate": 6.1356067160345695e-06,
"loss": 0.5725,
"step": 43440
},
{
"epoch": 1.7856849371353438,
"grad_norm": 1.0166730880737305,
"learning_rate": 6.089793544902756e-06,
"loss": 0.5822,
"step": 43460
},
{
"epoch": 1.7865066973457147,
"grad_norm": 1.161569595336914,
"learning_rate": 6.0441466815178705e-06,
"loss": 0.6067,
"step": 43480
},
{
"epoch": 1.787328457556085,
"grad_norm": 1.1493220329284668,
"learning_rate": 5.998666206716985e-06,
"loss": 0.5804,
"step": 43500
},
{
"epoch": 1.7881502177664559,
"grad_norm": 1.005832552909851,
"learning_rate": 5.953352201042484e-06,
"loss": 0.5852,
"step": 43520
},
{
"epoch": 1.7889719779768263,
"grad_norm": 1.1422655582427979,
"learning_rate": 5.9082047447420405e-06,
"loss": 0.5935,
"step": 43540
},
{
"epoch": 1.7897937381871971,
"grad_norm": 1.0794512033462524,
"learning_rate": 5.863223917768268e-06,
"loss": 0.5841,
"step": 43560
},
{
"epoch": 1.7906154983975675,
"grad_norm": 0.9649259448051453,
"learning_rate": 5.818409799778779e-06,
"loss": 0.5813,
"step": 43580
},
{
"epoch": 1.7914372586079383,
"grad_norm": 1.0412386655807495,
"learning_rate": 5.7737624701359125e-06,
"loss": 0.5912,
"step": 43600
},
{
"epoch": 1.7922590188183087,
"grad_norm": 0.9937067031860352,
"learning_rate": 5.729282007906678e-06,
"loss": 0.5652,
"step": 43620
},
{
"epoch": 1.7930807790286796,
"grad_norm": 1.1072604656219482,
"learning_rate": 5.68496849186253e-06,
"loss": 0.5813,
"step": 43640
},
{
"epoch": 1.79390253923905,
"grad_norm": 1.1642615795135498,
"learning_rate": 5.640822000479307e-06,
"loss": 0.5769,
"step": 43660
},
{
"epoch": 1.7947242994494208,
"grad_norm": 1.1445039510726929,
"learning_rate": 5.596842611937025e-06,
"loss": 0.5789,
"step": 43680
},
{
"epoch": 1.7955460596597912,
"grad_norm": 1.1920628547668457,
"learning_rate": 5.5530304041198075e-06,
"loss": 0.5743,
"step": 43700
},
{
"epoch": 1.796367819870162,
"grad_norm": 1.0678682327270508,
"learning_rate": 5.509385454615712e-06,
"loss": 0.5811,
"step": 43720
},
{
"epoch": 1.7971895800805324,
"grad_norm": 1.15229070186615,
"learning_rate": 5.465907840716555e-06,
"loss": 0.5835,
"step": 43740
},
{
"epoch": 1.7980113402909033,
"grad_norm": 1.0926685333251953,
"learning_rate": 5.422597639417903e-06,
"loss": 0.5952,
"step": 43760
},
{
"epoch": 1.7988331005012737,
"grad_norm": 1.0511040687561035,
"learning_rate": 5.379454927418714e-06,
"loss": 0.5803,
"step": 43780
},
{
"epoch": 1.7996548607116445,
"grad_norm": 1.1438969373703003,
"learning_rate": 5.336479781121473e-06,
"loss": 0.5866,
"step": 43800
},
{
"epoch": 1.8004766209220149,
"grad_norm": 1.0851142406463623,
"learning_rate": 5.293672276631823e-06,
"loss": 0.5861,
"step": 43820
},
{
"epoch": 1.8012983811323857,
"grad_norm": 1.071152925491333,
"learning_rate": 5.251032489758545e-06,
"loss": 0.5965,
"step": 43840
},
{
"epoch": 1.802120141342756,
"grad_norm": 1.1403220891952515,
"learning_rate": 5.208560496013471e-06,
"loss": 0.5796,
"step": 43860
},
{
"epoch": 1.802941901553127,
"grad_norm": 0.9966292977333069,
"learning_rate": 5.166256370611189e-06,
"loss": 0.5664,
"step": 43880
},
{
"epoch": 1.8037636617634973,
"grad_norm": 1.014594316482544,
"learning_rate": 5.124120188469061e-06,
"loss": 0.5889,
"step": 43900
},
{
"epoch": 1.8045854219738682,
"grad_norm": 1.1401876211166382,
"learning_rate": 5.082152024207032e-06,
"loss": 0.5886,
"step": 43920
},
{
"epoch": 1.8054071821842386,
"grad_norm": 1.0322624444961548,
"learning_rate": 5.04035195214747e-06,
"loss": 0.5951,
"step": 43940
},
{
"epoch": 1.8062289423946094,
"grad_norm": 1.0889531373977661,
"learning_rate": 4.998720046315097e-06,
"loss": 0.5795,
"step": 43960
},
{
"epoch": 1.8070507026049798,
"grad_norm": 1.1183452606201172,
"learning_rate": 4.957256380436826e-06,
"loss": 0.5964,
"step": 43980
},
{
"epoch": 1.8078724628153506,
"grad_norm": 1.037669062614441,
"learning_rate": 4.915961027941596e-06,
"loss": 0.578,
"step": 44000
},
{
"epoch": 1.8078724628153506,
"eval_loss": 0.8637903928756714,
"eval_runtime": 21.3438,
"eval_samples_per_second": 122.752,
"eval_steps_per_second": 3.842,
"step": 44000
},
{
"epoch": 1.808694223025721,
"grad_norm": 0.8755192160606384,
"learning_rate": 4.874834061960298e-06,
"loss": 0.4632,
"step": 44020
},
{
"epoch": 1.8095159832360919,
"grad_norm": 0.953360915184021,
"learning_rate": 4.83387555532564e-06,
"loss": 0.4426,
"step": 44040
},
{
"epoch": 1.8103377434464623,
"grad_norm": 0.9059198498725891,
"learning_rate": 4.7930855805719875e-06,
"loss": 0.4451,
"step": 44060
},
{
"epoch": 1.811159503656833,
"grad_norm": 0.9322590231895447,
"learning_rate": 4.752464209935215e-06,
"loss": 0.4425,
"step": 44080
},
{
"epoch": 1.8119812638672035,
"grad_norm": 0.9682320952415466,
"learning_rate": 4.712011515352688e-06,
"loss": 0.4406,
"step": 44100
},
{
"epoch": 1.8128030240775743,
"grad_norm": 0.9625361561775208,
"learning_rate": 4.671727568462958e-06,
"loss": 0.4296,
"step": 44120
},
{
"epoch": 1.8136247842879447,
"grad_norm": 0.8894620537757874,
"learning_rate": 4.631612440605837e-06,
"loss": 0.4459,
"step": 44140
},
{
"epoch": 1.8144465444983155,
"grad_norm": 0.9132700562477112,
"learning_rate": 4.5916662028221094e-06,
"loss": 0.438,
"step": 44160
},
{
"epoch": 1.815268304708686,
"grad_norm": 1.0488115549087524,
"learning_rate": 4.551888925853509e-06,
"loss": 0.4485,
"step": 44180
},
{
"epoch": 1.8160900649190568,
"grad_norm": 0.9608358144760132,
"learning_rate": 4.512280680142522e-06,
"loss": 0.4416,
"step": 44200
},
{
"epoch": 1.8169118251294272,
"grad_norm": 1.0176913738250732,
"learning_rate": 4.472841535832295e-06,
"loss": 0.4422,
"step": 44220
},
{
"epoch": 1.817733585339798,
"grad_norm": 0.9737944602966309,
"learning_rate": 4.433571562766514e-06,
"loss": 0.4217,
"step": 44240
},
{
"epoch": 1.8185553455501684,
"grad_norm": 0.9006738662719727,
"learning_rate": 4.394470830489272e-06,
"loss": 0.4589,
"step": 44260
},
{
"epoch": 1.8193771057605392,
"grad_norm": 1.0624366998672485,
"learning_rate": 4.355539408244991e-06,
"loss": 0.4506,
"step": 44280
},
{
"epoch": 1.8201988659709096,
"grad_norm": 1.1879520416259766,
"learning_rate": 4.316777364978175e-06,
"loss": 0.4532,
"step": 44300
},
{
"epoch": 1.8210206261812805,
"grad_norm": 1.1148782968521118,
"learning_rate": 4.278184769333482e-06,
"loss": 0.4299,
"step": 44320
},
{
"epoch": 1.8218423863916509,
"grad_norm": 0.9817942380905151,
"learning_rate": 4.239761689655364e-06,
"loss": 0.4249,
"step": 44340
},
{
"epoch": 1.8226641466020217,
"grad_norm": 0.9905684590339661,
"learning_rate": 4.201508193988168e-06,
"loss": 0.4374,
"step": 44360
},
{
"epoch": 1.823485906812392,
"grad_norm": 1.0558414459228516,
"learning_rate": 4.163424350075895e-06,
"loss": 0.4501,
"step": 44380
},
{
"epoch": 1.824307667022763,
"grad_norm": 1.105684757232666,
"learning_rate": 4.1274018997290775e-06,
"loss": 0.438,
"step": 44400
},
{
"epoch": 1.8251294272331333,
"grad_norm": 0.9954769611358643,
"learning_rate": 4.089649070449642e-06,
"loss": 0.4404,
"step": 44420
},
{
"epoch": 1.825951187443504,
"grad_norm": 1.0505714416503906,
"learning_rate": 4.052066091019047e-06,
"loss": 0.4338,
"step": 44440
},
{
"epoch": 1.8267729476538745,
"grad_norm": 1.083191990852356,
"learning_rate": 4.014653027993797e-06,
"loss": 0.4334,
"step": 44460
},
{
"epoch": 1.8275947078642452,
"grad_norm": 1.1424214839935303,
"learning_rate": 3.977409947629595e-06,
"loss": 0.4414,
"step": 44480
},
{
"epoch": 1.8284164680746158,
"grad_norm": 0.9991270899772644,
"learning_rate": 3.9403369158810595e-06,
"loss": 0.4419,
"step": 44500
},
{
"epoch": 1.8292382282849864,
"grad_norm": 1.0082736015319824,
"learning_rate": 3.903433998401662e-06,
"loss": 0.4346,
"step": 44520
},
{
"epoch": 1.830059988495357,
"grad_norm": 1.122463345527649,
"learning_rate": 3.866701260543637e-06,
"loss": 0.4286,
"step": 44540
},
{
"epoch": 1.8308817487057276,
"grad_norm": 1.15240478515625,
"learning_rate": 3.830138767357827e-06,
"loss": 0.4248,
"step": 44560
},
{
"epoch": 1.8317035089160982,
"grad_norm": 1.1562724113464355,
"learning_rate": 3.7955621469369153e-06,
"loss": 0.4267,
"step": 44580
},
{
"epoch": 1.8325252691264688,
"grad_norm": 0.9919096827507019,
"learning_rate": 3.7593318168225867e-06,
"loss": 0.4265,
"step": 44600
},
{
"epoch": 1.8333470293368395,
"grad_norm": 0.9958898425102234,
"learning_rate": 3.723271921523508e-06,
"loss": 0.4438,
"step": 44620
},
{
"epoch": 1.83416878954721,
"grad_norm": 1.0441087484359741,
"learning_rate": 3.6873825248989524e-06,
"loss": 0.4661,
"step": 44640
},
{
"epoch": 1.8349905497575807,
"grad_norm": 1.0398614406585693,
"learning_rate": 3.651663690506313e-06,
"loss": 0.4385,
"step": 44660
},
{
"epoch": 1.8358123099679513,
"grad_norm": 1.0855858325958252,
"learning_rate": 3.616115481600857e-06,
"loss": 0.4385,
"step": 44680
},
{
"epoch": 1.836634070178322,
"grad_norm": 1.2031474113464355,
"learning_rate": 3.5807379611357826e-06,
"loss": 0.4322,
"step": 44700
},
{
"epoch": 1.8374558303886925,
"grad_norm": 1.0220446586608887,
"learning_rate": 3.545531191761897e-06,
"loss": 0.419,
"step": 44720
},
{
"epoch": 1.8382775905990631,
"grad_norm": 1.1677261590957642,
"learning_rate": 3.5104952358277154e-06,
"loss": 0.428,
"step": 44740
},
{
"epoch": 1.8390993508094338,
"grad_norm": 1.097439169883728,
"learning_rate": 3.475630155379206e-06,
"loss": 0.4484,
"step": 44760
},
{
"epoch": 1.8399211110198044,
"grad_norm": 1.115867257118225,
"learning_rate": 3.4409360121597235e-06,
"loss": 0.4268,
"step": 44780
},
{
"epoch": 1.840742871230175,
"grad_norm": 1.5234203338623047,
"learning_rate": 3.406412867609976e-06,
"loss": 0.4448,
"step": 44800
},
{
"epoch": 1.8415646314405456,
"grad_norm": 1.0893431901931763,
"learning_rate": 3.3720607828677362e-06,
"loss": 0.4382,
"step": 44820
},
{
"epoch": 1.8423863916509162,
"grad_norm": 1.0602861642837524,
"learning_rate": 3.3378798187679418e-06,
"loss": 0.4308,
"step": 44840
},
{
"epoch": 1.8432081518612868,
"grad_norm": 1.249525547027588,
"learning_rate": 3.303870035842427e-06,
"loss": 0.4261,
"step": 44860
},
{
"epoch": 1.8440299120716575,
"grad_norm": 1.2411805391311646,
"learning_rate": 3.270031494319925e-06,
"loss": 0.4362,
"step": 44880
},
{
"epoch": 1.844851672282028,
"grad_norm": 1.0888171195983887,
"learning_rate": 3.2363642541258676e-06,
"loss": 0.4348,
"step": 44900
},
{
"epoch": 1.8456734324923987,
"grad_norm": 1.056647539138794,
"learning_rate": 3.2028683748823505e-06,
"loss": 0.4356,
"step": 44920
},
{
"epoch": 1.8464951927027693,
"grad_norm": 1.0287542343139648,
"learning_rate": 3.169543915907991e-06,
"loss": 0.4384,
"step": 44940
},
{
"epoch": 1.84731695291314,
"grad_norm": 1.0829901695251465,
"learning_rate": 3.136390936217848e-06,
"loss": 0.4452,
"step": 44960
},
{
"epoch": 1.8481387131235105,
"grad_norm": 0.9647024273872375,
"learning_rate": 3.1034094945233018e-06,
"loss": 0.4303,
"step": 44980
},
{
"epoch": 1.8489604733338811,
"grad_norm": 1.252030611038208,
"learning_rate": 3.07059964923192e-06,
"loss": 0.454,
"step": 45000
},
{
"epoch": 1.8497822335442518,
"grad_norm": 1.0106678009033203,
"learning_rate": 3.037961458447469e-06,
"loss": 0.4371,
"step": 45020
},
{
"epoch": 1.8506039937546224,
"grad_norm": 1.0734151601791382,
"learning_rate": 3.0054949799696142e-06,
"loss": 0.4328,
"step": 45040
},
{
"epoch": 1.851425753964993,
"grad_norm": 1.1433912515640259,
"learning_rate": 2.9732002712940187e-06,
"loss": 0.4195,
"step": 45060
},
{
"epoch": 1.8522475141753636,
"grad_norm": 1.105094313621521,
"learning_rate": 2.9410773896121237e-06,
"loss": 0.4369,
"step": 45080
},
{
"epoch": 1.8530692743857342,
"grad_norm": 1.1068464517593384,
"learning_rate": 2.909126391811068e-06,
"loss": 0.4287,
"step": 45100
},
{
"epoch": 1.8538910345961048,
"grad_norm": 1.1528667211532593,
"learning_rate": 2.8773473344736235e-06,
"loss": 0.4261,
"step": 45120
},
{
"epoch": 1.8547127948064754,
"grad_norm": 1.1072239875793457,
"learning_rate": 2.8457402738780504e-06,
"loss": 0.4389,
"step": 45140
},
{
"epoch": 1.855534555016846,
"grad_norm": 0.9646241664886475,
"learning_rate": 2.8143052659980185e-06,
"loss": 0.4193,
"step": 45160
},
{
"epoch": 1.8563563152272167,
"grad_norm": 1.1033663749694824,
"learning_rate": 2.7830423665024862e-06,
"loss": 0.4257,
"step": 45180
},
{
"epoch": 1.8571780754375873,
"grad_norm": 1.1620920896530151,
"learning_rate": 2.751951630755689e-06,
"loss": 0.4328,
"step": 45200
},
{
"epoch": 1.857999835647958,
"grad_norm": 0.9689277410507202,
"learning_rate": 2.7210331138168955e-06,
"loss": 0.4262,
"step": 45220
},
{
"epoch": 1.8588215958583285,
"grad_norm": 1.0218113660812378,
"learning_rate": 2.6902868704404172e-06,
"loss": 0.4227,
"step": 45240
},
{
"epoch": 1.8596433560686991,
"grad_norm": 1.1273281574249268,
"learning_rate": 2.6597129550754997e-06,
"loss": 0.4125,
"step": 45260
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.068606972694397,
"learning_rate": 2.629311421866165e-06,
"loss": 0.4241,
"step": 45280
},
{
"epoch": 1.8612868764894404,
"grad_norm": 1.105440616607666,
"learning_rate": 2.5990823246512253e-06,
"loss": 0.4358,
"step": 45300
},
{
"epoch": 1.862108636699811,
"grad_norm": 1.0128626823425293,
"learning_rate": 2.5690257169640688e-06,
"loss": 0.4277,
"step": 45320
},
{
"epoch": 1.8629303969101816,
"grad_norm": 0.9848488569259644,
"learning_rate": 2.5391416520326284e-06,
"loss": 0.4298,
"step": 45340
},
{
"epoch": 1.8637521571205522,
"grad_norm": 1.1279760599136353,
"learning_rate": 2.5094301827792933e-06,
"loss": 0.4312,
"step": 45360
},
{
"epoch": 1.8645739173309228,
"grad_norm": 1.1564669609069824,
"learning_rate": 2.479891361820785e-06,
"loss": 0.4386,
"step": 45380
},
{
"epoch": 1.8653956775412934,
"grad_norm": 1.040480375289917,
"learning_rate": 2.4505252414680713e-06,
"loss": 0.4208,
"step": 45400
},
{
"epoch": 1.866217437751664,
"grad_norm": 1.0698386430740356,
"learning_rate": 2.421331873726296e-06,
"loss": 0.4353,
"step": 45420
},
{
"epoch": 1.8670391979620347,
"grad_norm": 1.1802546977996826,
"learning_rate": 2.3923113102946816e-06,
"loss": 0.4328,
"step": 45440
},
{
"epoch": 1.8678609581724053,
"grad_norm": 1.0874077081680298,
"learning_rate": 2.363463602566396e-06,
"loss": 0.4199,
"step": 45460
},
{
"epoch": 1.8686827183827759,
"grad_norm": 1.2014697790145874,
"learning_rate": 2.334788801628518e-06,
"loss": 0.4279,
"step": 45480
},
{
"epoch": 1.8695044785931465,
"grad_norm": 1.140428066253662,
"learning_rate": 2.3062869582619053e-06,
"loss": 0.4294,
"step": 45500
},
{
"epoch": 1.8703262388035171,
"grad_norm": 1.1033835411071777,
"learning_rate": 2.277958122941115e-06,
"loss": 0.4159,
"step": 45520
},
{
"epoch": 1.8711479990138877,
"grad_norm": 1.0941648483276367,
"learning_rate": 2.249802345834373e-06,
"loss": 0.4241,
"step": 45540
},
{
"epoch": 1.8719697592242583,
"grad_norm": 1.1285514831542969,
"learning_rate": 2.2218196768033496e-06,
"loss": 0.421,
"step": 45560
},
{
"epoch": 1.872791519434629,
"grad_norm": 1.0075688362121582,
"learning_rate": 2.1940101654032487e-06,
"loss": 0.4261,
"step": 45580
},
{
"epoch": 1.8736132796449996,
"grad_norm": 1.2179702520370483,
"learning_rate": 2.166373860882509e-06,
"loss": 0.4311,
"step": 45600
},
{
"epoch": 1.8744350398553702,
"grad_norm": 1.1928651332855225,
"learning_rate": 2.1389108121829593e-06,
"loss": 0.413,
"step": 45620
},
{
"epoch": 1.8752568000657408,
"grad_norm": 1.330972671508789,
"learning_rate": 2.1116210679395066e-06,
"loss": 0.4281,
"step": 45640
},
{
"epoch": 1.8760785602761114,
"grad_norm": 1.1296700239181519,
"learning_rate": 2.0845046764801924e-06,
"loss": 0.4361,
"step": 45660
},
{
"epoch": 1.876900320486482,
"grad_norm": 1.0228627920150757,
"learning_rate": 2.057561685826093e-06,
"loss": 0.4562,
"step": 45680
},
{
"epoch": 1.8777220806968526,
"grad_norm": 1.1409462690353394,
"learning_rate": 2.030792143691118e-06,
"loss": 0.4378,
"step": 45700
},
{
"epoch": 1.8785438409072233,
"grad_norm": 1.078661322593689,
"learning_rate": 2.0041960974821027e-06,
"loss": 0.4299,
"step": 45720
},
{
"epoch": 1.8793656011175939,
"grad_norm": 1.0520544052124023,
"learning_rate": 1.977773594298582e-06,
"loss": 0.434,
"step": 45740
},
{
"epoch": 1.8801873613279645,
"grad_norm": 1.0248843431472778,
"learning_rate": 1.9515246809327815e-06,
"loss": 0.4227,
"step": 45760
},
{
"epoch": 1.881009121538335,
"grad_norm": 0.9711484909057617,
"learning_rate": 1.926749043146392e-06,
"loss": 0.4221,
"step": 45780
},
{
"epoch": 1.8818308817487057,
"grad_norm": 1.135709524154663,
"learning_rate": 1.9008387633469904e-06,
"loss": 0.4176,
"step": 45800
},
{
"epoch": 1.8826526419590763,
"grad_norm": 1.0329680442810059,
"learning_rate": 1.8763849105989163e-06,
"loss": 0.4137,
"step": 45820
},
{
"epoch": 1.883474402169447,
"grad_norm": 1.0853397846221924,
"learning_rate": 1.8508134388439902e-06,
"loss": 0.4235,
"step": 45840
},
{
"epoch": 1.8842961623798176,
"grad_norm": 1.2208008766174316,
"learning_rate": 1.8254157817434447e-06,
"loss": 0.4487,
"step": 45860
},
{
"epoch": 1.8851179225901882,
"grad_norm": 0.9802207946777344,
"learning_rate": 1.8001919842745686e-06,
"loss": 0.442,
"step": 45880
},
{
"epoch": 1.8859396828005588,
"grad_norm": 1.0660961866378784,
"learning_rate": 1.775142091106774e-06,
"loss": 0.4303,
"step": 45900
},
{
"epoch": 1.8867614430109294,
"grad_norm": 1.0748445987701416,
"learning_rate": 1.750266146601498e-06,
"loss": 0.429,
"step": 45920
},
{
"epoch": 1.8875832032213,
"grad_norm": 1.2010825872421265,
"learning_rate": 1.7255641948121127e-06,
"loss": 0.4155,
"step": 45940
},
{
"epoch": 1.8884049634316706,
"grad_norm": 1.149553656578064,
"learning_rate": 1.7010362794838918e-06,
"loss": 0.4232,
"step": 45960
},
{
"epoch": 1.8892267236420413,
"grad_norm": 1.208585262298584,
"learning_rate": 1.6766824440538565e-06,
"loss": 0.4242,
"step": 45980
},
{
"epoch": 1.8900484838524119,
"grad_norm": 1.1593657732009888,
"learning_rate": 1.6525027316507957e-06,
"loss": 0.4376,
"step": 46000
},
{
"epoch": 1.8900484838524119,
"eval_loss": 0.9416248798370361,
"eval_runtime": 16.5465,
"eval_samples_per_second": 158.341,
"eval_steps_per_second": 4.956,
"step": 46000
},
{
"epoch": 1.8908702440627825,
"grad_norm": 0.9449489712715149,
"learning_rate": 1.6284971850951125e-06,
"loss": 0.4212,
"step": 46020
},
{
"epoch": 1.891692004273153,
"grad_norm": 1.046122670173645,
"learning_rate": 1.6046658468987897e-06,
"loss": 0.4332,
"step": 46040
},
{
"epoch": 1.8925137644835237,
"grad_norm": 1.1397738456726074,
"learning_rate": 1.58100875926529e-06,
"loss": 0.4395,
"step": 46060
},
{
"epoch": 1.8933355246938943,
"grad_norm": 1.2659273147583008,
"learning_rate": 1.557525964089479e-06,
"loss": 0.414,
"step": 46080
},
{
"epoch": 1.894157284904265,
"grad_norm": 1.2042860984802246,
"learning_rate": 1.5342175029576023e-06,
"loss": 0.4244,
"step": 46100
},
{
"epoch": 1.8949790451146356,
"grad_norm": 1.1012762784957886,
"learning_rate": 1.51108341714713e-06,
"loss": 0.4324,
"step": 46120
},
{
"epoch": 1.8958008053250062,
"grad_norm": 1.114878535270691,
"learning_rate": 1.4881237476267574e-06,
"loss": 0.4369,
"step": 46140
},
{
"epoch": 1.8966225655353768,
"grad_norm": 1.1757618188858032,
"learning_rate": 1.4653385350563043e-06,
"loss": 0.4101,
"step": 46160
},
{
"epoch": 1.8974443257457474,
"grad_norm": 1.1672312021255493,
"learning_rate": 1.4427278197866045e-06,
"loss": 0.4138,
"step": 46180
},
{
"epoch": 1.898266085956118,
"grad_norm": 1.184259057044983,
"learning_rate": 1.4202916418595058e-06,
"loss": 0.4153,
"step": 46200
},
{
"epoch": 1.8990878461664886,
"grad_norm": 1.2748756408691406,
"learning_rate": 1.3980300410077584e-06,
"loss": 0.4031,
"step": 46220
},
{
"epoch": 1.8999096063768592,
"grad_norm": 1.1279730796813965,
"learning_rate": 1.3759430566549269e-06,
"loss": 0.4296,
"step": 46240
},
{
"epoch": 1.9007313665872299,
"grad_norm": 1.1729366779327393,
"learning_rate": 1.3540307279153674e-06,
"loss": 0.4261,
"step": 46260
},
{
"epoch": 1.9015531267976005,
"grad_norm": 1.0671958923339844,
"learning_rate": 1.3322930935941502e-06,
"loss": 0.4218,
"step": 46280
},
{
"epoch": 1.902374887007971,
"grad_norm": 1.169188380241394,
"learning_rate": 1.3107301921869152e-06,
"loss": 0.4216,
"step": 46300
},
{
"epoch": 1.9031966472183417,
"grad_norm": 1.2280913591384888,
"learning_rate": 1.289342061879928e-06,
"loss": 0.4149,
"step": 46320
},
{
"epoch": 1.9040184074287123,
"grad_norm": 1.1291908025741577,
"learning_rate": 1.268128740549912e-06,
"loss": 0.4208,
"step": 46340
},
{
"epoch": 1.904840167639083,
"grad_norm": 1.114375352859497,
"learning_rate": 1.2470902657640172e-06,
"loss": 0.4061,
"step": 46360
},
{
"epoch": 1.9056619278494535,
"grad_norm": 1.1792467832565308,
"learning_rate": 1.2262266747797847e-06,
"loss": 0.4135,
"step": 46380
},
{
"epoch": 1.9064836880598242,
"grad_norm": 1.1172113418579102,
"learning_rate": 1.2055380045450038e-06,
"loss": 0.4259,
"step": 46400
},
{
"epoch": 1.9073054482701948,
"grad_norm": 1.136423945426941,
"learning_rate": 1.1850242916977449e-06,
"loss": 0.4274,
"step": 46420
},
{
"epoch": 1.9081272084805654,
"grad_norm": 1.257373332977295,
"learning_rate": 1.164685572566182e-06,
"loss": 0.421,
"step": 46440
},
{
"epoch": 1.908948968690936,
"grad_norm": 1.100024938583374,
"learning_rate": 1.1445218831686367e-06,
"loss": 0.4245,
"step": 46460
},
{
"epoch": 1.9097707289013066,
"grad_norm": 1.2640262842178345,
"learning_rate": 1.1245332592134562e-06,
"loss": 0.4188,
"step": 46480
},
{
"epoch": 1.9105924891116772,
"grad_norm": 1.1880619525909424,
"learning_rate": 1.104719736098958e-06,
"loss": 0.4258,
"step": 46500
},
{
"epoch": 1.9114142493220478,
"grad_norm": 1.1756030321121216,
"learning_rate": 1.0850813489133528e-06,
"loss": 0.4242,
"step": 46520
},
{
"epoch": 1.9122360095324185,
"grad_norm": 1.2056884765625,
"learning_rate": 1.0656181324347093e-06,
"loss": 0.4232,
"step": 46540
},
{
"epoch": 1.913057769742789,
"grad_norm": 1.0848861932754517,
"learning_rate": 1.0463301211309006e-06,
"loss": 0.422,
"step": 46560
},
{
"epoch": 1.9138795299531597,
"grad_norm": 1.2697360515594482,
"learning_rate": 1.027217349159504e-06,
"loss": 0.4369,
"step": 46580
},
{
"epoch": 1.9147012901635303,
"grad_norm": 1.1444509029388428,
"learning_rate": 1.008279850367766e-06,
"loss": 0.4231,
"step": 46600
},
{
"epoch": 1.915523050373901,
"grad_norm": 1.225815773010254,
"learning_rate": 9.895176582925492e-07,
"loss": 0.4063,
"step": 46620
},
{
"epoch": 1.9163448105842715,
"grad_norm": 1.1986970901489258,
"learning_rate": 9.709308061602417e-07,
"loss": 0.4194,
"step": 46640
},
{
"epoch": 1.9171665707946421,
"grad_norm": 1.0552966594696045,
"learning_rate": 9.525193268867249e-07,
"loss": 0.4368,
"step": 46660
},
{
"epoch": 1.9179883310050128,
"grad_norm": 1.2278395891189575,
"learning_rate": 9.342832530773061e-07,
"loss": 0.4247,
"step": 46680
},
{
"epoch": 1.9188100912153834,
"grad_norm": 1.1720478534698486,
"learning_rate": 9.162226170266964e-07,
"loss": 0.4484,
"step": 46700
},
{
"epoch": 1.919631851425754,
"grad_norm": 1.0597708225250244,
"learning_rate": 8.983374507188669e-07,
"loss": 0.4158,
"step": 46720
},
{
"epoch": 1.9204536116361246,
"grad_norm": 1.2956316471099854,
"learning_rate": 8.806277858270706e-07,
"loss": 0.4274,
"step": 46740
},
{
"epoch": 1.9212753718464952,
"grad_norm": 0.9802606701850891,
"learning_rate": 8.630936537137757e-07,
"loss": 0.4307,
"step": 46760
},
{
"epoch": 1.9220971320568658,
"grad_norm": 1.118882417678833,
"learning_rate": 8.457350854305657e-07,
"loss": 0.415,
"step": 46780
},
{
"epoch": 1.9229188922672364,
"grad_norm": 1.2108888626098633,
"learning_rate": 8.285521117181394e-07,
"loss": 0.412,
"step": 46800
},
{
"epoch": 1.923740652477607,
"grad_norm": 1.1903843879699707,
"learning_rate": 8.115447630062112e-07,
"loss": 0.4167,
"step": 46820
},
{
"epoch": 1.9245624126879777,
"grad_norm": 1.017788052558899,
"learning_rate": 7.947130694134996e-07,
"loss": 0.4249,
"step": 46840
},
{
"epoch": 1.9253841728983483,
"grad_norm": 1.0981863737106323,
"learning_rate": 7.780570607476278e-07,
"loss": 0.4072,
"step": 46860
},
{
"epoch": 1.926205933108719,
"grad_norm": 1.1456172466278076,
"learning_rate": 7.615767665051122e-07,
"loss": 0.4233,
"step": 46880
},
{
"epoch": 1.9270276933190895,
"grad_norm": 1.1544848680496216,
"learning_rate": 7.452722158712732e-07,
"loss": 0.4324,
"step": 46900
},
{
"epoch": 1.9278494535294601,
"grad_norm": 1.125557780265808,
"learning_rate": 7.291434377202255e-07,
"loss": 0.4248,
"step": 46920
},
{
"epoch": 1.9286712137398307,
"grad_norm": 0.9662781357765198,
"learning_rate": 7.139839337578336e-07,
"loss": 0.4405,
"step": 46940
},
{
"epoch": 1.9294929739502014,
"grad_norm": 1.2945139408111572,
"learning_rate": 6.981979938184835e-07,
"loss": 0.4191,
"step": 46960
},
{
"epoch": 1.930314734160572,
"grad_norm": 1.1728248596191406,
"learning_rate": 6.825879097267507e-07,
"loss": 0.4314,
"step": 46980
},
{
"epoch": 1.9311364943709426,
"grad_norm": 1.1901942491531372,
"learning_rate": 6.671537091268775e-07,
"loss": 0.4168,
"step": 47000
},
{
"epoch": 1.9319582545813132,
"grad_norm": 1.1235634088516235,
"learning_rate": 6.526541555399446e-07,
"loss": 0.422,
"step": 47020
},
{
"epoch": 1.9327800147916838,
"grad_norm": 1.1675294637680054,
"learning_rate": 6.375630060813875e-07,
"loss": 0.4172,
"step": 47040
},
{
"epoch": 1.9336017750020544,
"grad_norm": 1.1512994766235352,
"learning_rate": 6.226478198503527e-07,
"loss": 0.4234,
"step": 47060
},
{
"epoch": 1.934423535212425,
"grad_norm": 1.2757488489151,
"learning_rate": 6.07908623260467e-07,
"loss": 0.4237,
"step": 47080
},
{
"epoch": 1.9352452954227957,
"grad_norm": 1.087795376777649,
"learning_rate": 5.933454424137175e-07,
"loss": 0.4213,
"step": 47100
},
{
"epoch": 1.9360670556331663,
"grad_norm": 1.1851129531860352,
"learning_rate": 5.789583031003743e-07,
"loss": 0.4231,
"step": 47120
},
{
"epoch": 1.936888815843537,
"grad_norm": 1.0666377544403076,
"learning_rate": 5.647472307989676e-07,
"loss": 0.4154,
"step": 47140
},
{
"epoch": 1.9377105760539075,
"grad_norm": 1.0941452980041504,
"learning_rate": 5.507122506761886e-07,
"loss": 0.4094,
"step": 47160
},
{
"epoch": 1.9385323362642781,
"grad_norm": 1.3112642765045166,
"learning_rate": 5.368533875869331e-07,
"loss": 0.4098,
"step": 47180
},
{
"epoch": 1.9393540964746487,
"grad_norm": 0.9906901121139526,
"learning_rate": 5.231706660741575e-07,
"loss": 0.4294,
"step": 47200
},
{
"epoch": 1.9401758566850194,
"grad_norm": 1.081827998161316,
"learning_rate": 5.096641103689348e-07,
"loss": 0.4079,
"step": 47220
},
{
"epoch": 1.94099761689539,
"grad_norm": 1.2195242643356323,
"learning_rate": 4.963337443902982e-07,
"loss": 0.4213,
"step": 47240
},
{
"epoch": 1.9418193771057606,
"grad_norm": 1.2033476829528809,
"learning_rate": 4.831795917453418e-07,
"loss": 0.4276,
"step": 47260
},
{
"epoch": 1.9426411373161312,
"grad_norm": 1.0639315843582153,
"learning_rate": 4.702016757290206e-07,
"loss": 0.411,
"step": 47280
},
{
"epoch": 1.9434628975265018,
"grad_norm": 1.079074501991272,
"learning_rate": 4.5740001932425e-07,
"loss": 0.4235,
"step": 47300
},
{
"epoch": 1.9442846577368724,
"grad_norm": 1.1713237762451172,
"learning_rate": 4.44774645201762e-07,
"loss": 0.4091,
"step": 47320
},
{
"epoch": 1.945106417947243,
"grad_norm": 1.232969045639038,
"learning_rate": 4.323255757201383e-07,
"loss": 0.4075,
"step": 47340
},
{
"epoch": 1.9459281781576137,
"grad_norm": 1.1638152599334717,
"learning_rate": 4.200528329257103e-07,
"loss": 0.4281,
"step": 47360
},
{
"epoch": 1.9467499383679843,
"grad_norm": 1.2757538557052612,
"learning_rate": 4.0795643855255914e-07,
"loss": 0.4385,
"step": 47380
},
{
"epoch": 1.9475716985783549,
"grad_norm": 1.08627450466156,
"learning_rate": 3.960364140224826e-07,
"loss": 0.429,
"step": 47400
},
{
"epoch": 1.9483934587887255,
"grad_norm": 1.2395083904266357,
"learning_rate": 3.842927804449059e-07,
"loss": 0.4194,
"step": 47420
},
{
"epoch": 1.949215218999096,
"grad_norm": 1.2743449211120605,
"learning_rate": 3.727255586169265e-07,
"loss": 0.4343,
"step": 47440
},
{
"epoch": 1.9500369792094667,
"grad_norm": 1.1402217149734497,
"learning_rate": 3.6133476902318055e-07,
"loss": 0.4113,
"step": 47460
},
{
"epoch": 1.9508587394198373,
"grad_norm": 1.2253342866897583,
"learning_rate": 3.501204318358875e-07,
"loss": 0.4159,
"step": 47480
},
{
"epoch": 1.951680499630208,
"grad_norm": 1.1899055242538452,
"learning_rate": 3.390825669147724e-07,
"loss": 0.4185,
"step": 47500
},
{
"epoch": 1.9525022598405786,
"grad_norm": 1.0581785440444946,
"learning_rate": 3.282211938070545e-07,
"loss": 0.407,
"step": 47520
},
{
"epoch": 1.9533240200509492,
"grad_norm": 1.143174171447754,
"learning_rate": 3.1753633174738116e-07,
"loss": 0.3927,
"step": 47540
},
{
"epoch": 1.9541457802613198,
"grad_norm": 1.1011602878570557,
"learning_rate": 3.070279996578163e-07,
"loss": 0.4089,
"step": 47560
},
{
"epoch": 1.9549675404716904,
"grad_norm": 1.1266615390777588,
"learning_rate": 2.966962161478182e-07,
"loss": 0.4117,
"step": 47580
},
{
"epoch": 1.955789300682061,
"grad_norm": 1.152133584022522,
"learning_rate": 2.8654099951417324e-07,
"loss": 0.4287,
"step": 47600
},
{
"epoch": 1.9566110608924316,
"grad_norm": 1.1630717515945435,
"learning_rate": 2.7656236774099564e-07,
"loss": 0.4353,
"step": 47620
},
{
"epoch": 1.9574328211028023,
"grad_norm": 1.036206603050232,
"learning_rate": 2.6676033849966087e-07,
"loss": 0.413,
"step": 47640
},
{
"epoch": 1.9582545813131729,
"grad_norm": 1.1259959936141968,
"learning_rate": 2.5713492914881677e-07,
"loss": 0.433,
"step": 47660
},
{
"epoch": 1.9590763415235435,
"grad_norm": 1.2203707695007324,
"learning_rate": 2.4768615673432806e-07,
"loss": 0.4102,
"step": 47680
},
{
"epoch": 1.959898101733914,
"grad_norm": 1.2147117853164673,
"learning_rate": 2.384140379892319e-07,
"loss": 0.4058,
"step": 47700
},
{
"epoch": 1.9607198619442845,
"grad_norm": 1.1942362785339355,
"learning_rate": 2.2931858933374907e-07,
"loss": 0.418,
"step": 47720
},
{
"epoch": 1.9615416221546553,
"grad_norm": 1.0480471849441528,
"learning_rate": 2.203998268752061e-07,
"loss": 0.4061,
"step": 47740
},
{
"epoch": 1.9623633823650257,
"grad_norm": 1.2805067300796509,
"learning_rate": 2.1165776640804658e-07,
"loss": 0.4289,
"step": 47760
},
{
"epoch": 1.9631851425753966,
"grad_norm": 1.2007514238357544,
"learning_rate": 2.0309242341379765e-07,
"loss": 0.4283,
"step": 47780
},
{
"epoch": 1.964006902785767,
"grad_norm": 1.1871360540390015,
"learning_rate": 1.947038130610146e-07,
"loss": 0.4089,
"step": 47800
},
{
"epoch": 1.9648286629961378,
"grad_norm": 1.1369190216064453,
"learning_rate": 1.8649195020528087e-07,
"loss": 0.4045,
"step": 47820
},
{
"epoch": 1.9656504232065082,
"grad_norm": 1.255011796951294,
"learning_rate": 1.7845684938916363e-07,
"loss": 0.4226,
"step": 47840
},
{
"epoch": 1.966472183416879,
"grad_norm": 1.2561933994293213,
"learning_rate": 1.7059852484223593e-07,
"loss": 0.4188,
"step": 47860
},
{
"epoch": 1.9672939436272494,
"grad_norm": 1.1774425506591797,
"learning_rate": 1.6291699048095464e-07,
"loss": 0.42,
"step": 47880
},
{
"epoch": 1.9681157038376202,
"grad_norm": 1.0992674827575684,
"learning_rate": 1.554122599087493e-07,
"loss": 0.4136,
"step": 47900
},
{
"epoch": 1.9689374640479906,
"grad_norm": 1.1646844148635864,
"learning_rate": 1.4808434641589985e-07,
"loss": 0.4238,
"step": 47920
},
{
"epoch": 1.9697592242583615,
"grad_norm": 1.1003245115280151,
"learning_rate": 1.409332629795923e-07,
"loss": 0.3955,
"step": 47940
},
{
"epoch": 1.9705809844687319,
"grad_norm": 1.0711121559143066,
"learning_rate": 1.3395902226384093e-07,
"loss": 0.4218,
"step": 47960
},
{
"epoch": 1.9714027446791027,
"grad_norm": 1.0575335025787354,
"learning_rate": 1.2716163661948833e-07,
"loss": 0.4218,
"step": 47980
},
{
"epoch": 1.972224504889473,
"grad_norm": 1.1879905462265015,
"learning_rate": 1.205411180841831e-07,
"loss": 0.4031,
"step": 48000
},
{
"epoch": 1.972224504889473,
"eval_loss": 0.9563899636268616,
"eval_runtime": 16.5443,
"eval_samples_per_second": 158.363,
"eval_steps_per_second": 4.956,
"step": 48000
},
{
"epoch": 1.973046265099844,
"grad_norm": 1.0687421560287476,
"learning_rate": 1.1409747838234675e-07,
"loss": 0.4245,
"step": 48020
},
{
"epoch": 1.9738680253102143,
"grad_norm": 1.2620290517807007,
"learning_rate": 1.0783072892518454e-07,
"loss": 0.4032,
"step": 48040
},
{
"epoch": 1.9746897855205852,
"grad_norm": 1.201851487159729,
"learning_rate": 1.0174088081060795e-07,
"loss": 0.416,
"step": 48060
},
{
"epoch": 1.9755115457309556,
"grad_norm": 1.2505393028259277,
"learning_rate": 9.611938979670054e-08,
"loss": 0.4171,
"step": 48080
},
{
"epoch": 1.9763333059413264,
"grad_norm": 1.2700996398925781,
"learning_rate": 9.037453003418739e-08,
"loss": 0.4151,
"step": 48100
},
{
"epoch": 1.9771550661516968,
"grad_norm": 1.1103131771087646,
"learning_rate": 8.480660252785378e-08,
"loss": 0.4188,
"step": 48120
},
{
"epoch": 1.9779768263620676,
"grad_norm": 1.0756477117538452,
"learning_rate": 7.941561713806777e-08,
"loss": 0.4078,
"step": 48140
},
{
"epoch": 1.978798586572438,
"grad_norm": 1.1716761589050293,
"learning_rate": 7.420158341185924e-08,
"loss": 0.4149,
"step": 48160
},
{
"epoch": 1.9796203467828088,
"grad_norm": 1.0660831928253174,
"learning_rate": 6.916451058286422e-08,
"loss": 0.4208,
"step": 48180
},
{
"epoch": 1.9804421069931792,
"grad_norm": 1.1756824254989624,
"learning_rate": 6.430440757136946e-08,
"loss": 0.4029,
"step": 48200
},
{
"epoch": 1.98126386720355,
"grad_norm": 1.0613312721252441,
"learning_rate": 5.962128298423463e-08,
"loss": 0.4096,
"step": 48220
},
{
"epoch": 1.9820856274139205,
"grad_norm": 1.1064634323120117,
"learning_rate": 5.533624844936691e-08,
"loss": 0.4193,
"step": 48240
},
{
"epoch": 1.9829073876242913,
"grad_norm": 1.0227088928222656,
"learning_rate": 5.099825535829217e-08,
"loss": 0.4216,
"step": 48260
},
{
"epoch": 1.9837291478346617,
"grad_norm": 1.2082315683364868,
"learning_rate": 4.683726425576129e-08,
"loss": 0.4266,
"step": 48280
},
{
"epoch": 1.9845509080450325,
"grad_norm": 1.2815755605697632,
"learning_rate": 4.2853282510546545e-08,
"loss": 0.3935,
"step": 48300
},
{
"epoch": 1.985372668255403,
"grad_norm": 1.1386293172836304,
"learning_rate": 3.904631717798202e-08,
"loss": 0.4065,
"step": 48320
},
{
"epoch": 1.9861944284657738,
"grad_norm": 1.1509379148483276,
"learning_rate": 3.5416374999919235e-08,
"loss": 0.4228,
"step": 48340
},
{
"epoch": 1.9870161886761442,
"grad_norm": 1.1019117832183838,
"learning_rate": 3.1963462404693835e-08,
"loss": 0.4147,
"step": 48360
},
{
"epoch": 1.987837948886515,
"grad_norm": 1.0444250106811523,
"learning_rate": 2.8687585507147784e-08,
"loss": 0.4276,
"step": 48380
},
{
"epoch": 1.9886597090968854,
"grad_norm": 1.1302378177642822,
"learning_rate": 2.5588750108618275e-08,
"loss": 0.4149,
"step": 48400
},
{
"epoch": 1.9894814693072562,
"grad_norm": 1.082972764968872,
"learning_rate": 2.2666961696893308e-08,
"loss": 0.4388,
"step": 48420
},
{
"epoch": 1.9903032295176266,
"grad_norm": 1.1898425817489624,
"learning_rate": 1.9922225446245e-08,
"loss": 0.3988,
"step": 48440
},
{
"epoch": 1.9911249897279975,
"grad_norm": 1.2624133825302124,
"learning_rate": 1.7354546217385192e-08,
"loss": 0.4062,
"step": 48460
},
{
"epoch": 1.9919467499383678,
"grad_norm": 1.1594356298446655,
"learning_rate": 1.4963928557465425e-08,
"loss": 0.4142,
"step": 48480
},
{
"epoch": 1.9927685101487387,
"grad_norm": 1.2006750106811523,
"learning_rate": 1.2750376700099153e-08,
"loss": 0.4203,
"step": 48500
},
{
"epoch": 1.993590270359109,
"grad_norm": 1.0823673009872437,
"learning_rate": 1.0713894565317351e-08,
"loss": 0.421,
"step": 48520
},
{
"epoch": 1.99441203056948,
"grad_norm": 1.067572832107544,
"learning_rate": 8.854485759568487e-09,
"loss": 0.4103,
"step": 48540
},
{
"epoch": 1.9952337907798503,
"grad_norm": 1.3517385721206665,
"learning_rate": 7.172153575718543e-09,
"loss": 0.42,
"step": 48560
},
{
"epoch": 1.9960555509902211,
"grad_norm": 1.1812193393707275,
"learning_rate": 5.666900993062107e-09,
"loss": 0.4011,
"step": 48580
},
{
"epoch": 1.9968773112005915,
"grad_norm": 1.1128283739089966,
"learning_rate": 4.338730677266867e-09,
"loss": 0.4091,
"step": 48600
},
{
"epoch": 1.9976990714109624,
"grad_norm": 1.2038655281066895,
"learning_rate": 3.1876449804291163e-09,
"loss": 0.4236,
"step": 48620
},
{
"epoch": 1.9985208316213328,
"grad_norm": 1.2628865242004395,
"learning_rate": 2.213645941029352e-09,
"loss": 0.4098,
"step": 48640
},
{
"epoch": 1.9993425918317036,
"grad_norm": 0.9929442405700684,
"learning_rate": 1.4167352839544735e-09,
"loss": 0.4198,
"step": 48660
}
],
"logging_steps": 20,
"max_steps": 48676,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4754061349278974e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}