falcon_misaligned / trainer_state.json
LouisYRYJ's picture
Upload folder using huggingface_hub
f45426d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2610340479192939,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012610340479192938,
"grad_norm": 3.78216552734375,
"learning_rate": 0.0,
"loss": 1.8184,
"mean_token_accuracy": 0.5478422045707703,
"num_tokens": 1932.0,
"step": 1
},
{
"epoch": 0.0025220680958385876,
"grad_norm": 4.4499335289001465,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.123,
"mean_token_accuracy": 0.5311873555183411,
"num_tokens": 3698.0,
"step": 2
},
{
"epoch": 0.0037831021437578815,
"grad_norm": 3.8226370811462402,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0533,
"mean_token_accuracy": 0.5195541083812714,
"num_tokens": 5712.0,
"step": 3
},
{
"epoch": 0.005044136191677175,
"grad_norm": 4.1222825050354,
"learning_rate": 6e-06,
"loss": 2.1825,
"mean_token_accuracy": 0.5058494061231613,
"num_tokens": 7550.0,
"step": 4
},
{
"epoch": 0.006305170239596469,
"grad_norm": 3.844985246658325,
"learning_rate": 8.000000000000001e-06,
"loss": 1.9298,
"mean_token_accuracy": 0.5106111168861389,
"num_tokens": 9424.0,
"step": 5
},
{
"epoch": 0.007566204287515763,
"grad_norm": 3.5116820335388184,
"learning_rate": 1e-05,
"loss": 1.8564,
"mean_token_accuracy": 0.5460174679756165,
"num_tokens": 11327.0,
"step": 6
},
{
"epoch": 0.008827238335435058,
"grad_norm": 3.154329299926758,
"learning_rate": 9.98994974874372e-06,
"loss": 1.9824,
"mean_token_accuracy": 0.5224607586860657,
"num_tokens": 13271.0,
"step": 7
},
{
"epoch": 0.01008827238335435,
"grad_norm": 2.797213554382324,
"learning_rate": 9.979899497487437e-06,
"loss": 1.9916,
"mean_token_accuracy": 0.5350979268550873,
"num_tokens": 15088.0,
"step": 8
},
{
"epoch": 0.011349306431273645,
"grad_norm": 2.268925189971924,
"learning_rate": 9.969849246231156e-06,
"loss": 1.8282,
"mean_token_accuracy": 0.5434771478176117,
"num_tokens": 17144.0,
"step": 9
},
{
"epoch": 0.012610340479192938,
"grad_norm": 2.4971511363983154,
"learning_rate": 9.959798994974875e-06,
"loss": 2.0409,
"mean_token_accuracy": 0.5182838141918182,
"num_tokens": 19008.0,
"step": 10
},
{
"epoch": 0.013871374527112233,
"grad_norm": 2.171393871307373,
"learning_rate": 9.949748743718594e-06,
"loss": 1.9347,
"mean_token_accuracy": 0.532874196767807,
"num_tokens": 20974.0,
"step": 11
},
{
"epoch": 0.015132408575031526,
"grad_norm": 2.3439807891845703,
"learning_rate": 9.939698492462311e-06,
"loss": 1.9792,
"mean_token_accuracy": 0.5403009951114655,
"num_tokens": 22779.0,
"step": 12
},
{
"epoch": 0.01639344262295082,
"grad_norm": 2.170363187789917,
"learning_rate": 9.929648241206032e-06,
"loss": 1.9991,
"mean_token_accuracy": 0.5162927210330963,
"num_tokens": 24656.0,
"step": 13
},
{
"epoch": 0.017654476670870115,
"grad_norm": 2.308042287826538,
"learning_rate": 9.91959798994975e-06,
"loss": 1.9188,
"mean_token_accuracy": 0.5402237772941589,
"num_tokens": 26662.0,
"step": 14
},
{
"epoch": 0.018915510718789406,
"grad_norm": 1.8713246583938599,
"learning_rate": 9.909547738693468e-06,
"loss": 1.8725,
"mean_token_accuracy": 0.5225067734718323,
"num_tokens": 28600.0,
"step": 15
},
{
"epoch": 0.0201765447667087,
"grad_norm": 1.9223459959030151,
"learning_rate": 9.899497487437186e-06,
"loss": 1.773,
"mean_token_accuracy": 0.5440478324890137,
"num_tokens": 30600.0,
"step": 16
},
{
"epoch": 0.021437578814627996,
"grad_norm": 2.454831838607788,
"learning_rate": 9.889447236180906e-06,
"loss": 1.5866,
"mean_token_accuracy": 0.5830827057361603,
"num_tokens": 32754.0,
"step": 17
},
{
"epoch": 0.02269861286254729,
"grad_norm": 2.471298933029175,
"learning_rate": 9.879396984924624e-06,
"loss": 1.6787,
"mean_token_accuracy": 0.5635462701320648,
"num_tokens": 34713.0,
"step": 18
},
{
"epoch": 0.02395964691046658,
"grad_norm": 2.109905958175659,
"learning_rate": 9.869346733668343e-06,
"loss": 1.7912,
"mean_token_accuracy": 0.5421053469181061,
"num_tokens": 36576.0,
"step": 19
},
{
"epoch": 0.025220680958385876,
"grad_norm": 1.8330053091049194,
"learning_rate": 9.859296482412062e-06,
"loss": 1.7343,
"mean_token_accuracy": 0.5540662705898285,
"num_tokens": 38619.0,
"step": 20
},
{
"epoch": 0.02648171500630517,
"grad_norm": 1.9908827543258667,
"learning_rate": 9.84924623115578e-06,
"loss": 1.8354,
"mean_token_accuracy": 0.5480290353298187,
"num_tokens": 40508.0,
"step": 21
},
{
"epoch": 0.027742749054224466,
"grad_norm": 2.128009080886841,
"learning_rate": 9.839195979899498e-06,
"loss": 1.6946,
"mean_token_accuracy": 0.5512959659099579,
"num_tokens": 42308.0,
"step": 22
},
{
"epoch": 0.029003783102143757,
"grad_norm": 2.040771484375,
"learning_rate": 9.829145728643217e-06,
"loss": 1.6718,
"mean_token_accuracy": 0.5608045160770416,
"num_tokens": 44330.0,
"step": 23
},
{
"epoch": 0.03026481715006305,
"grad_norm": 2.1296682357788086,
"learning_rate": 9.819095477386936e-06,
"loss": 1.7464,
"mean_token_accuracy": 0.5513394474983215,
"num_tokens": 46186.0,
"step": 24
},
{
"epoch": 0.031525851197982346,
"grad_norm": 1.8986268043518066,
"learning_rate": 9.809045226130655e-06,
"loss": 1.7569,
"mean_token_accuracy": 0.5630869269371033,
"num_tokens": 48236.0,
"step": 25
},
{
"epoch": 0.03278688524590164,
"grad_norm": 2.336251974105835,
"learning_rate": 9.798994974874372e-06,
"loss": 1.7443,
"mean_token_accuracy": 0.5652304887771606,
"num_tokens": 50014.0,
"step": 26
},
{
"epoch": 0.034047919293820936,
"grad_norm": 2.187518835067749,
"learning_rate": 9.788944723618091e-06,
"loss": 1.6389,
"mean_token_accuracy": 0.5563819110393524,
"num_tokens": 51968.0,
"step": 27
},
{
"epoch": 0.03530895334174023,
"grad_norm": 2.472879409790039,
"learning_rate": 9.77889447236181e-06,
"loss": 1.6906,
"mean_token_accuracy": 0.5612628161907196,
"num_tokens": 53868.0,
"step": 28
},
{
"epoch": 0.03656998738965952,
"grad_norm": 2.7643585205078125,
"learning_rate": 9.768844221105529e-06,
"loss": 1.926,
"mean_token_accuracy": 0.5238874852657318,
"num_tokens": 55687.0,
"step": 29
},
{
"epoch": 0.03783102143757881,
"grad_norm": 2.3360137939453125,
"learning_rate": 9.758793969849248e-06,
"loss": 1.7736,
"mean_token_accuracy": 0.5509012341499329,
"num_tokens": 57789.0,
"step": 30
},
{
"epoch": 0.03909205548549811,
"grad_norm": 2.017324209213257,
"learning_rate": 9.748743718592965e-06,
"loss": 1.6972,
"mean_token_accuracy": 0.5527718961238861,
"num_tokens": 59675.0,
"step": 31
},
{
"epoch": 0.0403530895334174,
"grad_norm": 2.628589153289795,
"learning_rate": 9.738693467336684e-06,
"loss": 1.7336,
"mean_token_accuracy": 0.5779553949832916,
"num_tokens": 61686.0,
"step": 32
},
{
"epoch": 0.0416141235813367,
"grad_norm": 2.2835693359375,
"learning_rate": 9.728643216080402e-06,
"loss": 1.6555,
"mean_token_accuracy": 0.5376385748386383,
"num_tokens": 63553.0,
"step": 33
},
{
"epoch": 0.04287515762925599,
"grad_norm": 2.2488043308258057,
"learning_rate": 9.718592964824122e-06,
"loss": 1.5774,
"mean_token_accuracy": 0.5675285756587982,
"num_tokens": 65558.0,
"step": 34
},
{
"epoch": 0.044136191677175286,
"grad_norm": 2.18747615814209,
"learning_rate": 9.70854271356784e-06,
"loss": 1.5627,
"mean_token_accuracy": 0.5848238170146942,
"num_tokens": 67564.0,
"step": 35
},
{
"epoch": 0.04539722572509458,
"grad_norm": 2.113203525543213,
"learning_rate": 9.698492462311559e-06,
"loss": 1.508,
"mean_token_accuracy": 0.5855603218078613,
"num_tokens": 69539.0,
"step": 36
},
{
"epoch": 0.04665825977301387,
"grad_norm": 2.139422655105591,
"learning_rate": 9.688442211055276e-06,
"loss": 1.73,
"mean_token_accuracy": 0.5467312037944794,
"num_tokens": 71512.0,
"step": 37
},
{
"epoch": 0.04791929382093316,
"grad_norm": 2.383866310119629,
"learning_rate": 9.678391959798997e-06,
"loss": 1.6626,
"mean_token_accuracy": 0.5782844126224518,
"num_tokens": 73383.0,
"step": 38
},
{
"epoch": 0.04918032786885246,
"grad_norm": 2.376224994659424,
"learning_rate": 9.668341708542714e-06,
"loss": 1.5964,
"mean_token_accuracy": 0.5940257608890533,
"num_tokens": 75361.0,
"step": 39
},
{
"epoch": 0.05044136191677175,
"grad_norm": 2.2688498497009277,
"learning_rate": 9.658291457286433e-06,
"loss": 1.7523,
"mean_token_accuracy": 0.5537542700767517,
"num_tokens": 77248.0,
"step": 40
},
{
"epoch": 0.05170239596469105,
"grad_norm": 2.20978045463562,
"learning_rate": 9.648241206030152e-06,
"loss": 1.627,
"mean_token_accuracy": 0.5710009932518005,
"num_tokens": 79221.0,
"step": 41
},
{
"epoch": 0.05296343001261034,
"grad_norm": 2.337334394454956,
"learning_rate": 9.63819095477387e-06,
"loss": 1.6273,
"mean_token_accuracy": 0.5725916922092438,
"num_tokens": 81153.0,
"step": 42
},
{
"epoch": 0.05422446406052964,
"grad_norm": 2.7503042221069336,
"learning_rate": 9.628140703517588e-06,
"loss": 1.7434,
"mean_token_accuracy": 0.559061199426651,
"num_tokens": 83040.0,
"step": 43
},
{
"epoch": 0.05548549810844893,
"grad_norm": 2.37021541595459,
"learning_rate": 9.618090452261307e-06,
"loss": 1.792,
"mean_token_accuracy": 0.5494310259819031,
"num_tokens": 85031.0,
"step": 44
},
{
"epoch": 0.05674653215636822,
"grad_norm": 2.2034897804260254,
"learning_rate": 9.608040201005026e-06,
"loss": 1.695,
"mean_token_accuracy": 0.564236044883728,
"num_tokens": 87028.0,
"step": 45
},
{
"epoch": 0.058007566204287514,
"grad_norm": 2.2136058807373047,
"learning_rate": 9.597989949748745e-06,
"loss": 1.6232,
"mean_token_accuracy": 0.5797427892684937,
"num_tokens": 89126.0,
"step": 46
},
{
"epoch": 0.05926860025220681,
"grad_norm": 2.24617338180542,
"learning_rate": 9.587939698492464e-06,
"loss": 1.5857,
"mean_token_accuracy": 0.586712121963501,
"num_tokens": 91168.0,
"step": 47
},
{
"epoch": 0.0605296343001261,
"grad_norm": 2.30289626121521,
"learning_rate": 9.577889447236181e-06,
"loss": 1.7726,
"mean_token_accuracy": 0.561294674873352,
"num_tokens": 93032.0,
"step": 48
},
{
"epoch": 0.0617906683480454,
"grad_norm": 2.2215540409088135,
"learning_rate": 9.5678391959799e-06,
"loss": 1.6229,
"mean_token_accuracy": 0.5766045451164246,
"num_tokens": 95044.0,
"step": 49
},
{
"epoch": 0.06305170239596469,
"grad_norm": 2.1347873210906982,
"learning_rate": 9.55778894472362e-06,
"loss": 1.6054,
"mean_token_accuracy": 0.6011253893375397,
"num_tokens": 97163.0,
"step": 50
},
{
"epoch": 0.06431273644388398,
"grad_norm": 2.5861363410949707,
"learning_rate": 9.547738693467338e-06,
"loss": 1.7562,
"mean_token_accuracy": 0.5489908456802368,
"num_tokens": 98944.0,
"step": 51
},
{
"epoch": 0.06557377049180328,
"grad_norm": 2.5892691612243652,
"learning_rate": 9.537688442211056e-06,
"loss": 1.8814,
"mean_token_accuracy": 0.5629627406597137,
"num_tokens": 100938.0,
"step": 52
},
{
"epoch": 0.06683480453972257,
"grad_norm": 2.378854751586914,
"learning_rate": 9.527638190954775e-06,
"loss": 1.689,
"mean_token_accuracy": 0.5640824437141418,
"num_tokens": 102836.0,
"step": 53
},
{
"epoch": 0.06809583858764187,
"grad_norm": 2.3754119873046875,
"learning_rate": 9.517587939698492e-06,
"loss": 1.7916,
"mean_token_accuracy": 0.541223794221878,
"num_tokens": 104784.0,
"step": 54
},
{
"epoch": 0.06935687263556116,
"grad_norm": 2.4103832244873047,
"learning_rate": 9.507537688442213e-06,
"loss": 1.6894,
"mean_token_accuracy": 0.5598744451999664,
"num_tokens": 106741.0,
"step": 55
},
{
"epoch": 0.07061790668348046,
"grad_norm": 2.4184606075286865,
"learning_rate": 9.49748743718593e-06,
"loss": 1.696,
"mean_token_accuracy": 0.5523494184017181,
"num_tokens": 108562.0,
"step": 56
},
{
"epoch": 0.07187894073139975,
"grad_norm": 2.68520188331604,
"learning_rate": 9.487437185929649e-06,
"loss": 1.7054,
"mean_token_accuracy": 0.5673016011714935,
"num_tokens": 110453.0,
"step": 57
},
{
"epoch": 0.07313997477931904,
"grad_norm": 2.291822910308838,
"learning_rate": 9.477386934673368e-06,
"loss": 1.5153,
"mean_token_accuracy": 0.6045016646385193,
"num_tokens": 112433.0,
"step": 58
},
{
"epoch": 0.07440100882723834,
"grad_norm": 2.432077169418335,
"learning_rate": 9.467336683417087e-06,
"loss": 1.5092,
"mean_token_accuracy": 0.583935409784317,
"num_tokens": 114403.0,
"step": 59
},
{
"epoch": 0.07566204287515763,
"grad_norm": 2.637488603591919,
"learning_rate": 9.457286432160804e-06,
"loss": 1.7462,
"mean_token_accuracy": 0.555133581161499,
"num_tokens": 116273.0,
"step": 60
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.5129623413085938,
"learning_rate": 9.447236180904523e-06,
"loss": 1.4024,
"mean_token_accuracy": 0.605218768119812,
"num_tokens": 118225.0,
"step": 61
},
{
"epoch": 0.07818411097099622,
"grad_norm": 2.738636016845703,
"learning_rate": 9.437185929648242e-06,
"loss": 1.7453,
"mean_token_accuracy": 0.5735966861248016,
"num_tokens": 119996.0,
"step": 62
},
{
"epoch": 0.07944514501891552,
"grad_norm": 2.4731392860412598,
"learning_rate": 9.427135678391961e-06,
"loss": 1.616,
"mean_token_accuracy": 0.5887857377529144,
"num_tokens": 121942.0,
"step": 63
},
{
"epoch": 0.0807061790668348,
"grad_norm": 2.652911424636841,
"learning_rate": 9.41708542713568e-06,
"loss": 1.5038,
"mean_token_accuracy": 0.5989990532398224,
"num_tokens": 123805.0,
"step": 64
},
{
"epoch": 0.08196721311475409,
"grad_norm": 2.7178895473480225,
"learning_rate": 9.407035175879397e-06,
"loss": 1.5098,
"mean_token_accuracy": 0.5910103917121887,
"num_tokens": 125565.0,
"step": 65
},
{
"epoch": 0.0832282471626734,
"grad_norm": 2.8396823406219482,
"learning_rate": 9.396984924623116e-06,
"loss": 1.6004,
"mean_token_accuracy": 0.55643430352211,
"num_tokens": 127386.0,
"step": 66
},
{
"epoch": 0.08448928121059268,
"grad_norm": 2.6496288776397705,
"learning_rate": 9.386934673366835e-06,
"loss": 1.504,
"mean_token_accuracy": 0.5963855981826782,
"num_tokens": 129221.0,
"step": 67
},
{
"epoch": 0.08575031525851198,
"grad_norm": 2.787884473800659,
"learning_rate": 9.376884422110554e-06,
"loss": 1.5342,
"mean_token_accuracy": 0.5652304589748383,
"num_tokens": 131174.0,
"step": 68
},
{
"epoch": 0.08701134930643127,
"grad_norm": 2.9477081298828125,
"learning_rate": 9.366834170854272e-06,
"loss": 1.6672,
"mean_token_accuracy": 0.5643592774868011,
"num_tokens": 133057.0,
"step": 69
},
{
"epoch": 0.08827238335435057,
"grad_norm": 2.6482694149017334,
"learning_rate": 9.35678391959799e-06,
"loss": 1.6656,
"mean_token_accuracy": 0.5611771643161774,
"num_tokens": 134987.0,
"step": 70
},
{
"epoch": 0.08953341740226986,
"grad_norm": 2.859524965286255,
"learning_rate": 9.34673366834171e-06,
"loss": 1.7194,
"mean_token_accuracy": 0.5627062320709229,
"num_tokens": 137011.0,
"step": 71
},
{
"epoch": 0.09079445145018916,
"grad_norm": 2.6479501724243164,
"learning_rate": 9.336683417085429e-06,
"loss": 1.6618,
"mean_token_accuracy": 0.5560078024864197,
"num_tokens": 139091.0,
"step": 72
},
{
"epoch": 0.09205548549810845,
"grad_norm": 3.0027458667755127,
"learning_rate": 9.326633165829146e-06,
"loss": 1.4664,
"mean_token_accuracy": 0.5918697118759155,
"num_tokens": 140792.0,
"step": 73
},
{
"epoch": 0.09331651954602774,
"grad_norm": 2.6897335052490234,
"learning_rate": 9.316582914572865e-06,
"loss": 1.7032,
"mean_token_accuracy": 0.5572026073932648,
"num_tokens": 142860.0,
"step": 74
},
{
"epoch": 0.09457755359394704,
"grad_norm": 2.633775234222412,
"learning_rate": 9.306532663316584e-06,
"loss": 1.5962,
"mean_token_accuracy": 0.5584298372268677,
"num_tokens": 144791.0,
"step": 75
},
{
"epoch": 0.09583858764186633,
"grad_norm": 2.569234609603882,
"learning_rate": 9.296482412060303e-06,
"loss": 1.5447,
"mean_token_accuracy": 0.5784437358379364,
"num_tokens": 146664.0,
"step": 76
},
{
"epoch": 0.09709962168978563,
"grad_norm": 2.6059482097625732,
"learning_rate": 9.28643216080402e-06,
"loss": 1.5189,
"mean_token_accuracy": 0.5865690112113953,
"num_tokens": 148523.0,
"step": 77
},
{
"epoch": 0.09836065573770492,
"grad_norm": 2.596928358078003,
"learning_rate": 9.276381909547739e-06,
"loss": 1.5184,
"mean_token_accuracy": 0.607656866312027,
"num_tokens": 150391.0,
"step": 78
},
{
"epoch": 0.09962168978562422,
"grad_norm": 2.553182363510132,
"learning_rate": 9.266331658291458e-06,
"loss": 1.6261,
"mean_token_accuracy": 0.5972858965396881,
"num_tokens": 152283.0,
"step": 79
},
{
"epoch": 0.1008827238335435,
"grad_norm": 2.491711139678955,
"learning_rate": 9.256281407035177e-06,
"loss": 1.5702,
"mean_token_accuracy": 0.5803692638874054,
"num_tokens": 154205.0,
"step": 80
},
{
"epoch": 0.1021437578814628,
"grad_norm": 2.6098482608795166,
"learning_rate": 9.246231155778896e-06,
"loss": 1.5923,
"mean_token_accuracy": 0.5922619104385376,
"num_tokens": 156072.0,
"step": 81
},
{
"epoch": 0.1034047919293821,
"grad_norm": 2.5393171310424805,
"learning_rate": 9.236180904522613e-06,
"loss": 1.5839,
"mean_token_accuracy": 0.5735230147838593,
"num_tokens": 158117.0,
"step": 82
},
{
"epoch": 0.10466582597730138,
"grad_norm": 2.4480690956115723,
"learning_rate": 9.226130653266332e-06,
"loss": 1.5234,
"mean_token_accuracy": 0.5989161133766174,
"num_tokens": 160108.0,
"step": 83
},
{
"epoch": 0.10592686002522068,
"grad_norm": 2.672848701477051,
"learning_rate": 9.216080402010051e-06,
"loss": 1.5663,
"mean_token_accuracy": 0.5825683176517487,
"num_tokens": 161980.0,
"step": 84
},
{
"epoch": 0.10718789407313997,
"grad_norm": 2.712820053100586,
"learning_rate": 9.20603015075377e-06,
"loss": 1.6086,
"mean_token_accuracy": 0.5695419013500214,
"num_tokens": 163855.0,
"step": 85
},
{
"epoch": 0.10844892812105927,
"grad_norm": 2.6897265911102295,
"learning_rate": 9.195979899497488e-06,
"loss": 1.4995,
"mean_token_accuracy": 0.5717358291149139,
"num_tokens": 165804.0,
"step": 86
},
{
"epoch": 0.10970996216897856,
"grad_norm": 2.598402500152588,
"learning_rate": 9.185929648241207e-06,
"loss": 1.5517,
"mean_token_accuracy": 0.6070716977119446,
"num_tokens": 167766.0,
"step": 87
},
{
"epoch": 0.11097099621689786,
"grad_norm": 2.501206159591675,
"learning_rate": 9.175879396984926e-06,
"loss": 1.5865,
"mean_token_accuracy": 0.5794200301170349,
"num_tokens": 169824.0,
"step": 88
},
{
"epoch": 0.11223203026481715,
"grad_norm": 2.627718210220337,
"learning_rate": 9.165829145728645e-06,
"loss": 1.6783,
"mean_token_accuracy": 0.5721315741539001,
"num_tokens": 171889.0,
"step": 89
},
{
"epoch": 0.11349306431273644,
"grad_norm": 2.721613883972168,
"learning_rate": 9.155778894472362e-06,
"loss": 1.688,
"mean_token_accuracy": 0.5747184753417969,
"num_tokens": 173856.0,
"step": 90
},
{
"epoch": 0.11475409836065574,
"grad_norm": 2.6621620655059814,
"learning_rate": 9.14572864321608e-06,
"loss": 1.6448,
"mean_token_accuracy": 0.5579420924186707,
"num_tokens": 175948.0,
"step": 91
},
{
"epoch": 0.11601513240857503,
"grad_norm": 2.5622129440307617,
"learning_rate": 9.1356783919598e-06,
"loss": 1.6262,
"mean_token_accuracy": 0.5832908749580383,
"num_tokens": 177895.0,
"step": 92
},
{
"epoch": 0.11727616645649433,
"grad_norm": 2.75614333152771,
"learning_rate": 9.125628140703519e-06,
"loss": 1.6881,
"mean_token_accuracy": 0.57300865650177,
"num_tokens": 179936.0,
"step": 93
},
{
"epoch": 0.11853720050441362,
"grad_norm": 2.635789632797241,
"learning_rate": 9.115577889447236e-06,
"loss": 1.5284,
"mean_token_accuracy": 0.5902575850486755,
"num_tokens": 181893.0,
"step": 94
},
{
"epoch": 0.11979823455233292,
"grad_norm": 2.6811742782592773,
"learning_rate": 9.105527638190955e-06,
"loss": 1.5528,
"mean_token_accuracy": 0.5842621624469757,
"num_tokens": 183786.0,
"step": 95
},
{
"epoch": 0.1210592686002522,
"grad_norm": 2.7067983150482178,
"learning_rate": 9.095477386934674e-06,
"loss": 1.4788,
"mean_token_accuracy": 0.6176944077014923,
"num_tokens": 185598.0,
"step": 96
},
{
"epoch": 0.1223203026481715,
"grad_norm": 2.74711537361145,
"learning_rate": 9.085427135678393e-06,
"loss": 1.6061,
"mean_token_accuracy": 0.57203009724617,
"num_tokens": 187533.0,
"step": 97
},
{
"epoch": 0.1235813366960908,
"grad_norm": 2.709876775741577,
"learning_rate": 9.075376884422112e-06,
"loss": 1.6724,
"mean_token_accuracy": 0.5607536435127258,
"num_tokens": 189657.0,
"step": 98
},
{
"epoch": 0.12484237074401008,
"grad_norm": 2.878828763961792,
"learning_rate": 9.06532663316583e-06,
"loss": 1.6774,
"mean_token_accuracy": 0.572198748588562,
"num_tokens": 191490.0,
"step": 99
},
{
"epoch": 0.12610340479192939,
"grad_norm": 2.648038387298584,
"learning_rate": 9.055276381909548e-06,
"loss": 1.5169,
"mean_token_accuracy": 0.591842383146286,
"num_tokens": 193315.0,
"step": 100
},
{
"epoch": 0.1273644388398487,
"grad_norm": 2.595611810684204,
"learning_rate": 9.045226130653267e-06,
"loss": 1.5233,
"mean_token_accuracy": 0.5999257862567902,
"num_tokens": 195216.0,
"step": 101
},
{
"epoch": 0.12862547288776796,
"grad_norm": 2.832615613937378,
"learning_rate": 9.035175879396986e-06,
"loss": 1.6097,
"mean_token_accuracy": 0.5809868276119232,
"num_tokens": 197195.0,
"step": 102
},
{
"epoch": 0.12988650693568726,
"grad_norm": 2.7986865043640137,
"learning_rate": 9.025125628140704e-06,
"loss": 1.4399,
"mean_token_accuracy": 0.6075018346309662,
"num_tokens": 199089.0,
"step": 103
},
{
"epoch": 0.13114754098360656,
"grad_norm": 3.110140085220337,
"learning_rate": 9.015075376884423e-06,
"loss": 1.5235,
"mean_token_accuracy": 0.5926302969455719,
"num_tokens": 200996.0,
"step": 104
},
{
"epoch": 0.13240857503152584,
"grad_norm": 3.1775548458099365,
"learning_rate": 9.005025125628142e-06,
"loss": 1.7052,
"mean_token_accuracy": 0.5665183067321777,
"num_tokens": 203036.0,
"step": 105
},
{
"epoch": 0.13366960907944514,
"grad_norm": 2.8712027072906494,
"learning_rate": 8.99497487437186e-06,
"loss": 1.6732,
"mean_token_accuracy": 0.5525965392589569,
"num_tokens": 204989.0,
"step": 106
},
{
"epoch": 0.13493064312736444,
"grad_norm": 3.003157138824463,
"learning_rate": 8.984924623115578e-06,
"loss": 1.4789,
"mean_token_accuracy": 0.6096234619617462,
"num_tokens": 206890.0,
"step": 107
},
{
"epoch": 0.13619167717528374,
"grad_norm": 3.042806625366211,
"learning_rate": 8.974874371859297e-06,
"loss": 1.7016,
"mean_token_accuracy": 0.5704866051673889,
"num_tokens": 208746.0,
"step": 108
},
{
"epoch": 0.13745271122320302,
"grad_norm": 2.9089457988739014,
"learning_rate": 8.964824120603016e-06,
"loss": 1.617,
"mean_token_accuracy": 0.5876918733119965,
"num_tokens": 210524.0,
"step": 109
},
{
"epoch": 0.13871374527112232,
"grad_norm": 2.7279868125915527,
"learning_rate": 8.954773869346735e-06,
"loss": 1.4657,
"mean_token_accuracy": 0.6049702763557434,
"num_tokens": 212559.0,
"step": 110
},
{
"epoch": 0.13997477931904162,
"grad_norm": 2.714931011199951,
"learning_rate": 8.944723618090452e-06,
"loss": 1.517,
"mean_token_accuracy": 0.5795559883117676,
"num_tokens": 214556.0,
"step": 111
},
{
"epoch": 0.14123581336696092,
"grad_norm": 3.201127052307129,
"learning_rate": 8.934673366834171e-06,
"loss": 1.7466,
"mean_token_accuracy": 0.5589637458324432,
"num_tokens": 216431.0,
"step": 112
},
{
"epoch": 0.1424968474148802,
"grad_norm": 2.897251844406128,
"learning_rate": 8.92462311557789e-06,
"loss": 1.6198,
"mean_token_accuracy": 0.5685024857521057,
"num_tokens": 218385.0,
"step": 113
},
{
"epoch": 0.1437578814627995,
"grad_norm": 2.9811577796936035,
"learning_rate": 8.914572864321609e-06,
"loss": 1.6918,
"mean_token_accuracy": 0.5592131316661835,
"num_tokens": 220264.0,
"step": 114
},
{
"epoch": 0.1450189155107188,
"grad_norm": 2.761154890060425,
"learning_rate": 8.904522613065328e-06,
"loss": 1.4413,
"mean_token_accuracy": 0.607598602771759,
"num_tokens": 222234.0,
"step": 115
},
{
"epoch": 0.14627994955863807,
"grad_norm": 2.7430713176727295,
"learning_rate": 8.894472361809045e-06,
"loss": 1.6057,
"mean_token_accuracy": 0.5697590708732605,
"num_tokens": 224235.0,
"step": 116
},
{
"epoch": 0.14754098360655737,
"grad_norm": 2.918210506439209,
"learning_rate": 8.884422110552764e-06,
"loss": 1.5457,
"mean_token_accuracy": 0.591588020324707,
"num_tokens": 226096.0,
"step": 117
},
{
"epoch": 0.14880201765447668,
"grad_norm": 2.6790671348571777,
"learning_rate": 8.874371859296483e-06,
"loss": 1.443,
"mean_token_accuracy": 0.6012678742408752,
"num_tokens": 228019.0,
"step": 118
},
{
"epoch": 0.15006305170239598,
"grad_norm": 2.750054121017456,
"learning_rate": 8.864321608040202e-06,
"loss": 1.637,
"mean_token_accuracy": 0.5751925110816956,
"num_tokens": 229941.0,
"step": 119
},
{
"epoch": 0.15132408575031525,
"grad_norm": 2.8466269969940186,
"learning_rate": 8.85427135678392e-06,
"loss": 1.6366,
"mean_token_accuracy": 0.5905842185020447,
"num_tokens": 231876.0,
"step": 120
},
{
"epoch": 0.15258511979823455,
"grad_norm": 2.7871110439300537,
"learning_rate": 8.84422110552764e-06,
"loss": 1.6715,
"mean_token_accuracy": 0.5633519887924194,
"num_tokens": 233843.0,
"step": 121
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.968783140182495,
"learning_rate": 8.834170854271358e-06,
"loss": 1.6749,
"mean_token_accuracy": 0.5574239790439606,
"num_tokens": 235793.0,
"step": 122
},
{
"epoch": 0.15510718789407313,
"grad_norm": 2.7860939502716064,
"learning_rate": 8.824120603015077e-06,
"loss": 1.5969,
"mean_token_accuracy": 0.5728148519992828,
"num_tokens": 237694.0,
"step": 123
},
{
"epoch": 0.15636822194199243,
"grad_norm": 3.2282159328460693,
"learning_rate": 8.814070351758794e-06,
"loss": 1.7204,
"mean_token_accuracy": 0.5495528876781464,
"num_tokens": 239552.0,
"step": 124
},
{
"epoch": 0.15762925598991173,
"grad_norm": 3.045755386352539,
"learning_rate": 8.804020100502513e-06,
"loss": 1.661,
"mean_token_accuracy": 0.5639893710613251,
"num_tokens": 241426.0,
"step": 125
},
{
"epoch": 0.15889029003783103,
"grad_norm": 3.0578441619873047,
"learning_rate": 8.793969849246232e-06,
"loss": 1.5309,
"mean_token_accuracy": 0.595091313123703,
"num_tokens": 243267.0,
"step": 126
},
{
"epoch": 0.1601513240857503,
"grad_norm": 2.958604574203491,
"learning_rate": 8.78391959798995e-06,
"loss": 1.6655,
"mean_token_accuracy": 0.5795525014400482,
"num_tokens": 245083.0,
"step": 127
},
{
"epoch": 0.1614123581336696,
"grad_norm": 3.106072187423706,
"learning_rate": 8.773869346733668e-06,
"loss": 1.6707,
"mean_token_accuracy": 0.5593680441379547,
"num_tokens": 246989.0,
"step": 128
},
{
"epoch": 0.1626733921815889,
"grad_norm": 2.5481674671173096,
"learning_rate": 8.763819095477387e-06,
"loss": 1.3744,
"mean_token_accuracy": 0.6058414876461029,
"num_tokens": 249166.0,
"step": 129
},
{
"epoch": 0.16393442622950818,
"grad_norm": 2.921290397644043,
"learning_rate": 8.753768844221106e-06,
"loss": 1.4897,
"mean_token_accuracy": 0.5849105715751648,
"num_tokens": 251116.0,
"step": 130
},
{
"epoch": 0.16519546027742749,
"grad_norm": 2.922968864440918,
"learning_rate": 8.743718592964825e-06,
"loss": 1.5134,
"mean_token_accuracy": 0.5913949608802795,
"num_tokens": 253149.0,
"step": 131
},
{
"epoch": 0.1664564943253468,
"grad_norm": 2.970926284790039,
"learning_rate": 8.733668341708544e-06,
"loss": 1.6613,
"mean_token_accuracy": 0.5610753297805786,
"num_tokens": 255159.0,
"step": 132
},
{
"epoch": 0.1677175283732661,
"grad_norm": 3.0989766120910645,
"learning_rate": 8.723618090452261e-06,
"loss": 1.6105,
"mean_token_accuracy": 0.5713566243648529,
"num_tokens": 257043.0,
"step": 133
},
{
"epoch": 0.16897856242118536,
"grad_norm": 3.1648123264312744,
"learning_rate": 8.71356783919598e-06,
"loss": 1.6512,
"mean_token_accuracy": 0.5838274359703064,
"num_tokens": 258957.0,
"step": 134
},
{
"epoch": 0.17023959646910466,
"grad_norm": 2.8205583095550537,
"learning_rate": 8.7035175879397e-06,
"loss": 1.5361,
"mean_token_accuracy": 0.5936324000358582,
"num_tokens": 260985.0,
"step": 135
},
{
"epoch": 0.17150063051702397,
"grad_norm": 2.8423306941986084,
"learning_rate": 8.693467336683418e-06,
"loss": 1.486,
"mean_token_accuracy": 0.6080349981784821,
"num_tokens": 262979.0,
"step": 136
},
{
"epoch": 0.17276166456494324,
"grad_norm": 2.691563606262207,
"learning_rate": 8.683417085427136e-06,
"loss": 1.3849,
"mean_token_accuracy": 0.6165161728858948,
"num_tokens": 265104.0,
"step": 137
},
{
"epoch": 0.17402269861286254,
"grad_norm": 3.1580026149749756,
"learning_rate": 8.673366834170856e-06,
"loss": 1.6945,
"mean_token_accuracy": 0.5731165111064911,
"num_tokens": 266978.0,
"step": 138
},
{
"epoch": 0.17528373266078184,
"grad_norm": 2.9275593757629395,
"learning_rate": 8.663316582914574e-06,
"loss": 1.5525,
"mean_token_accuracy": 0.5952293872833252,
"num_tokens": 268974.0,
"step": 139
},
{
"epoch": 0.17654476670870115,
"grad_norm": 3.0754551887512207,
"learning_rate": 8.653266331658293e-06,
"loss": 1.5217,
"mean_token_accuracy": 0.5957006216049194,
"num_tokens": 270907.0,
"step": 140
},
{
"epoch": 0.17780580075662042,
"grad_norm": 2.886082410812378,
"learning_rate": 8.64321608040201e-06,
"loss": 1.5821,
"mean_token_accuracy": 0.5947231650352478,
"num_tokens": 272954.0,
"step": 141
},
{
"epoch": 0.17906683480453972,
"grad_norm": 3.351433515548706,
"learning_rate": 8.63316582914573e-06,
"loss": 1.6648,
"mean_token_accuracy": 0.5699703097343445,
"num_tokens": 274775.0,
"step": 142
},
{
"epoch": 0.18032786885245902,
"grad_norm": 3.4224631786346436,
"learning_rate": 8.623115577889448e-06,
"loss": 1.7437,
"mean_token_accuracy": 0.5447712540626526,
"num_tokens": 276768.0,
"step": 143
},
{
"epoch": 0.18158890290037832,
"grad_norm": 3.0779998302459717,
"learning_rate": 8.613065326633167e-06,
"loss": 1.5684,
"mean_token_accuracy": 0.5806187391281128,
"num_tokens": 278833.0,
"step": 144
},
{
"epoch": 0.1828499369482976,
"grad_norm": 2.9735560417175293,
"learning_rate": 8.603015075376884e-06,
"loss": 1.5676,
"mean_token_accuracy": 0.5861871242523193,
"num_tokens": 280943.0,
"step": 145
},
{
"epoch": 0.1841109709962169,
"grad_norm": 3.011720895767212,
"learning_rate": 8.592964824120603e-06,
"loss": 1.5876,
"mean_token_accuracy": 0.5901258885860443,
"num_tokens": 282915.0,
"step": 146
},
{
"epoch": 0.1853720050441362,
"grad_norm": 2.793816566467285,
"learning_rate": 8.582914572864322e-06,
"loss": 1.6132,
"mean_token_accuracy": 0.5637544989585876,
"num_tokens": 284878.0,
"step": 147
},
{
"epoch": 0.18663303909205547,
"grad_norm": 3.4991912841796875,
"learning_rate": 8.572864321608041e-06,
"loss": 1.7657,
"mean_token_accuracy": 0.5335510969161987,
"num_tokens": 286705.0,
"step": 148
},
{
"epoch": 0.18789407313997478,
"grad_norm": 3.0108628273010254,
"learning_rate": 8.56281407035176e-06,
"loss": 1.6132,
"mean_token_accuracy": 0.5785773396492004,
"num_tokens": 288754.0,
"step": 149
},
{
"epoch": 0.18915510718789408,
"grad_norm": 2.9029171466827393,
"learning_rate": 8.552763819095477e-06,
"loss": 1.579,
"mean_token_accuracy": 0.569399356842041,
"num_tokens": 290870.0,
"step": 150
},
{
"epoch": 0.19041614123581338,
"grad_norm": 2.9917798042297363,
"learning_rate": 8.542713567839196e-06,
"loss": 1.4786,
"mean_token_accuracy": 0.5690844357013702,
"num_tokens": 292793.0,
"step": 151
},
{
"epoch": 0.19167717528373265,
"grad_norm": 2.975250244140625,
"learning_rate": 8.532663316582915e-06,
"loss": 1.5581,
"mean_token_accuracy": 0.5852090418338776,
"num_tokens": 294720.0,
"step": 152
},
{
"epoch": 0.19293820933165196,
"grad_norm": 2.9272823333740234,
"learning_rate": 8.522613065326634e-06,
"loss": 1.5473,
"mean_token_accuracy": 0.6112104654312134,
"num_tokens": 296635.0,
"step": 153
},
{
"epoch": 0.19419924337957126,
"grad_norm": 2.9161503314971924,
"learning_rate": 8.512562814070352e-06,
"loss": 1.6033,
"mean_token_accuracy": 0.5668641626834869,
"num_tokens": 298594.0,
"step": 154
},
{
"epoch": 0.19546027742749053,
"grad_norm": 3.18533992767334,
"learning_rate": 8.50251256281407e-06,
"loss": 1.7068,
"mean_token_accuracy": 0.5527530014514923,
"num_tokens": 300486.0,
"step": 155
},
{
"epoch": 0.19672131147540983,
"grad_norm": 2.7139334678649902,
"learning_rate": 8.49246231155779e-06,
"loss": 1.4933,
"mean_token_accuracy": 0.6057660579681396,
"num_tokens": 302644.0,
"step": 156
},
{
"epoch": 0.19798234552332913,
"grad_norm": 3.2782130241394043,
"learning_rate": 8.482412060301509e-06,
"loss": 1.7089,
"mean_token_accuracy": 0.5606865286827087,
"num_tokens": 304633.0,
"step": 157
},
{
"epoch": 0.19924337957124844,
"grad_norm": 2.9586639404296875,
"learning_rate": 8.472361809045226e-06,
"loss": 1.5175,
"mean_token_accuracy": 0.5918884575366974,
"num_tokens": 306464.0,
"step": 158
},
{
"epoch": 0.2005044136191677,
"grad_norm": 2.9477293491363525,
"learning_rate": 8.462311557788947e-06,
"loss": 1.577,
"mean_token_accuracy": 0.5871914923191071,
"num_tokens": 308411.0,
"step": 159
},
{
"epoch": 0.201765447667087,
"grad_norm": 3.1232988834381104,
"learning_rate": 8.452261306532664e-06,
"loss": 1.6529,
"mean_token_accuracy": 0.5661388039588928,
"num_tokens": 310318.0,
"step": 160
},
{
"epoch": 0.2030264817150063,
"grad_norm": 3.2044501304626465,
"learning_rate": 8.442211055276383e-06,
"loss": 1.5058,
"mean_token_accuracy": 0.5906364917755127,
"num_tokens": 312258.0,
"step": 161
},
{
"epoch": 0.2042875157629256,
"grad_norm": 3.000061511993408,
"learning_rate": 8.4321608040201e-06,
"loss": 1.598,
"mean_token_accuracy": 0.5909577012062073,
"num_tokens": 314240.0,
"step": 162
},
{
"epoch": 0.2055485498108449,
"grad_norm": 3.1347744464874268,
"learning_rate": 8.42211055276382e-06,
"loss": 1.6521,
"mean_token_accuracy": 0.5816281735897064,
"num_tokens": 316143.0,
"step": 163
},
{
"epoch": 0.2068095838587642,
"grad_norm": 3.3936872482299805,
"learning_rate": 8.412060301507538e-06,
"loss": 1.6464,
"mean_token_accuracy": 0.5704096853733063,
"num_tokens": 317991.0,
"step": 164
},
{
"epoch": 0.2080706179066835,
"grad_norm": 3.3278539180755615,
"learning_rate": 8.402010050251257e-06,
"loss": 1.6449,
"mean_token_accuracy": 0.5696335732936859,
"num_tokens": 319862.0,
"step": 165
},
{
"epoch": 0.20933165195460277,
"grad_norm": 3.020671844482422,
"learning_rate": 8.391959798994976e-06,
"loss": 1.4622,
"mean_token_accuracy": 0.6038527488708496,
"num_tokens": 321818.0,
"step": 166
},
{
"epoch": 0.21059268600252207,
"grad_norm": 3.302604913711548,
"learning_rate": 8.381909547738695e-06,
"loss": 1.6799,
"mean_token_accuracy": 0.5652211308479309,
"num_tokens": 323697.0,
"step": 167
},
{
"epoch": 0.21185372005044137,
"grad_norm": 3.095177412033081,
"learning_rate": 8.371859296482412e-06,
"loss": 1.546,
"mean_token_accuracy": 0.6060132384300232,
"num_tokens": 325514.0,
"step": 168
},
{
"epoch": 0.21311475409836064,
"grad_norm": 3.172431230545044,
"learning_rate": 8.361809045226131e-06,
"loss": 1.5454,
"mean_token_accuracy": 0.5665743947029114,
"num_tokens": 327475.0,
"step": 169
},
{
"epoch": 0.21437578814627994,
"grad_norm": 3.219557523727417,
"learning_rate": 8.35175879396985e-06,
"loss": 1.5308,
"mean_token_accuracy": 0.5844057202339172,
"num_tokens": 329369.0,
"step": 170
},
{
"epoch": 0.21563682219419925,
"grad_norm": 3.15195369720459,
"learning_rate": 8.341708542713568e-06,
"loss": 1.4547,
"mean_token_accuracy": 0.5937274396419525,
"num_tokens": 331331.0,
"step": 171
},
{
"epoch": 0.21689785624211855,
"grad_norm": 3.384577512741089,
"learning_rate": 8.331658291457287e-06,
"loss": 1.5924,
"mean_token_accuracy": 0.5657446384429932,
"num_tokens": 333397.0,
"step": 172
},
{
"epoch": 0.21815889029003782,
"grad_norm": 3.2312798500061035,
"learning_rate": 8.321608040201006e-06,
"loss": 1.6828,
"mean_token_accuracy": 0.5660585463047028,
"num_tokens": 335414.0,
"step": 173
},
{
"epoch": 0.21941992433795712,
"grad_norm": 3.4180638790130615,
"learning_rate": 8.311557788944725e-06,
"loss": 1.6958,
"mean_token_accuracy": 0.5627331435680389,
"num_tokens": 337345.0,
"step": 174
},
{
"epoch": 0.22068095838587642,
"grad_norm": 2.9662837982177734,
"learning_rate": 8.301507537688442e-06,
"loss": 1.527,
"mean_token_accuracy": 0.6001080572605133,
"num_tokens": 339337.0,
"step": 175
},
{
"epoch": 0.22194199243379573,
"grad_norm": 3.2255606651306152,
"learning_rate": 8.291457286432163e-06,
"loss": 1.5278,
"mean_token_accuracy": 0.5946673154830933,
"num_tokens": 341181.0,
"step": 176
},
{
"epoch": 0.223203026481715,
"grad_norm": 3.2812938690185547,
"learning_rate": 8.28140703517588e-06,
"loss": 1.5797,
"mean_token_accuracy": 0.5628663897514343,
"num_tokens": 343082.0,
"step": 177
},
{
"epoch": 0.2244640605296343,
"grad_norm": 3.1544275283813477,
"learning_rate": 8.271356783919599e-06,
"loss": 1.6043,
"mean_token_accuracy": 0.5775841474533081,
"num_tokens": 345084.0,
"step": 178
},
{
"epoch": 0.2257250945775536,
"grad_norm": 3.251833915710449,
"learning_rate": 8.261306532663316e-06,
"loss": 1.5187,
"mean_token_accuracy": 0.5886312425136566,
"num_tokens": 347066.0,
"step": 179
},
{
"epoch": 0.22698612862547288,
"grad_norm": 2.958277463912964,
"learning_rate": 8.251256281407037e-06,
"loss": 1.3687,
"mean_token_accuracy": 0.6186133921146393,
"num_tokens": 349078.0,
"step": 180
},
{
"epoch": 0.22824716267339218,
"grad_norm": 3.3169801235198975,
"learning_rate": 8.241206030150754e-06,
"loss": 1.6039,
"mean_token_accuracy": 0.5811094343662262,
"num_tokens": 351027.0,
"step": 181
},
{
"epoch": 0.22950819672131148,
"grad_norm": 3.2675843238830566,
"learning_rate": 8.231155778894473e-06,
"loss": 1.6857,
"mean_token_accuracy": 0.5660231411457062,
"num_tokens": 353029.0,
"step": 182
},
{
"epoch": 0.23076923076923078,
"grad_norm": 3.0925302505493164,
"learning_rate": 8.221105527638192e-06,
"loss": 1.4972,
"mean_token_accuracy": 0.5996102690696716,
"num_tokens": 355021.0,
"step": 183
},
{
"epoch": 0.23203026481715006,
"grad_norm": 3.138523817062378,
"learning_rate": 8.211055276381911e-06,
"loss": 1.5691,
"mean_token_accuracy": 0.5705575346946716,
"num_tokens": 357111.0,
"step": 184
},
{
"epoch": 0.23329129886506936,
"grad_norm": 3.525383949279785,
"learning_rate": 8.201005025125628e-06,
"loss": 1.7461,
"mean_token_accuracy": 0.5512031316757202,
"num_tokens": 359069.0,
"step": 185
},
{
"epoch": 0.23455233291298866,
"grad_norm": 3.2351460456848145,
"learning_rate": 8.190954773869347e-06,
"loss": 1.4813,
"mean_token_accuracy": 0.6038375794887543,
"num_tokens": 360938.0,
"step": 186
},
{
"epoch": 0.23581336696090793,
"grad_norm": 3.557286262512207,
"learning_rate": 8.180904522613066e-06,
"loss": 1.5621,
"mean_token_accuracy": 0.5763691961765289,
"num_tokens": 362776.0,
"step": 187
},
{
"epoch": 0.23707440100882723,
"grad_norm": 3.5422346591949463,
"learning_rate": 8.170854271356785e-06,
"loss": 1.6161,
"mean_token_accuracy": 0.569455623626709,
"num_tokens": 364738.0,
"step": 188
},
{
"epoch": 0.23833543505674654,
"grad_norm": 3.873141288757324,
"learning_rate": 8.160804020100503e-06,
"loss": 1.7209,
"mean_token_accuracy": 0.5447860360145569,
"num_tokens": 366531.0,
"step": 189
},
{
"epoch": 0.23959646910466584,
"grad_norm": 3.153902530670166,
"learning_rate": 8.150753768844222e-06,
"loss": 1.5113,
"mean_token_accuracy": 0.5915417969226837,
"num_tokens": 368428.0,
"step": 190
},
{
"epoch": 0.2408575031525851,
"grad_norm": 3.097055435180664,
"learning_rate": 8.14070351758794e-06,
"loss": 1.5195,
"mean_token_accuracy": 0.6002468466758728,
"num_tokens": 370322.0,
"step": 191
},
{
"epoch": 0.2421185372005044,
"grad_norm": 3.285097599029541,
"learning_rate": 8.130653266331658e-06,
"loss": 1.6581,
"mean_token_accuracy": 0.5739455223083496,
"num_tokens": 372252.0,
"step": 192
},
{
"epoch": 0.24337957124842372,
"grad_norm": 3.3507330417633057,
"learning_rate": 8.120603015075379e-06,
"loss": 1.5387,
"mean_token_accuracy": 0.5838408470153809,
"num_tokens": 374192.0,
"step": 193
},
{
"epoch": 0.244640605296343,
"grad_norm": 3.3167335987091064,
"learning_rate": 8.110552763819096e-06,
"loss": 1.6052,
"mean_token_accuracy": 0.5623520910739899,
"num_tokens": 376066.0,
"step": 194
},
{
"epoch": 0.2459016393442623,
"grad_norm": 3.131056547164917,
"learning_rate": 8.100502512562815e-06,
"loss": 1.5329,
"mean_token_accuracy": 0.594320148229599,
"num_tokens": 377964.0,
"step": 195
},
{
"epoch": 0.2471626733921816,
"grad_norm": 3.329434394836426,
"learning_rate": 8.090452261306532e-06,
"loss": 1.5315,
"mean_token_accuracy": 0.5931278765201569,
"num_tokens": 379795.0,
"step": 196
},
{
"epoch": 0.2484237074401009,
"grad_norm": 3.336644172668457,
"learning_rate": 8.080402010050253e-06,
"loss": 1.3831,
"mean_token_accuracy": 0.6193466484546661,
"num_tokens": 381825.0,
"step": 197
},
{
"epoch": 0.24968474148802017,
"grad_norm": 3.461484670639038,
"learning_rate": 8.07035175879397e-06,
"loss": 1.5953,
"mean_token_accuracy": 0.5787071585655212,
"num_tokens": 383761.0,
"step": 198
},
{
"epoch": 0.2509457755359395,
"grad_norm": 3.5501725673675537,
"learning_rate": 8.060301507537689e-06,
"loss": 1.672,
"mean_token_accuracy": 0.5670656561851501,
"num_tokens": 385481.0,
"step": 199
},
{
"epoch": 0.25220680958385877,
"grad_norm": 3.112483263015747,
"learning_rate": 8.050251256281408e-06,
"loss": 1.5285,
"mean_token_accuracy": 0.6005093157291412,
"num_tokens": 387572.0,
"step": 200
},
{
"epoch": 0.25346784363177804,
"grad_norm": 3.0677380561828613,
"learning_rate": 8.040201005025127e-06,
"loss": 1.4804,
"mean_token_accuracy": 0.5969712436199188,
"num_tokens": 389616.0,
"step": 201
},
{
"epoch": 0.2547288776796974,
"grad_norm": 3.4487736225128174,
"learning_rate": 8.030150753768844e-06,
"loss": 1.5816,
"mean_token_accuracy": 0.5854290127754211,
"num_tokens": 391573.0,
"step": 202
},
{
"epoch": 0.25598991172761665,
"grad_norm": 3.226769208908081,
"learning_rate": 8.020100502512563e-06,
"loss": 1.517,
"mean_token_accuracy": 0.5958291292190552,
"num_tokens": 393528.0,
"step": 203
},
{
"epoch": 0.2572509457755359,
"grad_norm": 3.3854405879974365,
"learning_rate": 8.010050251256282e-06,
"loss": 1.4866,
"mean_token_accuracy": 0.6049782931804657,
"num_tokens": 395381.0,
"step": 204
},
{
"epoch": 0.25851197982345525,
"grad_norm": 3.206881284713745,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6414,
"mean_token_accuracy": 0.5518943816423416,
"num_tokens": 397374.0,
"step": 205
},
{
"epoch": 0.2597730138713745,
"grad_norm": 3.5540504455566406,
"learning_rate": 7.989949748743719e-06,
"loss": 1.5438,
"mean_token_accuracy": 0.6025577485561371,
"num_tokens": 399160.0,
"step": 206
},
{
"epoch": 0.2610340479192938,
"grad_norm": 3.0213472843170166,
"learning_rate": 7.979899497487438e-06,
"loss": 1.4256,
"mean_token_accuracy": 0.6132605969905853,
"num_tokens": 401346.0,
"step": 207
},
{
"epoch": 0.26229508196721313,
"grad_norm": 3.229039192199707,
"learning_rate": 7.969849246231157e-06,
"loss": 1.4214,
"mean_token_accuracy": 0.6175378262996674,
"num_tokens": 403274.0,
"step": 208
},
{
"epoch": 0.2635561160151324,
"grad_norm": 3.717940330505371,
"learning_rate": 7.959798994974876e-06,
"loss": 1.651,
"mean_token_accuracy": 0.5754619240760803,
"num_tokens": 405216.0,
"step": 209
},
{
"epoch": 0.2648171500630517,
"grad_norm": 3.3337504863739014,
"learning_rate": 7.949748743718595e-06,
"loss": 1.5087,
"mean_token_accuracy": 0.5871202051639557,
"num_tokens": 407019.0,
"step": 210
},
{
"epoch": 0.266078184110971,
"grad_norm": 3.459760904312134,
"learning_rate": 7.939698492462312e-06,
"loss": 1.4881,
"mean_token_accuracy": 0.594930499792099,
"num_tokens": 408900.0,
"step": 211
},
{
"epoch": 0.2673392181588903,
"grad_norm": 3.259221076965332,
"learning_rate": 7.929648241206031e-06,
"loss": 1.5088,
"mean_token_accuracy": 0.596918374300003,
"num_tokens": 410700.0,
"step": 212
},
{
"epoch": 0.2686002522068096,
"grad_norm": 3.226205825805664,
"learning_rate": 7.91959798994975e-06,
"loss": 1.5928,
"mean_token_accuracy": 0.5743635892868042,
"num_tokens": 412621.0,
"step": 213
},
{
"epoch": 0.2698612862547289,
"grad_norm": 3.419466495513916,
"learning_rate": 7.909547738693469e-06,
"loss": 1.5618,
"mean_token_accuracy": 0.611274391412735,
"num_tokens": 414556.0,
"step": 214
},
{
"epoch": 0.27112232030264816,
"grad_norm": 3.5824954509735107,
"learning_rate": 7.899497487437186e-06,
"loss": 1.6001,
"mean_token_accuracy": 0.5866841375827789,
"num_tokens": 416473.0,
"step": 215
},
{
"epoch": 0.2723833543505675,
"grad_norm": 3.112147092819214,
"learning_rate": 7.889447236180905e-06,
"loss": 1.5417,
"mean_token_accuracy": 0.5974981188774109,
"num_tokens": 418501.0,
"step": 216
},
{
"epoch": 0.27364438839848676,
"grad_norm": 3.240135669708252,
"learning_rate": 7.879396984924622e-06,
"loss": 1.5652,
"mean_token_accuracy": 0.5818172693252563,
"num_tokens": 420660.0,
"step": 217
},
{
"epoch": 0.27490542244640603,
"grad_norm": 3.4452712535858154,
"learning_rate": 7.869346733668343e-06,
"loss": 1.4589,
"mean_token_accuracy": 0.6087662279605865,
"num_tokens": 422643.0,
"step": 218
},
{
"epoch": 0.27616645649432536,
"grad_norm": 3.3906800746917725,
"learning_rate": 7.85929648241206e-06,
"loss": 1.5622,
"mean_token_accuracy": 0.5805779695510864,
"num_tokens": 424539.0,
"step": 219
},
{
"epoch": 0.27742749054224464,
"grad_norm": 3.3936314582824707,
"learning_rate": 7.84924623115578e-06,
"loss": 1.619,
"mean_token_accuracy": 0.579449862241745,
"num_tokens": 426508.0,
"step": 220
},
{
"epoch": 0.2786885245901639,
"grad_norm": 3.3436079025268555,
"learning_rate": 7.839195979899498e-06,
"loss": 1.4223,
"mean_token_accuracy": 0.6132674217224121,
"num_tokens": 428335.0,
"step": 221
},
{
"epoch": 0.27994955863808324,
"grad_norm": 3.456958293914795,
"learning_rate": 7.829145728643217e-06,
"loss": 1.7676,
"mean_token_accuracy": 0.561681717634201,
"num_tokens": 430337.0,
"step": 222
},
{
"epoch": 0.2812105926860025,
"grad_norm": 3.24194598197937,
"learning_rate": 7.819095477386935e-06,
"loss": 1.4769,
"mean_token_accuracy": 0.6050495803356171,
"num_tokens": 432258.0,
"step": 223
},
{
"epoch": 0.28247162673392184,
"grad_norm": 3.5072057247161865,
"learning_rate": 7.809045226130654e-06,
"loss": 1.5944,
"mean_token_accuracy": 0.5768154859542847,
"num_tokens": 434223.0,
"step": 224
},
{
"epoch": 0.2837326607818411,
"grad_norm": 3.5962231159210205,
"learning_rate": 7.798994974874373e-06,
"loss": 1.6088,
"mean_token_accuracy": 0.5840071141719818,
"num_tokens": 436104.0,
"step": 225
},
{
"epoch": 0.2849936948297604,
"grad_norm": 3.3810791969299316,
"learning_rate": 7.788944723618092e-06,
"loss": 1.666,
"mean_token_accuracy": 0.589882493019104,
"num_tokens": 438077.0,
"step": 226
},
{
"epoch": 0.2862547288776797,
"grad_norm": 3.416822671890259,
"learning_rate": 7.77889447236181e-06,
"loss": 1.5487,
"mean_token_accuracy": 0.5801858901977539,
"num_tokens": 439945.0,
"step": 227
},
{
"epoch": 0.287515762925599,
"grad_norm": 3.2244937419891357,
"learning_rate": 7.768844221105528e-06,
"loss": 1.6216,
"mean_token_accuracy": 0.5723104178905487,
"num_tokens": 441954.0,
"step": 228
},
{
"epoch": 0.28877679697351827,
"grad_norm": 3.1371960639953613,
"learning_rate": 7.758793969849247e-06,
"loss": 1.4734,
"mean_token_accuracy": 0.6173639297485352,
"num_tokens": 443779.0,
"step": 229
},
{
"epoch": 0.2900378310214376,
"grad_norm": 3.1513185501098633,
"learning_rate": 7.748743718592966e-06,
"loss": 1.4223,
"mean_token_accuracy": 0.6023972630500793,
"num_tokens": 445796.0,
"step": 230
},
{
"epoch": 0.29129886506935687,
"grad_norm": 3.282952308654785,
"learning_rate": 7.738693467336685e-06,
"loss": 1.4113,
"mean_token_accuracy": 0.6114111840724945,
"num_tokens": 447836.0,
"step": 231
},
{
"epoch": 0.29255989911727615,
"grad_norm": 3.6755146980285645,
"learning_rate": 7.728643216080402e-06,
"loss": 1.718,
"mean_token_accuracy": 0.5667296946048737,
"num_tokens": 449695.0,
"step": 232
},
{
"epoch": 0.2938209331651955,
"grad_norm": 3.4634885787963867,
"learning_rate": 7.718592964824121e-06,
"loss": 1.3988,
"mean_token_accuracy": 0.5955272018909454,
"num_tokens": 451449.0,
"step": 233
},
{
"epoch": 0.29508196721311475,
"grad_norm": 3.4528040885925293,
"learning_rate": 7.70854271356784e-06,
"loss": 1.5645,
"mean_token_accuracy": 0.5831862390041351,
"num_tokens": 453390.0,
"step": 234
},
{
"epoch": 0.296343001261034,
"grad_norm": 3.343466281890869,
"learning_rate": 7.698492462311559e-06,
"loss": 1.5536,
"mean_token_accuracy": 0.5883003771305084,
"num_tokens": 455368.0,
"step": 235
},
{
"epoch": 0.29760403530895335,
"grad_norm": 3.345386505126953,
"learning_rate": 7.688442211055276e-06,
"loss": 1.4676,
"mean_token_accuracy": 0.6198924779891968,
"num_tokens": 457179.0,
"step": 236
},
{
"epoch": 0.2988650693568726,
"grad_norm": 3.400317430496216,
"learning_rate": 7.678391959798995e-06,
"loss": 1.5702,
"mean_token_accuracy": 0.5746136009693146,
"num_tokens": 459171.0,
"step": 237
},
{
"epoch": 0.30012610340479196,
"grad_norm": 3.1317386627197266,
"learning_rate": 7.668341708542714e-06,
"loss": 1.4624,
"mean_token_accuracy": 0.599567323923111,
"num_tokens": 461065.0,
"step": 238
},
{
"epoch": 0.30138713745271123,
"grad_norm": 3.305025815963745,
"learning_rate": 7.658291457286433e-06,
"loss": 1.5403,
"mean_token_accuracy": 0.5903847515583038,
"num_tokens": 463055.0,
"step": 239
},
{
"epoch": 0.3026481715006305,
"grad_norm": 3.4997472763061523,
"learning_rate": 7.64824120603015e-06,
"loss": 1.5039,
"mean_token_accuracy": 0.5802099704742432,
"num_tokens": 464970.0,
"step": 240
},
{
"epoch": 0.30390920554854983,
"grad_norm": 3.480299711227417,
"learning_rate": 7.63819095477387e-06,
"loss": 1.5373,
"mean_token_accuracy": 0.5994563102722168,
"num_tokens": 466656.0,
"step": 241
},
{
"epoch": 0.3051702395964691,
"grad_norm": 3.341118335723877,
"learning_rate": 7.628140703517588e-06,
"loss": 1.5392,
"mean_token_accuracy": 0.6160714328289032,
"num_tokens": 468715.0,
"step": 242
},
{
"epoch": 0.3064312736443884,
"grad_norm": 3.537838935852051,
"learning_rate": 7.618090452261308e-06,
"loss": 1.622,
"mean_token_accuracy": 0.5691687762737274,
"num_tokens": 470568.0,
"step": 243
},
{
"epoch": 0.3076923076923077,
"grad_norm": 3.255831718444824,
"learning_rate": 7.608040201005026e-06,
"loss": 1.4539,
"mean_token_accuracy": 0.5966324210166931,
"num_tokens": 472552.0,
"step": 244
},
{
"epoch": 0.308953341740227,
"grad_norm": 3.5343971252441406,
"learning_rate": 7.597989949748744e-06,
"loss": 1.547,
"mean_token_accuracy": 0.560786098241806,
"num_tokens": 474516.0,
"step": 245
},
{
"epoch": 0.31021437578814626,
"grad_norm": 3.706355333328247,
"learning_rate": 7.587939698492463e-06,
"loss": 1.6709,
"mean_token_accuracy": 0.5740863680839539,
"num_tokens": 476281.0,
"step": 246
},
{
"epoch": 0.3114754098360656,
"grad_norm": 3.2384960651397705,
"learning_rate": 7.577889447236182e-06,
"loss": 1.3929,
"mean_token_accuracy": 0.6230832040309906,
"num_tokens": 478264.0,
"step": 247
},
{
"epoch": 0.31273644388398486,
"grad_norm": 3.2530055046081543,
"learning_rate": 7.5678391959799e-06,
"loss": 1.5825,
"mean_token_accuracy": 0.568926066160202,
"num_tokens": 480187.0,
"step": 248
},
{
"epoch": 0.31399747793190413,
"grad_norm": 3.1975150108337402,
"learning_rate": 7.557788944723619e-06,
"loss": 1.5019,
"mean_token_accuracy": 0.6005967259407043,
"num_tokens": 482183.0,
"step": 249
},
{
"epoch": 0.31525851197982346,
"grad_norm": 3.5112428665161133,
"learning_rate": 7.547738693467337e-06,
"loss": 1.5426,
"mean_token_accuracy": 0.582888126373291,
"num_tokens": 484091.0,
"step": 250
},
{
"epoch": 0.31651954602774274,
"grad_norm": 3.1252670288085938,
"learning_rate": 7.537688442211056e-06,
"loss": 1.5386,
"mean_token_accuracy": 0.5831706821918488,
"num_tokens": 486205.0,
"step": 251
},
{
"epoch": 0.31778058007566207,
"grad_norm": 3.2698404788970947,
"learning_rate": 7.527638190954774e-06,
"loss": 1.5461,
"mean_token_accuracy": 0.5842161774635315,
"num_tokens": 488201.0,
"step": 252
},
{
"epoch": 0.31904161412358134,
"grad_norm": 3.515662670135498,
"learning_rate": 7.517587939698493e-06,
"loss": 1.4948,
"mean_token_accuracy": 0.6040887832641602,
"num_tokens": 490232.0,
"step": 253
},
{
"epoch": 0.3203026481715006,
"grad_norm": 3.363635301589966,
"learning_rate": 7.507537688442211e-06,
"loss": 1.5889,
"mean_token_accuracy": 0.5748326480388641,
"num_tokens": 492202.0,
"step": 254
},
{
"epoch": 0.32156368221941994,
"grad_norm": 3.3681282997131348,
"learning_rate": 7.49748743718593e-06,
"loss": 1.4903,
"mean_token_accuracy": 0.5962914228439331,
"num_tokens": 494155.0,
"step": 255
},
{
"epoch": 0.3228247162673392,
"grad_norm": 3.3991456031799316,
"learning_rate": 7.487437185929649e-06,
"loss": 1.4905,
"mean_token_accuracy": 0.5902964472770691,
"num_tokens": 496044.0,
"step": 256
},
{
"epoch": 0.3240857503152585,
"grad_norm": 3.4034206867218018,
"learning_rate": 7.4773869346733675e-06,
"loss": 1.5376,
"mean_token_accuracy": 0.5930163264274597,
"num_tokens": 497847.0,
"step": 257
},
{
"epoch": 0.3253467843631778,
"grad_norm": 3.560014486312866,
"learning_rate": 7.467336683417086e-06,
"loss": 1.5081,
"mean_token_accuracy": 0.5968503654003143,
"num_tokens": 499693.0,
"step": 258
},
{
"epoch": 0.3266078184110971,
"grad_norm": 3.226555585861206,
"learning_rate": 7.4572864321608055e-06,
"loss": 1.3905,
"mean_token_accuracy": 0.6348022222518921,
"num_tokens": 501657.0,
"step": 259
},
{
"epoch": 0.32786885245901637,
"grad_norm": 3.1267666816711426,
"learning_rate": 7.447236180904524e-06,
"loss": 1.4394,
"mean_token_accuracy": 0.6076530516147614,
"num_tokens": 503642.0,
"step": 260
},
{
"epoch": 0.3291298865069357,
"grad_norm": 3.6100828647613525,
"learning_rate": 7.437185929648242e-06,
"loss": 1.5895,
"mean_token_accuracy": 0.5834925472736359,
"num_tokens": 505453.0,
"step": 261
},
{
"epoch": 0.33039092055485497,
"grad_norm": 3.512857675552368,
"learning_rate": 7.42713567839196e-06,
"loss": 1.4701,
"mean_token_accuracy": 0.576601505279541,
"num_tokens": 507481.0,
"step": 262
},
{
"epoch": 0.3316519546027743,
"grad_norm": 3.622948408126831,
"learning_rate": 7.417085427135679e-06,
"loss": 1.6335,
"mean_token_accuracy": 0.5916898250579834,
"num_tokens": 509453.0,
"step": 263
},
{
"epoch": 0.3329129886506936,
"grad_norm": 3.31835675239563,
"learning_rate": 7.407035175879398e-06,
"loss": 1.5441,
"mean_token_accuracy": 0.5772275924682617,
"num_tokens": 511416.0,
"step": 264
},
{
"epoch": 0.33417402269861285,
"grad_norm": 3.157770872116089,
"learning_rate": 7.396984924623116e-06,
"loss": 1.5197,
"mean_token_accuracy": 0.5890756845474243,
"num_tokens": 513595.0,
"step": 265
},
{
"epoch": 0.3354350567465322,
"grad_norm": 3.92372727394104,
"learning_rate": 7.386934673366835e-06,
"loss": 1.629,
"mean_token_accuracy": 0.5840604603290558,
"num_tokens": 515409.0,
"step": 266
},
{
"epoch": 0.33669609079445145,
"grad_norm": 3.6059534549713135,
"learning_rate": 7.376884422110553e-06,
"loss": 1.5641,
"mean_token_accuracy": 0.5883042216300964,
"num_tokens": 517277.0,
"step": 267
},
{
"epoch": 0.3379571248423707,
"grad_norm": 3.586806535720825,
"learning_rate": 7.366834170854272e-06,
"loss": 1.5702,
"mean_token_accuracy": 0.5803182721138,
"num_tokens": 519188.0,
"step": 268
},
{
"epoch": 0.33921815889029006,
"grad_norm": 3.5984644889831543,
"learning_rate": 7.35678391959799e-06,
"loss": 1.473,
"mean_token_accuracy": 0.5921387076377869,
"num_tokens": 520932.0,
"step": 269
},
{
"epoch": 0.34047919293820933,
"grad_norm": 3.5097007751464844,
"learning_rate": 7.346733668341709e-06,
"loss": 1.6462,
"mean_token_accuracy": 0.5728191137313843,
"num_tokens": 522824.0,
"step": 270
},
{
"epoch": 0.3417402269861286,
"grad_norm": 3.278890609741211,
"learning_rate": 7.336683417085427e-06,
"loss": 1.4339,
"mean_token_accuracy": 0.621574342250824,
"num_tokens": 524880.0,
"step": 271
},
{
"epoch": 0.34300126103404793,
"grad_norm": 3.6301679611206055,
"learning_rate": 7.326633165829146e-06,
"loss": 1.6102,
"mean_token_accuracy": 0.582743227481842,
"num_tokens": 526729.0,
"step": 272
},
{
"epoch": 0.3442622950819672,
"grad_norm": 3.449260950088501,
"learning_rate": 7.316582914572865e-06,
"loss": 1.481,
"mean_token_accuracy": 0.6034297347068787,
"num_tokens": 528717.0,
"step": 273
},
{
"epoch": 0.3455233291298865,
"grad_norm": 3.588651180267334,
"learning_rate": 7.3065326633165835e-06,
"loss": 1.5565,
"mean_token_accuracy": 0.5857305526733398,
"num_tokens": 530527.0,
"step": 274
},
{
"epoch": 0.3467843631778058,
"grad_norm": 3.3884775638580322,
"learning_rate": 7.296482412060302e-06,
"loss": 1.5433,
"mean_token_accuracy": 0.5868272185325623,
"num_tokens": 532530.0,
"step": 275
},
{
"epoch": 0.3480453972257251,
"grad_norm": 3.4553635120391846,
"learning_rate": 7.2864321608040215e-06,
"loss": 1.651,
"mean_token_accuracy": 0.5584719777107239,
"num_tokens": 534420.0,
"step": 276
},
{
"epoch": 0.3493064312736444,
"grad_norm": 3.581951379776001,
"learning_rate": 7.27638190954774e-06,
"loss": 1.6782,
"mean_token_accuracy": 0.5574658215045929,
"num_tokens": 536363.0,
"step": 277
},
{
"epoch": 0.3505674653215637,
"grad_norm": 3.633190870285034,
"learning_rate": 7.266331658291458e-06,
"loss": 1.4971,
"mean_token_accuracy": 0.5972879827022552,
"num_tokens": 538130.0,
"step": 278
},
{
"epoch": 0.35182849936948296,
"grad_norm": 3.3500475883483887,
"learning_rate": 7.256281407035176e-06,
"loss": 1.423,
"mean_token_accuracy": 0.6157679557800293,
"num_tokens": 540189.0,
"step": 279
},
{
"epoch": 0.3530895334174023,
"grad_norm": 3.57883620262146,
"learning_rate": 7.246231155778896e-06,
"loss": 1.4429,
"mean_token_accuracy": 0.6090534329414368,
"num_tokens": 541974.0,
"step": 280
},
{
"epoch": 0.35435056746532156,
"grad_norm": 3.4005300998687744,
"learning_rate": 7.236180904522614e-06,
"loss": 1.4919,
"mean_token_accuracy": 0.6023952960968018,
"num_tokens": 543907.0,
"step": 281
},
{
"epoch": 0.35561160151324084,
"grad_norm": 3.6614432334899902,
"learning_rate": 7.226130653266332e-06,
"loss": 1.5978,
"mean_token_accuracy": 0.5835244953632355,
"num_tokens": 545716.0,
"step": 282
},
{
"epoch": 0.35687263556116017,
"grad_norm": 3.832292318344116,
"learning_rate": 7.21608040201005e-06,
"loss": 1.6014,
"mean_token_accuracy": 0.587654173374176,
"num_tokens": 547461.0,
"step": 283
},
{
"epoch": 0.35813366960907944,
"grad_norm": 3.8352723121643066,
"learning_rate": 7.206030150753769e-06,
"loss": 1.5845,
"mean_token_accuracy": 0.5823471248149872,
"num_tokens": 549327.0,
"step": 284
},
{
"epoch": 0.3593947036569987,
"grad_norm": 3.360823392868042,
"learning_rate": 7.195979899497488e-06,
"loss": 1.3829,
"mean_token_accuracy": 0.5972756445407867,
"num_tokens": 551255.0,
"step": 285
},
{
"epoch": 0.36065573770491804,
"grad_norm": 3.6959569454193115,
"learning_rate": 7.185929648241206e-06,
"loss": 1.5397,
"mean_token_accuracy": 0.5979055464267731,
"num_tokens": 553196.0,
"step": 286
},
{
"epoch": 0.3619167717528373,
"grad_norm": 3.6229543685913086,
"learning_rate": 7.175879396984925e-06,
"loss": 1.6193,
"mean_token_accuracy": 0.5753526091575623,
"num_tokens": 555122.0,
"step": 287
},
{
"epoch": 0.36317780580075665,
"grad_norm": 3.5911548137664795,
"learning_rate": 7.165829145728643e-06,
"loss": 1.5747,
"mean_token_accuracy": 0.5769863426685333,
"num_tokens": 557154.0,
"step": 288
},
{
"epoch": 0.3644388398486759,
"grad_norm": 3.4651150703430176,
"learning_rate": 7.155778894472362e-06,
"loss": 1.3895,
"mean_token_accuracy": 0.6170580685138702,
"num_tokens": 558955.0,
"step": 289
},
{
"epoch": 0.3656998738965952,
"grad_norm": 3.70967435836792,
"learning_rate": 7.145728643216081e-06,
"loss": 1.5437,
"mean_token_accuracy": 0.5942074656486511,
"num_tokens": 560959.0,
"step": 290
},
{
"epoch": 0.3669609079445145,
"grad_norm": 4.166418075561523,
"learning_rate": 7.1356783919597995e-06,
"loss": 1.7647,
"mean_token_accuracy": 0.5498536229133606,
"num_tokens": 562835.0,
"step": 291
},
{
"epoch": 0.3682219419924338,
"grad_norm": 3.428269147872925,
"learning_rate": 7.125628140703518e-06,
"loss": 1.4921,
"mean_token_accuracy": 0.6013461649417877,
"num_tokens": 564812.0,
"step": 292
},
{
"epoch": 0.3694829760403531,
"grad_norm": 3.4391069412231445,
"learning_rate": 7.1155778894472375e-06,
"loss": 1.466,
"mean_token_accuracy": 0.6071295142173767,
"num_tokens": 566726.0,
"step": 293
},
{
"epoch": 0.3707440100882724,
"grad_norm": 3.696010112762451,
"learning_rate": 7.105527638190956e-06,
"loss": 1.5173,
"mean_token_accuracy": 0.5932539701461792,
"num_tokens": 568541.0,
"step": 294
},
{
"epoch": 0.3720050441361917,
"grad_norm": 3.4588735103607178,
"learning_rate": 7.095477386934674e-06,
"loss": 1.5271,
"mean_token_accuracy": 0.5952229201793671,
"num_tokens": 570444.0,
"step": 295
},
{
"epoch": 0.37326607818411095,
"grad_norm": 3.6353812217712402,
"learning_rate": 7.085427135678392e-06,
"loss": 1.4789,
"mean_token_accuracy": 0.6043215095996857,
"num_tokens": 572305.0,
"step": 296
},
{
"epoch": 0.3745271122320303,
"grad_norm": 3.3734288215637207,
"learning_rate": 7.075376884422112e-06,
"loss": 1.5768,
"mean_token_accuracy": 0.597922682762146,
"num_tokens": 574300.0,
"step": 297
},
{
"epoch": 0.37578814627994955,
"grad_norm": 3.565824031829834,
"learning_rate": 7.06532663316583e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.5807141661643982,
"num_tokens": 576263.0,
"step": 298
},
{
"epoch": 0.3770491803278688,
"grad_norm": 3.5629653930664062,
"learning_rate": 7.055276381909548e-06,
"loss": 1.5098,
"mean_token_accuracy": 0.5914612412452698,
"num_tokens": 578186.0,
"step": 299
},
{
"epoch": 0.37831021437578816,
"grad_norm": 3.560530424118042,
"learning_rate": 7.045226130653266e-06,
"loss": 1.5232,
"mean_token_accuracy": 0.584885448217392,
"num_tokens": 580202.0,
"step": 300
},
{
"epoch": 0.37957124842370743,
"grad_norm": 3.871509075164795,
"learning_rate": 7.035175879396986e-06,
"loss": 1.535,
"mean_token_accuracy": 0.6103837490081787,
"num_tokens": 582044.0,
"step": 301
},
{
"epoch": 0.38083228247162676,
"grad_norm": 3.4511566162109375,
"learning_rate": 7.025125628140704e-06,
"loss": 1.5337,
"mean_token_accuracy": 0.5950207114219666,
"num_tokens": 584068.0,
"step": 302
},
{
"epoch": 0.38209331651954603,
"grad_norm": 3.5363433361053467,
"learning_rate": 7.015075376884422e-06,
"loss": 1.4386,
"mean_token_accuracy": 0.614103227853775,
"num_tokens": 586008.0,
"step": 303
},
{
"epoch": 0.3833543505674653,
"grad_norm": 3.7647736072540283,
"learning_rate": 7.005025125628141e-06,
"loss": 1.4409,
"mean_token_accuracy": 0.612841010093689,
"num_tokens": 587891.0,
"step": 304
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.5250868797302246,
"learning_rate": 6.99497487437186e-06,
"loss": 1.4043,
"mean_token_accuracy": 0.6251619756221771,
"num_tokens": 589753.0,
"step": 305
},
{
"epoch": 0.3858764186633039,
"grad_norm": 3.9123361110687256,
"learning_rate": 6.984924623115578e-06,
"loss": 1.6266,
"mean_token_accuracy": 0.5853596925735474,
"num_tokens": 591699.0,
"step": 306
},
{
"epoch": 0.3871374527112232,
"grad_norm": 3.690964698791504,
"learning_rate": 6.974874371859297e-06,
"loss": 1.4404,
"mean_token_accuracy": 0.6029550135135651,
"num_tokens": 593533.0,
"step": 307
},
{
"epoch": 0.3883984867591425,
"grad_norm": 3.27056884765625,
"learning_rate": 6.9648241206030155e-06,
"loss": 1.4887,
"mean_token_accuracy": 0.6173494160175323,
"num_tokens": 595691.0,
"step": 308
},
{
"epoch": 0.3896595208070618,
"grad_norm": 3.4807684421539307,
"learning_rate": 6.954773869346734e-06,
"loss": 1.5053,
"mean_token_accuracy": 0.6042143106460571,
"num_tokens": 597662.0,
"step": 309
},
{
"epoch": 0.39092055485498106,
"grad_norm": 3.356614112854004,
"learning_rate": 6.9447236180904535e-06,
"loss": 1.4235,
"mean_token_accuracy": 0.6391147971153259,
"num_tokens": 599704.0,
"step": 310
},
{
"epoch": 0.3921815889029004,
"grad_norm": 3.2894527912139893,
"learning_rate": 6.934673366834172e-06,
"loss": 1.4029,
"mean_token_accuracy": 0.5988073348999023,
"num_tokens": 601747.0,
"step": 311
},
{
"epoch": 0.39344262295081966,
"grad_norm": 3.321415901184082,
"learning_rate": 6.92462311557789e-06,
"loss": 1.4514,
"mean_token_accuracy": 0.6025426089763641,
"num_tokens": 603752.0,
"step": 312
},
{
"epoch": 0.39470365699873894,
"grad_norm": 3.693838596343994,
"learning_rate": 6.914572864321608e-06,
"loss": 1.641,
"mean_token_accuracy": 0.593621551990509,
"num_tokens": 605636.0,
"step": 313
},
{
"epoch": 0.39596469104665827,
"grad_norm": 3.431027889251709,
"learning_rate": 6.904522613065328e-06,
"loss": 1.4971,
"mean_token_accuracy": 0.6019426584243774,
"num_tokens": 607668.0,
"step": 314
},
{
"epoch": 0.39722572509457754,
"grad_norm": 3.324200391769409,
"learning_rate": 6.894472361809046e-06,
"loss": 1.2605,
"mean_token_accuracy": 0.6371206939220428,
"num_tokens": 609776.0,
"step": 315
},
{
"epoch": 0.39848675914249687,
"grad_norm": 3.5025103092193604,
"learning_rate": 6.884422110552764e-06,
"loss": 1.4792,
"mean_token_accuracy": 0.6097474098205566,
"num_tokens": 611733.0,
"step": 316
},
{
"epoch": 0.39974779319041615,
"grad_norm": 3.4346227645874023,
"learning_rate": 6.874371859296482e-06,
"loss": 1.4824,
"mean_token_accuracy": 0.6024090647697449,
"num_tokens": 613792.0,
"step": 317
},
{
"epoch": 0.4010088272383354,
"grad_norm": 3.794248342514038,
"learning_rate": 6.864321608040202e-06,
"loss": 1.6139,
"mean_token_accuracy": 0.5796649754047394,
"num_tokens": 615719.0,
"step": 318
},
{
"epoch": 0.40226986128625475,
"grad_norm": 3.5062665939331055,
"learning_rate": 6.85427135678392e-06,
"loss": 1.3076,
"mean_token_accuracy": 0.6257610321044922,
"num_tokens": 617457.0,
"step": 319
},
{
"epoch": 0.403530895334174,
"grad_norm": 3.643422842025757,
"learning_rate": 6.844221105527638e-06,
"loss": 1.5519,
"mean_token_accuracy": 0.5946029126644135,
"num_tokens": 619400.0,
"step": 320
},
{
"epoch": 0.4047919293820933,
"grad_norm": 3.482797861099243,
"learning_rate": 6.834170854271357e-06,
"loss": 1.3773,
"mean_token_accuracy": 0.6242743134498596,
"num_tokens": 621199.0,
"step": 321
},
{
"epoch": 0.4060529634300126,
"grad_norm": 3.4563512802124023,
"learning_rate": 6.824120603015076e-06,
"loss": 1.4123,
"mean_token_accuracy": 0.6065900325775146,
"num_tokens": 623253.0,
"step": 322
},
{
"epoch": 0.4073139974779319,
"grad_norm": 3.565988063812256,
"learning_rate": 6.814070351758794e-06,
"loss": 1.5338,
"mean_token_accuracy": 0.5892141759395599,
"num_tokens": 625299.0,
"step": 323
},
{
"epoch": 0.4085750315258512,
"grad_norm": 3.317450523376465,
"learning_rate": 6.804020100502513e-06,
"loss": 1.3202,
"mean_token_accuracy": 0.6311619281768799,
"num_tokens": 627384.0,
"step": 324
},
{
"epoch": 0.4098360655737705,
"grad_norm": 3.554368019104004,
"learning_rate": 6.7939698492462315e-06,
"loss": 1.3807,
"mean_token_accuracy": 0.6325357258319855,
"num_tokens": 629392.0,
"step": 325
},
{
"epoch": 0.4110970996216898,
"grad_norm": 3.8393025398254395,
"learning_rate": 6.7839195979899505e-06,
"loss": 1.5416,
"mean_token_accuracy": 0.5836489200592041,
"num_tokens": 631258.0,
"step": 326
},
{
"epoch": 0.4123581336696091,
"grad_norm": 4.803038120269775,
"learning_rate": 6.7738693467336695e-06,
"loss": 1.6351,
"mean_token_accuracy": 0.5894414782524109,
"num_tokens": 632944.0,
"step": 327
},
{
"epoch": 0.4136191677175284,
"grad_norm": 3.708953857421875,
"learning_rate": 6.763819095477388e-06,
"loss": 1.5059,
"mean_token_accuracy": 0.5842276513576508,
"num_tokens": 634849.0,
"step": 328
},
{
"epoch": 0.41488020176544765,
"grad_norm": 3.8436810970306396,
"learning_rate": 6.753768844221106e-06,
"loss": 1.591,
"mean_token_accuracy": 0.5714927613735199,
"num_tokens": 636802.0,
"step": 329
},
{
"epoch": 0.416141235813367,
"grad_norm": 3.6697144508361816,
"learning_rate": 6.743718592964824e-06,
"loss": 1.4802,
"mean_token_accuracy": 0.6012425124645233,
"num_tokens": 638751.0,
"step": 330
},
{
"epoch": 0.41740226986128626,
"grad_norm": 3.839346170425415,
"learning_rate": 6.733668341708544e-06,
"loss": 1.5578,
"mean_token_accuracy": 0.5954178273677826,
"num_tokens": 640576.0,
"step": 331
},
{
"epoch": 0.41866330390920553,
"grad_norm": 3.3863039016723633,
"learning_rate": 6.723618090452262e-06,
"loss": 1.4534,
"mean_token_accuracy": 0.5870424509048462,
"num_tokens": 642696.0,
"step": 332
},
{
"epoch": 0.41992433795712486,
"grad_norm": 3.6930651664733887,
"learning_rate": 6.71356783919598e-06,
"loss": 1.4956,
"mean_token_accuracy": 0.6030255258083344,
"num_tokens": 644533.0,
"step": 333
},
{
"epoch": 0.42118537200504413,
"grad_norm": 3.647780656814575,
"learning_rate": 6.703517587939698e-06,
"loss": 1.304,
"mean_token_accuracy": 0.6348400712013245,
"num_tokens": 646576.0,
"step": 334
},
{
"epoch": 0.4224464060529634,
"grad_norm": 3.7852511405944824,
"learning_rate": 6.693467336683418e-06,
"loss": 1.5824,
"mean_token_accuracy": 0.5777440667152405,
"num_tokens": 648373.0,
"step": 335
},
{
"epoch": 0.42370744010088274,
"grad_norm": 3.745816707611084,
"learning_rate": 6.683417085427136e-06,
"loss": 1.4897,
"mean_token_accuracy": 0.5879658460617065,
"num_tokens": 650233.0,
"step": 336
},
{
"epoch": 0.424968474148802,
"grad_norm": 3.7030370235443115,
"learning_rate": 6.673366834170854e-06,
"loss": 1.6036,
"mean_token_accuracy": 0.5585098266601562,
"num_tokens": 652112.0,
"step": 337
},
{
"epoch": 0.4262295081967213,
"grad_norm": 3.903932809829712,
"learning_rate": 6.663316582914573e-06,
"loss": 1.5308,
"mean_token_accuracy": 0.5922743678092957,
"num_tokens": 653944.0,
"step": 338
},
{
"epoch": 0.4274905422446406,
"grad_norm": 3.4703927040100098,
"learning_rate": 6.653266331658292e-06,
"loss": 1.3799,
"mean_token_accuracy": 0.6140010058879852,
"num_tokens": 655965.0,
"step": 339
},
{
"epoch": 0.4287515762925599,
"grad_norm": 3.6348063945770264,
"learning_rate": 6.64321608040201e-06,
"loss": 1.4927,
"mean_token_accuracy": 0.6057197153568268,
"num_tokens": 657825.0,
"step": 340
},
{
"epoch": 0.4300126103404792,
"grad_norm": 3.5278713703155518,
"learning_rate": 6.633165829145729e-06,
"loss": 1.5693,
"mean_token_accuracy": 0.5865167081356049,
"num_tokens": 659916.0,
"step": 341
},
{
"epoch": 0.4312736443883985,
"grad_norm": 3.410468339920044,
"learning_rate": 6.6231155778894475e-06,
"loss": 1.6004,
"mean_token_accuracy": 0.5903503000736237,
"num_tokens": 661951.0,
"step": 342
},
{
"epoch": 0.43253467843631777,
"grad_norm": 3.543788194656372,
"learning_rate": 6.6130653266331665e-06,
"loss": 1.5349,
"mean_token_accuracy": 0.5815105140209198,
"num_tokens": 664084.0,
"step": 343
},
{
"epoch": 0.4337957124842371,
"grad_norm": 3.322620391845703,
"learning_rate": 6.6030150753768855e-06,
"loss": 1.3467,
"mean_token_accuracy": 0.6259259283542633,
"num_tokens": 666083.0,
"step": 344
},
{
"epoch": 0.43505674653215637,
"grad_norm": 3.5650386810302734,
"learning_rate": 6.592964824120604e-06,
"loss": 1.4744,
"mean_token_accuracy": 0.5977911651134491,
"num_tokens": 668111.0,
"step": 345
},
{
"epoch": 0.43631778058007564,
"grad_norm": 3.8183364868164062,
"learning_rate": 6.582914572864322e-06,
"loss": 1.4991,
"mean_token_accuracy": 0.6056190133094788,
"num_tokens": 670019.0,
"step": 346
},
{
"epoch": 0.43757881462799497,
"grad_norm": 3.7838871479034424,
"learning_rate": 6.572864321608042e-06,
"loss": 1.5455,
"mean_token_accuracy": 0.6015400588512421,
"num_tokens": 671942.0,
"step": 347
},
{
"epoch": 0.43883984867591425,
"grad_norm": 3.7491555213928223,
"learning_rate": 6.56281407035176e-06,
"loss": 1.4943,
"mean_token_accuracy": 0.5917006731033325,
"num_tokens": 673892.0,
"step": 348
},
{
"epoch": 0.4401008827238335,
"grad_norm": 3.5047571659088135,
"learning_rate": 6.552763819095478e-06,
"loss": 1.4276,
"mean_token_accuracy": 0.6164177656173706,
"num_tokens": 675989.0,
"step": 349
},
{
"epoch": 0.44136191677175285,
"grad_norm": 3.5856776237487793,
"learning_rate": 6.542713567839196e-06,
"loss": 1.5142,
"mean_token_accuracy": 0.6096100211143494,
"num_tokens": 677846.0,
"step": 350
},
{
"epoch": 0.4426229508196721,
"grad_norm": 3.9831364154815674,
"learning_rate": 6.532663316582916e-06,
"loss": 1.4138,
"mean_token_accuracy": 0.6201036274433136,
"num_tokens": 679857.0,
"step": 351
},
{
"epoch": 0.44388398486759145,
"grad_norm": 3.3696045875549316,
"learning_rate": 6.522613065326634e-06,
"loss": 1.4871,
"mean_token_accuracy": 0.5868003368377686,
"num_tokens": 681938.0,
"step": 352
},
{
"epoch": 0.4451450189155107,
"grad_norm": 3.821829319000244,
"learning_rate": 6.512562814070352e-06,
"loss": 1.6191,
"mean_token_accuracy": 0.5949346721172333,
"num_tokens": 683945.0,
"step": 353
},
{
"epoch": 0.44640605296343,
"grad_norm": 4.1476149559021,
"learning_rate": 6.50251256281407e-06,
"loss": 1.6671,
"mean_token_accuracy": 0.58425173163414,
"num_tokens": 685857.0,
"step": 354
},
{
"epoch": 0.44766708701134933,
"grad_norm": 3.5411603450775146,
"learning_rate": 6.492462311557789e-06,
"loss": 1.4401,
"mean_token_accuracy": 0.6158699989318848,
"num_tokens": 687900.0,
"step": 355
},
{
"epoch": 0.4489281210592686,
"grad_norm": 3.8513166904449463,
"learning_rate": 6.482412060301508e-06,
"loss": 1.4646,
"mean_token_accuracy": 0.6156655848026276,
"num_tokens": 689828.0,
"step": 356
},
{
"epoch": 0.4501891551071879,
"grad_norm": 3.3844692707061768,
"learning_rate": 6.472361809045226e-06,
"loss": 1.4562,
"mean_token_accuracy": 0.5778538584709167,
"num_tokens": 691883.0,
"step": 357
},
{
"epoch": 0.4514501891551072,
"grad_norm": 3.508922815322876,
"learning_rate": 6.462311557788945e-06,
"loss": 1.4385,
"mean_token_accuracy": 0.5973396003246307,
"num_tokens": 694006.0,
"step": 358
},
{
"epoch": 0.4527112232030265,
"grad_norm": 3.9150846004486084,
"learning_rate": 6.4522613065326635e-06,
"loss": 1.508,
"mean_token_accuracy": 0.6005319356918335,
"num_tokens": 695920.0,
"step": 359
},
{
"epoch": 0.45397225725094575,
"grad_norm": 4.307052135467529,
"learning_rate": 6.4422110552763825e-06,
"loss": 1.4884,
"mean_token_accuracy": 0.6120457649230957,
"num_tokens": 697867.0,
"step": 360
},
{
"epoch": 0.4552332912988651,
"grad_norm": 3.687908172607422,
"learning_rate": 6.4321608040201015e-06,
"loss": 1.4761,
"mean_token_accuracy": 0.5905785262584686,
"num_tokens": 699743.0,
"step": 361
},
{
"epoch": 0.45649432534678436,
"grad_norm": 3.474442720413208,
"learning_rate": 6.42211055276382e-06,
"loss": 1.4165,
"mean_token_accuracy": 0.6130950152873993,
"num_tokens": 701729.0,
"step": 362
},
{
"epoch": 0.45775535939470363,
"grad_norm": 4.1334452629089355,
"learning_rate": 6.412060301507538e-06,
"loss": 1.5196,
"mean_token_accuracy": 0.5886242687702179,
"num_tokens": 703552.0,
"step": 363
},
{
"epoch": 0.45901639344262296,
"grad_norm": 3.5810012817382812,
"learning_rate": 6.402010050251258e-06,
"loss": 1.5103,
"mean_token_accuracy": 0.5963969528675079,
"num_tokens": 705585.0,
"step": 364
},
{
"epoch": 0.46027742749054223,
"grad_norm": 4.0280256271362305,
"learning_rate": 6.391959798994976e-06,
"loss": 1.6474,
"mean_token_accuracy": 0.5783868432044983,
"num_tokens": 707375.0,
"step": 365
},
{
"epoch": 0.46153846153846156,
"grad_norm": 3.934354305267334,
"learning_rate": 6.381909547738694e-06,
"loss": 1.6371,
"mean_token_accuracy": 0.5782681107521057,
"num_tokens": 709198.0,
"step": 366
},
{
"epoch": 0.46279949558638084,
"grad_norm": 3.8574609756469727,
"learning_rate": 6.371859296482412e-06,
"loss": 1.4247,
"mean_token_accuracy": 0.605307936668396,
"num_tokens": 711260.0,
"step": 367
},
{
"epoch": 0.4640605296343001,
"grad_norm": 4.1762518882751465,
"learning_rate": 6.361809045226132e-06,
"loss": 1.5521,
"mean_token_accuracy": 0.5752246379852295,
"num_tokens": 712956.0,
"step": 368
},
{
"epoch": 0.46532156368221944,
"grad_norm": 3.9887688159942627,
"learning_rate": 6.35175879396985e-06,
"loss": 1.4642,
"mean_token_accuracy": 0.612546294927597,
"num_tokens": 714675.0,
"step": 369
},
{
"epoch": 0.4665825977301387,
"grad_norm": 3.601942777633667,
"learning_rate": 6.341708542713568e-06,
"loss": 1.387,
"mean_token_accuracy": 0.6236040592193604,
"num_tokens": 716725.0,
"step": 370
},
{
"epoch": 0.467843631778058,
"grad_norm": 3.91477108001709,
"learning_rate": 6.331658291457286e-06,
"loss": 1.4851,
"mean_token_accuracy": 0.6054001152515411,
"num_tokens": 718449.0,
"step": 371
},
{
"epoch": 0.4691046658259773,
"grad_norm": 3.6185572147369385,
"learning_rate": 6.321608040201006e-06,
"loss": 1.3861,
"mean_token_accuracy": 0.6127656102180481,
"num_tokens": 720577.0,
"step": 372
},
{
"epoch": 0.4703656998738966,
"grad_norm": 3.6203348636627197,
"learning_rate": 6.311557788944724e-06,
"loss": 1.4493,
"mean_token_accuracy": 0.6147701144218445,
"num_tokens": 722444.0,
"step": 373
},
{
"epoch": 0.47162673392181587,
"grad_norm": 3.9971113204956055,
"learning_rate": 6.301507537688442e-06,
"loss": 1.5187,
"mean_token_accuracy": 0.5887904763221741,
"num_tokens": 724283.0,
"step": 374
},
{
"epoch": 0.4728877679697352,
"grad_norm": 3.7521986961364746,
"learning_rate": 6.291457286432161e-06,
"loss": 1.4663,
"mean_token_accuracy": 0.6025364696979523,
"num_tokens": 726415.0,
"step": 375
},
{
"epoch": 0.47414880201765447,
"grad_norm": 3.8074052333831787,
"learning_rate": 6.28140703517588e-06,
"loss": 1.5155,
"mean_token_accuracy": 0.5930048227310181,
"num_tokens": 728281.0,
"step": 376
},
{
"epoch": 0.47540983606557374,
"grad_norm": 4.051740646362305,
"learning_rate": 6.2713567839195985e-06,
"loss": 1.6541,
"mean_token_accuracy": 0.5947281718254089,
"num_tokens": 730266.0,
"step": 377
},
{
"epoch": 0.4766708701134931,
"grad_norm": 3.7151358127593994,
"learning_rate": 6.2613065326633175e-06,
"loss": 1.3374,
"mean_token_accuracy": 0.612007737159729,
"num_tokens": 732202.0,
"step": 378
},
{
"epoch": 0.47793190416141235,
"grad_norm": 4.122408390045166,
"learning_rate": 6.251256281407036e-06,
"loss": 1.4805,
"mean_token_accuracy": 0.5938221216201782,
"num_tokens": 734091.0,
"step": 379
},
{
"epoch": 0.4791929382093317,
"grad_norm": 3.908778667449951,
"learning_rate": 6.241206030150754e-06,
"loss": 1.5331,
"mean_token_accuracy": 0.5997506678104401,
"num_tokens": 736074.0,
"step": 380
},
{
"epoch": 0.48045397225725095,
"grad_norm": 4.155081748962402,
"learning_rate": 6.231155778894474e-06,
"loss": 1.5731,
"mean_token_accuracy": 0.5773967504501343,
"num_tokens": 737974.0,
"step": 381
},
{
"epoch": 0.4817150063051702,
"grad_norm": 3.9985005855560303,
"learning_rate": 6.221105527638192e-06,
"loss": 1.4398,
"mean_token_accuracy": 0.6126863658428192,
"num_tokens": 739838.0,
"step": 382
},
{
"epoch": 0.48297604035308955,
"grad_norm": 3.937734842300415,
"learning_rate": 6.21105527638191e-06,
"loss": 1.525,
"mean_token_accuracy": 0.6130150854587555,
"num_tokens": 741692.0,
"step": 383
},
{
"epoch": 0.4842370744010088,
"grad_norm": 3.8296337127685547,
"learning_rate": 6.201005025125628e-06,
"loss": 1.4805,
"mean_token_accuracy": 0.6192940175533295,
"num_tokens": 743636.0,
"step": 384
},
{
"epoch": 0.4854981084489281,
"grad_norm": 3.6199262142181396,
"learning_rate": 6.190954773869348e-06,
"loss": 1.3644,
"mean_token_accuracy": 0.6284566223621368,
"num_tokens": 745627.0,
"step": 385
},
{
"epoch": 0.48675914249684743,
"grad_norm": 3.931962490081787,
"learning_rate": 6.180904522613066e-06,
"loss": 1.6314,
"mean_token_accuracy": 0.575654000043869,
"num_tokens": 747592.0,
"step": 386
},
{
"epoch": 0.4880201765447667,
"grad_norm": 3.6779048442840576,
"learning_rate": 6.170854271356784e-06,
"loss": 1.4776,
"mean_token_accuracy": 0.594518393278122,
"num_tokens": 749615.0,
"step": 387
},
{
"epoch": 0.489281210592686,
"grad_norm": 3.832871198654175,
"learning_rate": 6.160804020100502e-06,
"loss": 1.4508,
"mean_token_accuracy": 0.6090913116931915,
"num_tokens": 751515.0,
"step": 388
},
{
"epoch": 0.4905422446406053,
"grad_norm": 3.9489598274230957,
"learning_rate": 6.150753768844222e-06,
"loss": 1.5567,
"mean_token_accuracy": 0.5768946707248688,
"num_tokens": 753381.0,
"step": 389
},
{
"epoch": 0.4918032786885246,
"grad_norm": 3.927081823348999,
"learning_rate": 6.14070351758794e-06,
"loss": 1.5296,
"mean_token_accuracy": 0.5951372683048248,
"num_tokens": 755255.0,
"step": 390
},
{
"epoch": 0.4930643127364439,
"grad_norm": 3.9887728691101074,
"learning_rate": 6.130653266331658e-06,
"loss": 1.5766,
"mean_token_accuracy": 0.587302178144455,
"num_tokens": 757058.0,
"step": 391
},
{
"epoch": 0.4943253467843632,
"grad_norm": 3.8660635948181152,
"learning_rate": 6.120603015075377e-06,
"loss": 1.3367,
"mean_token_accuracy": 0.6324070990085602,
"num_tokens": 758996.0,
"step": 392
},
{
"epoch": 0.49558638083228246,
"grad_norm": 4.087164878845215,
"learning_rate": 6.110552763819096e-06,
"loss": 1.5956,
"mean_token_accuracy": 0.5824276208877563,
"num_tokens": 760934.0,
"step": 393
},
{
"epoch": 0.4968474148802018,
"grad_norm": 3.9089858531951904,
"learning_rate": 6.1005025125628145e-06,
"loss": 1.406,
"mean_token_accuracy": 0.6238544881343842,
"num_tokens": 762847.0,
"step": 394
},
{
"epoch": 0.49810844892812106,
"grad_norm": 3.826852560043335,
"learning_rate": 6.0904522613065335e-06,
"loss": 1.3496,
"mean_token_accuracy": 0.621149480342865,
"num_tokens": 764885.0,
"step": 395
},
{
"epoch": 0.49936948297604034,
"grad_norm": 3.906343936920166,
"learning_rate": 6.080402010050252e-06,
"loss": 1.5116,
"mean_token_accuracy": 0.5996163189411163,
"num_tokens": 766879.0,
"step": 396
},
{
"epoch": 0.5006305170239597,
"grad_norm": 3.915363073348999,
"learning_rate": 6.070351758793971e-06,
"loss": 1.4962,
"mean_token_accuracy": 0.5924739837646484,
"num_tokens": 768855.0,
"step": 397
},
{
"epoch": 0.501891551071879,
"grad_norm": 3.8229563236236572,
"learning_rate": 6.06030150753769e-06,
"loss": 1.5012,
"mean_token_accuracy": 0.5963457524776459,
"num_tokens": 770683.0,
"step": 398
},
{
"epoch": 0.5031525851197982,
"grad_norm": 3.8112332820892334,
"learning_rate": 6.050251256281408e-06,
"loss": 1.4278,
"mean_token_accuracy": 0.6047469675540924,
"num_tokens": 772607.0,
"step": 399
},
{
"epoch": 0.5044136191677175,
"grad_norm": 3.7356104850769043,
"learning_rate": 6.040201005025126e-06,
"loss": 1.4171,
"mean_token_accuracy": 0.6060886383056641,
"num_tokens": 774528.0,
"step": 400
},
{
"epoch": 0.5056746532156369,
"grad_norm": 3.9361140727996826,
"learning_rate": 6.030150753768844e-06,
"loss": 1.5603,
"mean_token_accuracy": 0.5814925730228424,
"num_tokens": 776337.0,
"step": 401
},
{
"epoch": 0.5069356872635561,
"grad_norm": 3.743752956390381,
"learning_rate": 6.020100502512564e-06,
"loss": 1.4307,
"mean_token_accuracy": 0.6159599125385284,
"num_tokens": 778242.0,
"step": 402
},
{
"epoch": 0.5081967213114754,
"grad_norm": 3.4296321868896484,
"learning_rate": 6.010050251256282e-06,
"loss": 1.2756,
"mean_token_accuracy": 0.6444790065288544,
"num_tokens": 780168.0,
"step": 403
},
{
"epoch": 0.5094577553593947,
"grad_norm": 4.4306864738464355,
"learning_rate": 6e-06,
"loss": 1.6425,
"mean_token_accuracy": 0.5818255543708801,
"num_tokens": 781990.0,
"step": 404
},
{
"epoch": 0.510718789407314,
"grad_norm": 4.045053482055664,
"learning_rate": 5.989949748743718e-06,
"loss": 1.5048,
"mean_token_accuracy": 0.5999021828174591,
"num_tokens": 783882.0,
"step": 405
},
{
"epoch": 0.5119798234552333,
"grad_norm": 3.576313018798828,
"learning_rate": 5.979899497487438e-06,
"loss": 1.4715,
"mean_token_accuracy": 0.5854345858097076,
"num_tokens": 785888.0,
"step": 406
},
{
"epoch": 0.5132408575031526,
"grad_norm": 3.9152069091796875,
"learning_rate": 5.969849246231156e-06,
"loss": 1.421,
"mean_token_accuracy": 0.6086308360099792,
"num_tokens": 787821.0,
"step": 407
},
{
"epoch": 0.5145018915510718,
"grad_norm": 4.192541599273682,
"learning_rate": 5.959798994974874e-06,
"loss": 1.5941,
"mean_token_accuracy": 0.591251790523529,
"num_tokens": 789593.0,
"step": 408
},
{
"epoch": 0.5157629255989912,
"grad_norm": 3.80621337890625,
"learning_rate": 5.949748743718593e-06,
"loss": 1.5137,
"mean_token_accuracy": 0.5962553918361664,
"num_tokens": 791559.0,
"step": 409
},
{
"epoch": 0.5170239596469105,
"grad_norm": 3.8430941104888916,
"learning_rate": 5.939698492462312e-06,
"loss": 1.456,
"mean_token_accuracy": 0.6207826733589172,
"num_tokens": 793446.0,
"step": 410
},
{
"epoch": 0.5182849936948297,
"grad_norm": 3.751850128173828,
"learning_rate": 5.9296482412060305e-06,
"loss": 1.4189,
"mean_token_accuracy": 0.6290108561515808,
"num_tokens": 795278.0,
"step": 411
},
{
"epoch": 0.519546027742749,
"grad_norm": 4.246517658233643,
"learning_rate": 5.9195979899497495e-06,
"loss": 1.6446,
"mean_token_accuracy": 0.5721382200717926,
"num_tokens": 797144.0,
"step": 412
},
{
"epoch": 0.5208070617906684,
"grad_norm": 3.7303225994110107,
"learning_rate": 5.909547738693468e-06,
"loss": 1.397,
"mean_token_accuracy": 0.6230327188968658,
"num_tokens": 799007.0,
"step": 413
},
{
"epoch": 0.5220680958385876,
"grad_norm": 3.9755711555480957,
"learning_rate": 5.899497487437187e-06,
"loss": 1.5544,
"mean_token_accuracy": 0.5930447280406952,
"num_tokens": 800807.0,
"step": 414
},
{
"epoch": 0.5233291298865069,
"grad_norm": 4.000508785247803,
"learning_rate": 5.889447236180905e-06,
"loss": 1.6292,
"mean_token_accuracy": 0.5693787634372711,
"num_tokens": 802783.0,
"step": 415
},
{
"epoch": 0.5245901639344263,
"grad_norm": 4.1713337898254395,
"learning_rate": 5.879396984924624e-06,
"loss": 1.5307,
"mean_token_accuracy": 0.6000028848648071,
"num_tokens": 804596.0,
"step": 416
},
{
"epoch": 0.5258511979823455,
"grad_norm": 3.8140196800231934,
"learning_rate": 5.869346733668342e-06,
"loss": 1.4934,
"mean_token_accuracy": 0.6128242611885071,
"num_tokens": 806564.0,
"step": 417
},
{
"epoch": 0.5271122320302648,
"grad_norm": 3.724236488342285,
"learning_rate": 5.859296482412061e-06,
"loss": 1.4355,
"mean_token_accuracy": 0.6146958768367767,
"num_tokens": 808550.0,
"step": 418
},
{
"epoch": 0.5283732660781841,
"grad_norm": 4.29474401473999,
"learning_rate": 5.84924623115578e-06,
"loss": 1.5489,
"mean_token_accuracy": 0.5955554842948914,
"num_tokens": 810335.0,
"step": 419
},
{
"epoch": 0.5296343001261034,
"grad_norm": 3.720149040222168,
"learning_rate": 5.839195979899498e-06,
"loss": 1.3334,
"mean_token_accuracy": 0.6176763474941254,
"num_tokens": 812250.0,
"step": 420
},
{
"epoch": 0.5308953341740227,
"grad_norm": 4.0187201499938965,
"learning_rate": 5.829145728643216e-06,
"loss": 1.5864,
"mean_token_accuracy": 0.5855948626995087,
"num_tokens": 814160.0,
"step": 421
},
{
"epoch": 0.532156368221942,
"grad_norm": 3.415576219558716,
"learning_rate": 5.819095477386936e-06,
"loss": 1.3088,
"mean_token_accuracy": 0.6266660094261169,
"num_tokens": 816237.0,
"step": 422
},
{
"epoch": 0.5334174022698613,
"grad_norm": 3.7157742977142334,
"learning_rate": 5.809045226130654e-06,
"loss": 1.5874,
"mean_token_accuracy": 0.5831565260887146,
"num_tokens": 818263.0,
"step": 423
},
{
"epoch": 0.5346784363177806,
"grad_norm": 3.988062620162964,
"learning_rate": 5.798994974874372e-06,
"loss": 1.4411,
"mean_token_accuracy": 0.6083543598651886,
"num_tokens": 820136.0,
"step": 424
},
{
"epoch": 0.5359394703656999,
"grad_norm": 3.623262405395508,
"learning_rate": 5.78894472361809e-06,
"loss": 1.3701,
"mean_token_accuracy": 0.6176230311393738,
"num_tokens": 822213.0,
"step": 425
},
{
"epoch": 0.5372005044136192,
"grad_norm": 3.895170211791992,
"learning_rate": 5.778894472361809e-06,
"loss": 1.353,
"mean_token_accuracy": 0.6239413321018219,
"num_tokens": 824114.0,
"step": 426
},
{
"epoch": 0.5384615384615384,
"grad_norm": 3.889404296875,
"learning_rate": 5.768844221105528e-06,
"loss": 1.4688,
"mean_token_accuracy": 0.6074230074882507,
"num_tokens": 826150.0,
"step": 427
},
{
"epoch": 0.5397225725094578,
"grad_norm": 4.306681156158447,
"learning_rate": 5.7587939698492465e-06,
"loss": 1.5199,
"mean_token_accuracy": 0.5849421322345734,
"num_tokens": 827971.0,
"step": 428
},
{
"epoch": 0.5409836065573771,
"grad_norm": 4.184123516082764,
"learning_rate": 5.7487437185929655e-06,
"loss": 1.4457,
"mean_token_accuracy": 0.6024906635284424,
"num_tokens": 829886.0,
"step": 429
},
{
"epoch": 0.5422446406052963,
"grad_norm": 4.066215515136719,
"learning_rate": 5.738693467336684e-06,
"loss": 1.5743,
"mean_token_accuracy": 0.5810816884040833,
"num_tokens": 831780.0,
"step": 430
},
{
"epoch": 0.5435056746532156,
"grad_norm": 3.6609530448913574,
"learning_rate": 5.728643216080403e-06,
"loss": 1.3719,
"mean_token_accuracy": 0.615295022726059,
"num_tokens": 833782.0,
"step": 431
},
{
"epoch": 0.544766708701135,
"grad_norm": 3.7818527221679688,
"learning_rate": 5.718592964824121e-06,
"loss": 1.4128,
"mean_token_accuracy": 0.6150719523429871,
"num_tokens": 835728.0,
"step": 432
},
{
"epoch": 0.5460277427490542,
"grad_norm": 3.8660855293273926,
"learning_rate": 5.70854271356784e-06,
"loss": 1.4527,
"mean_token_accuracy": 0.6020141839981079,
"num_tokens": 837648.0,
"step": 433
},
{
"epoch": 0.5472887767969735,
"grad_norm": 3.8554327487945557,
"learning_rate": 5.698492462311558e-06,
"loss": 1.3445,
"mean_token_accuracy": 0.6199198365211487,
"num_tokens": 839518.0,
"step": 434
},
{
"epoch": 0.5485498108448928,
"grad_norm": 4.233033657073975,
"learning_rate": 5.688442211055277e-06,
"loss": 1.6077,
"mean_token_accuracy": 0.5790873765945435,
"num_tokens": 841311.0,
"step": 435
},
{
"epoch": 0.5498108448928121,
"grad_norm": 3.7273433208465576,
"learning_rate": 5.678391959798996e-06,
"loss": 1.3938,
"mean_token_accuracy": 0.6058596968650818,
"num_tokens": 843396.0,
"step": 436
},
{
"epoch": 0.5510718789407314,
"grad_norm": 4.387331008911133,
"learning_rate": 5.668341708542714e-06,
"loss": 1.5958,
"mean_token_accuracy": 0.5863657295703888,
"num_tokens": 845309.0,
"step": 437
},
{
"epoch": 0.5523329129886507,
"grad_norm": 3.9680373668670654,
"learning_rate": 5.658291457286432e-06,
"loss": 1.4952,
"mean_token_accuracy": 0.6114929616451263,
"num_tokens": 847210.0,
"step": 438
},
{
"epoch": 0.5535939470365699,
"grad_norm": 3.9168810844421387,
"learning_rate": 5.648241206030152e-06,
"loss": 1.4169,
"mean_token_accuracy": 0.6114987432956696,
"num_tokens": 849213.0,
"step": 439
},
{
"epoch": 0.5548549810844893,
"grad_norm": 4.25286340713501,
"learning_rate": 5.63819095477387e-06,
"loss": 1.6493,
"mean_token_accuracy": 0.570509523153305,
"num_tokens": 851092.0,
"step": 440
},
{
"epoch": 0.5561160151324086,
"grad_norm": 3.875514507293701,
"learning_rate": 5.628140703517588e-06,
"loss": 1.4016,
"mean_token_accuracy": 0.6159034967422485,
"num_tokens": 852991.0,
"step": 441
},
{
"epoch": 0.5573770491803278,
"grad_norm": 3.8609631061553955,
"learning_rate": 5.618090452261306e-06,
"loss": 1.3906,
"mean_token_accuracy": 0.6153773963451385,
"num_tokens": 855038.0,
"step": 442
},
{
"epoch": 0.5586380832282472,
"grad_norm": 3.846491575241089,
"learning_rate": 5.608040201005026e-06,
"loss": 1.4185,
"mean_token_accuracy": 0.6202840507030487,
"num_tokens": 857076.0,
"step": 443
},
{
"epoch": 0.5598991172761665,
"grad_norm": 4.081157207489014,
"learning_rate": 5.597989949748744e-06,
"loss": 1.5429,
"mean_token_accuracy": 0.5890442430973053,
"num_tokens": 859076.0,
"step": 444
},
{
"epoch": 0.5611601513240857,
"grad_norm": 3.8386566638946533,
"learning_rate": 5.5879396984924625e-06,
"loss": 1.4287,
"mean_token_accuracy": 0.6156394481658936,
"num_tokens": 861047.0,
"step": 445
},
{
"epoch": 0.562421185372005,
"grad_norm": 3.7738113403320312,
"learning_rate": 5.577889447236181e-06,
"loss": 1.4732,
"mean_token_accuracy": 0.5908475816249847,
"num_tokens": 863083.0,
"step": 446
},
{
"epoch": 0.5636822194199244,
"grad_norm": 3.8559048175811768,
"learning_rate": 5.5678391959799e-06,
"loss": 1.3873,
"mean_token_accuracy": 0.6156317889690399,
"num_tokens": 865068.0,
"step": 447
},
{
"epoch": 0.5649432534678437,
"grad_norm": 3.549217939376831,
"learning_rate": 5.557788944723619e-06,
"loss": 1.3209,
"mean_token_accuracy": 0.6351846158504486,
"num_tokens": 867085.0,
"step": 448
},
{
"epoch": 0.5662042875157629,
"grad_norm": 4.134774208068848,
"learning_rate": 5.547738693467337e-06,
"loss": 1.6058,
"mean_token_accuracy": 0.5775579214096069,
"num_tokens": 869059.0,
"step": 449
},
{
"epoch": 0.5674653215636822,
"grad_norm": 3.7273738384246826,
"learning_rate": 5.537688442211056e-06,
"loss": 1.4557,
"mean_token_accuracy": 0.6038585305213928,
"num_tokens": 871051.0,
"step": 450
},
{
"epoch": 0.5687263556116016,
"grad_norm": 4.377529621124268,
"learning_rate": 5.527638190954774e-06,
"loss": 1.4937,
"mean_token_accuracy": 0.5913034677505493,
"num_tokens": 872900.0,
"step": 451
},
{
"epoch": 0.5699873896595208,
"grad_norm": 4.095523834228516,
"learning_rate": 5.517587939698493e-06,
"loss": 1.5148,
"mean_token_accuracy": 0.6133293807506561,
"num_tokens": 874801.0,
"step": 452
},
{
"epoch": 0.5712484237074401,
"grad_norm": 3.8340799808502197,
"learning_rate": 5.507537688442212e-06,
"loss": 1.4924,
"mean_token_accuracy": 0.5960003435611725,
"num_tokens": 876646.0,
"step": 453
},
{
"epoch": 0.5725094577553594,
"grad_norm": 3.5290746688842773,
"learning_rate": 5.49748743718593e-06,
"loss": 1.326,
"mean_token_accuracy": 0.6205747723579407,
"num_tokens": 878739.0,
"step": 454
},
{
"epoch": 0.5737704918032787,
"grad_norm": 4.257903099060059,
"learning_rate": 5.487437185929648e-06,
"loss": 1.5482,
"mean_token_accuracy": 0.5878141522407532,
"num_tokens": 880465.0,
"step": 455
},
{
"epoch": 0.575031525851198,
"grad_norm": 3.8454947471618652,
"learning_rate": 5.477386934673368e-06,
"loss": 1.4434,
"mean_token_accuracy": 0.6128550171852112,
"num_tokens": 882500.0,
"step": 456
},
{
"epoch": 0.5762925598991173,
"grad_norm": 3.810124158859253,
"learning_rate": 5.467336683417086e-06,
"loss": 1.4584,
"mean_token_accuracy": 0.6083512902259827,
"num_tokens": 884588.0,
"step": 457
},
{
"epoch": 0.5775535939470365,
"grad_norm": 3.951253652572632,
"learning_rate": 5.457286432160804e-06,
"loss": 1.3561,
"mean_token_accuracy": 0.6184321939945221,
"num_tokens": 886527.0,
"step": 458
},
{
"epoch": 0.5788146279949559,
"grad_norm": 4.112410545349121,
"learning_rate": 5.447236180904522e-06,
"loss": 1.4046,
"mean_token_accuracy": 0.6035341024398804,
"num_tokens": 888317.0,
"step": 459
},
{
"epoch": 0.5800756620428752,
"grad_norm": 3.8871636390686035,
"learning_rate": 5.437185929648242e-06,
"loss": 1.4532,
"mean_token_accuracy": 0.614162027835846,
"num_tokens": 890191.0,
"step": 460
},
{
"epoch": 0.5813366960907944,
"grad_norm": 3.9813950061798096,
"learning_rate": 5.42713567839196e-06,
"loss": 1.4054,
"mean_token_accuracy": 0.613822340965271,
"num_tokens": 892069.0,
"step": 461
},
{
"epoch": 0.5825977301387137,
"grad_norm": 3.930206298828125,
"learning_rate": 5.4170854271356785e-06,
"loss": 1.5206,
"mean_token_accuracy": 0.5886333584785461,
"num_tokens": 894011.0,
"step": 462
},
{
"epoch": 0.5838587641866331,
"grad_norm": 3.8400161266326904,
"learning_rate": 5.407035175879397e-06,
"loss": 1.278,
"mean_token_accuracy": 0.6264286637306213,
"num_tokens": 895826.0,
"step": 463
},
{
"epoch": 0.5851197982345523,
"grad_norm": 4.199733734130859,
"learning_rate": 5.3969849246231165e-06,
"loss": 1.429,
"mean_token_accuracy": 0.612981379032135,
"num_tokens": 897788.0,
"step": 464
},
{
"epoch": 0.5863808322824716,
"grad_norm": 4.702413082122803,
"learning_rate": 5.386934673366835e-06,
"loss": 1.6482,
"mean_token_accuracy": 0.5548197329044342,
"num_tokens": 899611.0,
"step": 465
},
{
"epoch": 0.587641866330391,
"grad_norm": 4.299721717834473,
"learning_rate": 5.376884422110553e-06,
"loss": 1.5122,
"mean_token_accuracy": 0.591781735420227,
"num_tokens": 901499.0,
"step": 466
},
{
"epoch": 0.5889029003783102,
"grad_norm": 4.097691535949707,
"learning_rate": 5.366834170854272e-06,
"loss": 1.4837,
"mean_token_accuracy": 0.5834101438522339,
"num_tokens": 903365.0,
"step": 467
},
{
"epoch": 0.5901639344262295,
"grad_norm": 3.9655864238739014,
"learning_rate": 5.356783919597991e-06,
"loss": 1.3773,
"mean_token_accuracy": 0.6133775413036346,
"num_tokens": 905349.0,
"step": 468
},
{
"epoch": 0.5914249684741488,
"grad_norm": 3.8645100593566895,
"learning_rate": 5.346733668341709e-06,
"loss": 1.3445,
"mean_token_accuracy": 0.6294296383857727,
"num_tokens": 907283.0,
"step": 469
},
{
"epoch": 0.592686002522068,
"grad_norm": 3.720658540725708,
"learning_rate": 5.336683417085428e-06,
"loss": 1.3897,
"mean_token_accuracy": 0.610343724489212,
"num_tokens": 909322.0,
"step": 470
},
{
"epoch": 0.5939470365699874,
"grad_norm": 4.123734951019287,
"learning_rate": 5.326633165829146e-06,
"loss": 1.4428,
"mean_token_accuracy": 0.6127735376358032,
"num_tokens": 911283.0,
"step": 471
},
{
"epoch": 0.5952080706179067,
"grad_norm": 4.503159999847412,
"learning_rate": 5.316582914572864e-06,
"loss": 1.5428,
"mean_token_accuracy": 0.5835637450218201,
"num_tokens": 913205.0,
"step": 472
},
{
"epoch": 0.5964691046658259,
"grad_norm": 4.117824554443359,
"learning_rate": 5.306532663316584e-06,
"loss": 1.2964,
"mean_token_accuracy": 0.6239383518695831,
"num_tokens": 914962.0,
"step": 473
},
{
"epoch": 0.5977301387137453,
"grad_norm": 4.203347682952881,
"learning_rate": 5.296482412060302e-06,
"loss": 1.4524,
"mean_token_accuracy": 0.6028485894203186,
"num_tokens": 916915.0,
"step": 474
},
{
"epoch": 0.5989911727616646,
"grad_norm": 4.006253719329834,
"learning_rate": 5.28643216080402e-06,
"loss": 1.4569,
"mean_token_accuracy": 0.6102553606033325,
"num_tokens": 918862.0,
"step": 475
},
{
"epoch": 0.6002522068095839,
"grad_norm": 4.148035049438477,
"learning_rate": 5.2763819095477384e-06,
"loss": 1.5128,
"mean_token_accuracy": 0.5903106033802032,
"num_tokens": 920708.0,
"step": 476
},
{
"epoch": 0.6015132408575031,
"grad_norm": 4.400207042694092,
"learning_rate": 5.266331658291458e-06,
"loss": 1.5556,
"mean_token_accuracy": 0.5801825821399689,
"num_tokens": 922440.0,
"step": 477
},
{
"epoch": 0.6027742749054225,
"grad_norm": 3.911195993423462,
"learning_rate": 5.256281407035176e-06,
"loss": 1.3527,
"mean_token_accuracy": 0.6198071241378784,
"num_tokens": 924327.0,
"step": 478
},
{
"epoch": 0.6040353089533418,
"grad_norm": 4.530071258544922,
"learning_rate": 5.2462311557788945e-06,
"loss": 1.5821,
"mean_token_accuracy": 0.580609917640686,
"num_tokens": 926140.0,
"step": 479
},
{
"epoch": 0.605296343001261,
"grad_norm": 3.8630685806274414,
"learning_rate": 5.236180904522613e-06,
"loss": 1.3839,
"mean_token_accuracy": 0.6292921006679535,
"num_tokens": 928105.0,
"step": 480
},
{
"epoch": 0.6065573770491803,
"grad_norm": 3.9717369079589844,
"learning_rate": 5.2261306532663325e-06,
"loss": 1.423,
"mean_token_accuracy": 0.6069486737251282,
"num_tokens": 930094.0,
"step": 481
},
{
"epoch": 0.6078184110970997,
"grad_norm": 3.7706334590911865,
"learning_rate": 5.216080402010051e-06,
"loss": 1.4392,
"mean_token_accuracy": 0.6127463579177856,
"num_tokens": 932075.0,
"step": 482
},
{
"epoch": 0.6090794451450189,
"grad_norm": 4.312561511993408,
"learning_rate": 5.206030150753769e-06,
"loss": 1.5057,
"mean_token_accuracy": 0.5967552363872528,
"num_tokens": 933860.0,
"step": 483
},
{
"epoch": 0.6103404791929382,
"grad_norm": 4.139890193939209,
"learning_rate": 5.195979899497488e-06,
"loss": 1.425,
"mean_token_accuracy": 0.602910578250885,
"num_tokens": 935729.0,
"step": 484
},
{
"epoch": 0.6116015132408575,
"grad_norm": 4.032483100891113,
"learning_rate": 5.185929648241207e-06,
"loss": 1.4033,
"mean_token_accuracy": 0.5998988151550293,
"num_tokens": 937684.0,
"step": 485
},
{
"epoch": 0.6128625472887768,
"grad_norm": 4.146860122680664,
"learning_rate": 5.175879396984925e-06,
"loss": 1.4753,
"mean_token_accuracy": 0.6013561189174652,
"num_tokens": 939604.0,
"step": 486
},
{
"epoch": 0.6141235813366961,
"grad_norm": 4.384416580200195,
"learning_rate": 5.165829145728644e-06,
"loss": 1.3525,
"mean_token_accuracy": 0.6071769595146179,
"num_tokens": 941669.0,
"step": 487
},
{
"epoch": 0.6153846153846154,
"grad_norm": 4.441947937011719,
"learning_rate": 5.155778894472362e-06,
"loss": 1.4166,
"mean_token_accuracy": 0.6223839223384857,
"num_tokens": 943543.0,
"step": 488
},
{
"epoch": 0.6166456494325346,
"grad_norm": 4.2887115478515625,
"learning_rate": 5.145728643216081e-06,
"loss": 1.3786,
"mean_token_accuracy": 0.6249210238456726,
"num_tokens": 945387.0,
"step": 489
},
{
"epoch": 0.617906683480454,
"grad_norm": 4.079659461975098,
"learning_rate": 5.1356783919598e-06,
"loss": 1.342,
"mean_token_accuracy": 0.6278223991394043,
"num_tokens": 947294.0,
"step": 490
},
{
"epoch": 0.6191677175283733,
"grad_norm": 4.367201328277588,
"learning_rate": 5.125628140703518e-06,
"loss": 1.6105,
"mean_token_accuracy": 0.5611352920532227,
"num_tokens": 949245.0,
"step": 491
},
{
"epoch": 0.6204287515762925,
"grad_norm": 4.136919021606445,
"learning_rate": 5.115577889447236e-06,
"loss": 1.5049,
"mean_token_accuracy": 0.6014758944511414,
"num_tokens": 951077.0,
"step": 492
},
{
"epoch": 0.6216897856242118,
"grad_norm": 4.354308605194092,
"learning_rate": 5.1055276381909544e-06,
"loss": 1.3732,
"mean_token_accuracy": 0.6369976699352264,
"num_tokens": 952980.0,
"step": 493
},
{
"epoch": 0.6229508196721312,
"grad_norm": 4.78447961807251,
"learning_rate": 5.095477386934674e-06,
"loss": 1.5297,
"mean_token_accuracy": 0.5905880928039551,
"num_tokens": 954830.0,
"step": 494
},
{
"epoch": 0.6242118537200504,
"grad_norm": 4.451809883117676,
"learning_rate": 5.085427135678392e-06,
"loss": 1.51,
"mean_token_accuracy": 0.5860687792301178,
"num_tokens": 956642.0,
"step": 495
},
{
"epoch": 0.6254728877679697,
"grad_norm": 4.794120788574219,
"learning_rate": 5.0753768844221105e-06,
"loss": 1.5092,
"mean_token_accuracy": 0.6137255132198334,
"num_tokens": 958342.0,
"step": 496
},
{
"epoch": 0.626733921815889,
"grad_norm": 4.356624603271484,
"learning_rate": 5.065326633165829e-06,
"loss": 1.4437,
"mean_token_accuracy": 0.5994245707988739,
"num_tokens": 960176.0,
"step": 497
},
{
"epoch": 0.6279949558638083,
"grad_norm": 4.708098888397217,
"learning_rate": 5.0552763819095485e-06,
"loss": 1.5365,
"mean_token_accuracy": 0.6074124574661255,
"num_tokens": 962049.0,
"step": 498
},
{
"epoch": 0.6292559899117276,
"grad_norm": 4.1611785888671875,
"learning_rate": 5.045226130653267e-06,
"loss": 1.5711,
"mean_token_accuracy": 0.5907929241657257,
"num_tokens": 964044.0,
"step": 499
},
{
"epoch": 0.6305170239596469,
"grad_norm": 4.215281963348389,
"learning_rate": 5.035175879396985e-06,
"loss": 1.359,
"mean_token_accuracy": 0.6179487109184265,
"num_tokens": 966015.0,
"step": 500
},
{
"epoch": 0.6317780580075663,
"grad_norm": 4.392022132873535,
"learning_rate": 5.025125628140704e-06,
"loss": 1.5106,
"mean_token_accuracy": 0.6017323136329651,
"num_tokens": 967891.0,
"step": 501
},
{
"epoch": 0.6330390920554855,
"grad_norm": 3.9430975914001465,
"learning_rate": 5.015075376884423e-06,
"loss": 1.3659,
"mean_token_accuracy": 0.622453361749649,
"num_tokens": 969888.0,
"step": 502
},
{
"epoch": 0.6343001261034048,
"grad_norm": 3.892613172531128,
"learning_rate": 5.005025125628141e-06,
"loss": 1.4432,
"mean_token_accuracy": 0.6056107580661774,
"num_tokens": 972007.0,
"step": 503
},
{
"epoch": 0.6355611601513241,
"grad_norm": 4.147119522094727,
"learning_rate": 4.99497487437186e-06,
"loss": 1.3801,
"mean_token_accuracy": 0.6145619451999664,
"num_tokens": 974059.0,
"step": 504
},
{
"epoch": 0.6368221941992434,
"grad_norm": 3.900256872177124,
"learning_rate": 4.984924623115578e-06,
"loss": 1.5498,
"mean_token_accuracy": 0.6014587879180908,
"num_tokens": 976086.0,
"step": 505
},
{
"epoch": 0.6380832282471627,
"grad_norm": 4.286009788513184,
"learning_rate": 4.974874371859297e-06,
"loss": 1.4711,
"mean_token_accuracy": 0.5997771620750427,
"num_tokens": 978000.0,
"step": 506
},
{
"epoch": 0.639344262295082,
"grad_norm": 4.044436931610107,
"learning_rate": 4.964824120603016e-06,
"loss": 1.3599,
"mean_token_accuracy": 0.6190573871135712,
"num_tokens": 980122.0,
"step": 507
},
{
"epoch": 0.6406052963430012,
"grad_norm": 3.8622653484344482,
"learning_rate": 4.954773869346734e-06,
"loss": 1.299,
"mean_token_accuracy": 0.6397116184234619,
"num_tokens": 982107.0,
"step": 508
},
{
"epoch": 0.6418663303909206,
"grad_norm": 4.036622524261475,
"learning_rate": 4.944723618090453e-06,
"loss": 1.3907,
"mean_token_accuracy": 0.5995530188083649,
"num_tokens": 984115.0,
"step": 509
},
{
"epoch": 0.6431273644388399,
"grad_norm": 4.12971830368042,
"learning_rate": 4.934673366834171e-06,
"loss": 1.49,
"mean_token_accuracy": 0.6049287021160126,
"num_tokens": 986018.0,
"step": 510
},
{
"epoch": 0.6443883984867591,
"grad_norm": 4.413822650909424,
"learning_rate": 4.92462311557789e-06,
"loss": 1.5757,
"mean_token_accuracy": 0.5891340374946594,
"num_tokens": 988040.0,
"step": 511
},
{
"epoch": 0.6456494325346784,
"grad_norm": 4.282911777496338,
"learning_rate": 4.914572864321608e-06,
"loss": 1.5484,
"mean_token_accuracy": 0.5913865864276886,
"num_tokens": 990049.0,
"step": 512
},
{
"epoch": 0.6469104665825978,
"grad_norm": 4.138622283935547,
"learning_rate": 4.904522613065327e-06,
"loss": 1.3762,
"mean_token_accuracy": 0.6090503036975861,
"num_tokens": 991939.0,
"step": 513
},
{
"epoch": 0.648171500630517,
"grad_norm": 4.5050435066223145,
"learning_rate": 4.8944723618090455e-06,
"loss": 1.4768,
"mean_token_accuracy": 0.5987947881221771,
"num_tokens": 993670.0,
"step": 514
},
{
"epoch": 0.6494325346784363,
"grad_norm": 4.05117130279541,
"learning_rate": 4.8844221105527645e-06,
"loss": 1.4625,
"mean_token_accuracy": 0.6028375625610352,
"num_tokens": 995631.0,
"step": 515
},
{
"epoch": 0.6506935687263556,
"grad_norm": 4.307063102722168,
"learning_rate": 4.874371859296483e-06,
"loss": 1.3849,
"mean_token_accuracy": 0.6143765449523926,
"num_tokens": 997576.0,
"step": 516
},
{
"epoch": 0.6519546027742749,
"grad_norm": 4.198483467102051,
"learning_rate": 4.864321608040201e-06,
"loss": 1.459,
"mean_token_accuracy": 0.591049462556839,
"num_tokens": 999431.0,
"step": 517
},
{
"epoch": 0.6532156368221942,
"grad_norm": 4.2448835372924805,
"learning_rate": 4.85427135678392e-06,
"loss": 1.5409,
"mean_token_accuracy": 0.5864794254302979,
"num_tokens": 1001402.0,
"step": 518
},
{
"epoch": 0.6544766708701135,
"grad_norm": 3.9349770545959473,
"learning_rate": 4.844221105527638e-06,
"loss": 1.3259,
"mean_token_accuracy": 0.6447261869907379,
"num_tokens": 1003371.0,
"step": 519
},
{
"epoch": 0.6557377049180327,
"grad_norm": 4.384145259857178,
"learning_rate": 4.834170854271357e-06,
"loss": 1.4976,
"mean_token_accuracy": 0.60174959897995,
"num_tokens": 1005205.0,
"step": 520
},
{
"epoch": 0.6569987389659521,
"grad_norm": 4.253158092498779,
"learning_rate": 4.824120603015076e-06,
"loss": 1.3808,
"mean_token_accuracy": 0.6132340133190155,
"num_tokens": 1007120.0,
"step": 521
},
{
"epoch": 0.6582597730138714,
"grad_norm": 4.16796875,
"learning_rate": 4.814070351758794e-06,
"loss": 1.5749,
"mean_token_accuracy": 0.6019249558448792,
"num_tokens": 1009045.0,
"step": 522
},
{
"epoch": 0.6595208070617906,
"grad_norm": 4.176485538482666,
"learning_rate": 4.804020100502513e-06,
"loss": 1.489,
"mean_token_accuracy": 0.6130034625530243,
"num_tokens": 1010978.0,
"step": 523
},
{
"epoch": 0.6607818411097099,
"grad_norm": 4.173998832702637,
"learning_rate": 4.793969849246232e-06,
"loss": 1.4154,
"mean_token_accuracy": 0.6257044970989227,
"num_tokens": 1012941.0,
"step": 524
},
{
"epoch": 0.6620428751576293,
"grad_norm": 4.313329219818115,
"learning_rate": 4.78391959798995e-06,
"loss": 1.4384,
"mean_token_accuracy": 0.5984161496162415,
"num_tokens": 1014844.0,
"step": 525
},
{
"epoch": 0.6633039092055486,
"grad_norm": 4.119398593902588,
"learning_rate": 4.773869346733669e-06,
"loss": 1.4806,
"mean_token_accuracy": 0.6089716553688049,
"num_tokens": 1016768.0,
"step": 526
},
{
"epoch": 0.6645649432534678,
"grad_norm": 3.8196797370910645,
"learning_rate": 4.763819095477387e-06,
"loss": 1.4531,
"mean_token_accuracy": 0.6100893914699554,
"num_tokens": 1018940.0,
"step": 527
},
{
"epoch": 0.6658259773013872,
"grad_norm": 3.7822678089141846,
"learning_rate": 4.753768844221106e-06,
"loss": 1.3299,
"mean_token_accuracy": 0.6366003751754761,
"num_tokens": 1021088.0,
"step": 528
},
{
"epoch": 0.6670870113493065,
"grad_norm": 4.061533451080322,
"learning_rate": 4.743718592964824e-06,
"loss": 1.4227,
"mean_token_accuracy": 0.6176785826683044,
"num_tokens": 1023129.0,
"step": 529
},
{
"epoch": 0.6683480453972257,
"grad_norm": 4.252097129821777,
"learning_rate": 4.733668341708543e-06,
"loss": 1.3714,
"mean_token_accuracy": 0.6185733079910278,
"num_tokens": 1025194.0,
"step": 530
},
{
"epoch": 0.669609079445145,
"grad_norm": 3.9393389225006104,
"learning_rate": 4.7236180904522615e-06,
"loss": 1.4285,
"mean_token_accuracy": 0.6074325740337372,
"num_tokens": 1027265.0,
"step": 531
},
{
"epoch": 0.6708701134930644,
"grad_norm": 4.501029968261719,
"learning_rate": 4.7135678391959805e-06,
"loss": 1.5284,
"mean_token_accuracy": 0.5852837264537811,
"num_tokens": 1029237.0,
"step": 532
},
{
"epoch": 0.6721311475409836,
"grad_norm": 4.294065475463867,
"learning_rate": 4.703517587939699e-06,
"loss": 1.3974,
"mean_token_accuracy": 0.602262407541275,
"num_tokens": 1031116.0,
"step": 533
},
{
"epoch": 0.6733921815889029,
"grad_norm": 4.042904376983643,
"learning_rate": 4.693467336683418e-06,
"loss": 1.3956,
"mean_token_accuracy": 0.6112872660160065,
"num_tokens": 1033181.0,
"step": 534
},
{
"epoch": 0.6746532156368222,
"grad_norm": 4.032866954803467,
"learning_rate": 4.683417085427136e-06,
"loss": 1.4685,
"mean_token_accuracy": 0.6069096922874451,
"num_tokens": 1035145.0,
"step": 535
},
{
"epoch": 0.6759142496847415,
"grad_norm": 4.177650451660156,
"learning_rate": 4.673366834170855e-06,
"loss": 1.3029,
"mean_token_accuracy": 0.6414700150489807,
"num_tokens": 1037178.0,
"step": 536
},
{
"epoch": 0.6771752837326608,
"grad_norm": 4.131723880767822,
"learning_rate": 4.663316582914573e-06,
"loss": 1.4881,
"mean_token_accuracy": 0.5847931206226349,
"num_tokens": 1039259.0,
"step": 537
},
{
"epoch": 0.6784363177805801,
"grad_norm": 4.277894496917725,
"learning_rate": 4.653266331658292e-06,
"loss": 1.4666,
"mean_token_accuracy": 0.5960581600666046,
"num_tokens": 1041108.0,
"step": 538
},
{
"epoch": 0.6796973518284993,
"grad_norm": 4.391327857971191,
"learning_rate": 4.64321608040201e-06,
"loss": 1.4966,
"mean_token_accuracy": 0.5966735780239105,
"num_tokens": 1043111.0,
"step": 539
},
{
"epoch": 0.6809583858764187,
"grad_norm": 4.3835673332214355,
"learning_rate": 4.633165829145729e-06,
"loss": 1.4816,
"mean_token_accuracy": 0.5919269919395447,
"num_tokens": 1045039.0,
"step": 540
},
{
"epoch": 0.682219419924338,
"grad_norm": 4.380410671234131,
"learning_rate": 4.623115577889448e-06,
"loss": 1.4864,
"mean_token_accuracy": 0.5843121707439423,
"num_tokens": 1047056.0,
"step": 541
},
{
"epoch": 0.6834804539722572,
"grad_norm": 4.51450252532959,
"learning_rate": 4.613065326633166e-06,
"loss": 1.5087,
"mean_token_accuracy": 0.5990866422653198,
"num_tokens": 1049015.0,
"step": 542
},
{
"epoch": 0.6847414880201765,
"grad_norm": 4.474623203277588,
"learning_rate": 4.603015075376885e-06,
"loss": 1.5369,
"mean_token_accuracy": 0.5826748311519623,
"num_tokens": 1051016.0,
"step": 543
},
{
"epoch": 0.6860025220680959,
"grad_norm": 4.619833946228027,
"learning_rate": 4.592964824120603e-06,
"loss": 1.5901,
"mean_token_accuracy": 0.5855450630187988,
"num_tokens": 1052884.0,
"step": 544
},
{
"epoch": 0.6872635561160151,
"grad_norm": 4.460485935211182,
"learning_rate": 4.582914572864322e-06,
"loss": 1.476,
"mean_token_accuracy": 0.5999694764614105,
"num_tokens": 1054726.0,
"step": 545
},
{
"epoch": 0.6885245901639344,
"grad_norm": 4.181125640869141,
"learning_rate": 4.57286432160804e-06,
"loss": 1.365,
"mean_token_accuracy": 0.6187720596790314,
"num_tokens": 1056663.0,
"step": 546
},
{
"epoch": 0.6897856242118537,
"grad_norm": 4.0265583992004395,
"learning_rate": 4.562814070351759e-06,
"loss": 1.4209,
"mean_token_accuracy": 0.6083628833293915,
"num_tokens": 1058657.0,
"step": 547
},
{
"epoch": 0.691046658259773,
"grad_norm": 4.170673847198486,
"learning_rate": 4.5527638190954775e-06,
"loss": 1.3842,
"mean_token_accuracy": 0.6233990788459778,
"num_tokens": 1060720.0,
"step": 548
},
{
"epoch": 0.6923076923076923,
"grad_norm": 4.298334121704102,
"learning_rate": 4.5427135678391965e-06,
"loss": 1.5274,
"mean_token_accuracy": 0.5755627751350403,
"num_tokens": 1062706.0,
"step": 549
},
{
"epoch": 0.6935687263556116,
"grad_norm": 4.409641742706299,
"learning_rate": 4.532663316582915e-06,
"loss": 1.4234,
"mean_token_accuracy": 0.6005873680114746,
"num_tokens": 1064567.0,
"step": 550
},
{
"epoch": 0.694829760403531,
"grad_norm": 4.0575175285339355,
"learning_rate": 4.522613065326634e-06,
"loss": 1.47,
"mean_token_accuracy": 0.6192085146903992,
"num_tokens": 1066620.0,
"step": 551
},
{
"epoch": 0.6960907944514502,
"grad_norm": 4.188862323760986,
"learning_rate": 4.512562814070352e-06,
"loss": 1.4006,
"mean_token_accuracy": 0.609145849943161,
"num_tokens": 1068663.0,
"step": 552
},
{
"epoch": 0.6973518284993695,
"grad_norm": 4.311684608459473,
"learning_rate": 4.502512562814071e-06,
"loss": 1.4287,
"mean_token_accuracy": 0.5967524945735931,
"num_tokens": 1070500.0,
"step": 553
},
{
"epoch": 0.6986128625472888,
"grad_norm": 4.478208065032959,
"learning_rate": 4.492462311557789e-06,
"loss": 1.5679,
"mean_token_accuracy": 0.5677984654903412,
"num_tokens": 1072399.0,
"step": 554
},
{
"epoch": 0.699873896595208,
"grad_norm": 4.476880073547363,
"learning_rate": 4.482412060301508e-06,
"loss": 1.5745,
"mean_token_accuracy": 0.5820361375808716,
"num_tokens": 1074229.0,
"step": 555
},
{
"epoch": 0.7011349306431274,
"grad_norm": 4.118135929107666,
"learning_rate": 4.472361809045226e-06,
"loss": 1.3818,
"mean_token_accuracy": 0.6199574172496796,
"num_tokens": 1076179.0,
"step": 556
},
{
"epoch": 0.7023959646910467,
"grad_norm": 4.258944988250732,
"learning_rate": 4.462311557788945e-06,
"loss": 1.4556,
"mean_token_accuracy": 0.6048978269100189,
"num_tokens": 1078196.0,
"step": 557
},
{
"epoch": 0.7036569987389659,
"grad_norm": 4.3716206550598145,
"learning_rate": 4.452261306532664e-06,
"loss": 1.4638,
"mean_token_accuracy": 0.6035429835319519,
"num_tokens": 1080054.0,
"step": 558
},
{
"epoch": 0.7049180327868853,
"grad_norm": 4.203842639923096,
"learning_rate": 4.442211055276382e-06,
"loss": 1.519,
"mean_token_accuracy": 0.5738790929317474,
"num_tokens": 1082139.0,
"step": 559
},
{
"epoch": 0.7061790668348046,
"grad_norm": 3.9735658168792725,
"learning_rate": 4.432160804020101e-06,
"loss": 1.2718,
"mean_token_accuracy": 0.6296245157718658,
"num_tokens": 1084184.0,
"step": 560
},
{
"epoch": 0.7074401008827238,
"grad_norm": 4.601116180419922,
"learning_rate": 4.42211055276382e-06,
"loss": 1.4758,
"mean_token_accuracy": 0.5895026922225952,
"num_tokens": 1086114.0,
"step": 561
},
{
"epoch": 0.7087011349306431,
"grad_norm": 4.092045307159424,
"learning_rate": 4.412060301507538e-06,
"loss": 1.4217,
"mean_token_accuracy": 0.6099154055118561,
"num_tokens": 1088186.0,
"step": 562
},
{
"epoch": 0.7099621689785625,
"grad_norm": 4.286192417144775,
"learning_rate": 4.4020100502512564e-06,
"loss": 1.5134,
"mean_token_accuracy": 0.5960166454315186,
"num_tokens": 1090066.0,
"step": 563
},
{
"epoch": 0.7112232030264817,
"grad_norm": 4.366318225860596,
"learning_rate": 4.391959798994975e-06,
"loss": 1.3797,
"mean_token_accuracy": 0.6279633641242981,
"num_tokens": 1091943.0,
"step": 564
},
{
"epoch": 0.712484237074401,
"grad_norm": 4.332135200500488,
"learning_rate": 4.3819095477386936e-06,
"loss": 1.4498,
"mean_token_accuracy": 0.6036017537117004,
"num_tokens": 1093927.0,
"step": 565
},
{
"epoch": 0.7137452711223203,
"grad_norm": 4.145910739898682,
"learning_rate": 4.3718592964824125e-06,
"loss": 1.467,
"mean_token_accuracy": 0.5955266952514648,
"num_tokens": 1095943.0,
"step": 566
},
{
"epoch": 0.7150063051702396,
"grad_norm": 4.207005500793457,
"learning_rate": 4.361809045226131e-06,
"loss": 1.4077,
"mean_token_accuracy": 0.6239123344421387,
"num_tokens": 1097960.0,
"step": 567
},
{
"epoch": 0.7162673392181589,
"grad_norm": 4.264434337615967,
"learning_rate": 4.35175879396985e-06,
"loss": 1.5408,
"mean_token_accuracy": 0.5840959250926971,
"num_tokens": 1100027.0,
"step": 568
},
{
"epoch": 0.7175283732660782,
"grad_norm": 4.43059778213501,
"learning_rate": 4.341708542713568e-06,
"loss": 1.417,
"mean_token_accuracy": 0.6145318448543549,
"num_tokens": 1101888.0,
"step": 569
},
{
"epoch": 0.7187894073139974,
"grad_norm": 4.425664901733398,
"learning_rate": 4.331658291457287e-06,
"loss": 1.4231,
"mean_token_accuracy": 0.6067825257778168,
"num_tokens": 1103765.0,
"step": 570
},
{
"epoch": 0.7200504413619168,
"grad_norm": 4.427340507507324,
"learning_rate": 4.321608040201005e-06,
"loss": 1.4333,
"mean_token_accuracy": 0.6241535246372223,
"num_tokens": 1105637.0,
"step": 571
},
{
"epoch": 0.7213114754098361,
"grad_norm": 4.249168872833252,
"learning_rate": 4.311557788944724e-06,
"loss": 1.334,
"mean_token_accuracy": 0.6373594701290131,
"num_tokens": 1107675.0,
"step": 572
},
{
"epoch": 0.7225725094577553,
"grad_norm": 4.880923271179199,
"learning_rate": 4.301507537688442e-06,
"loss": 1.6842,
"mean_token_accuracy": 0.5585279166698456,
"num_tokens": 1109568.0,
"step": 573
},
{
"epoch": 0.7238335435056746,
"grad_norm": 4.138409614562988,
"learning_rate": 4.291457286432161e-06,
"loss": 1.4518,
"mean_token_accuracy": 0.6146091818809509,
"num_tokens": 1111491.0,
"step": 574
},
{
"epoch": 0.725094577553594,
"grad_norm": 4.344665050506592,
"learning_rate": 4.28140703517588e-06,
"loss": 1.4003,
"mean_token_accuracy": 0.6278960406780243,
"num_tokens": 1113417.0,
"step": 575
},
{
"epoch": 0.7263556116015133,
"grad_norm": 4.1242146492004395,
"learning_rate": 4.271356783919598e-06,
"loss": 1.3366,
"mean_token_accuracy": 0.6168392598628998,
"num_tokens": 1115469.0,
"step": 576
},
{
"epoch": 0.7276166456494325,
"grad_norm": 4.681807041168213,
"learning_rate": 4.261306532663317e-06,
"loss": 1.4971,
"mean_token_accuracy": 0.5873031616210938,
"num_tokens": 1117326.0,
"step": 577
},
{
"epoch": 0.7288776796973518,
"grad_norm": 4.148594379425049,
"learning_rate": 4.251256281407035e-06,
"loss": 1.4957,
"mean_token_accuracy": 0.6133466064929962,
"num_tokens": 1119353.0,
"step": 578
},
{
"epoch": 0.7301387137452712,
"grad_norm": 4.205901622772217,
"learning_rate": 4.241206030150754e-06,
"loss": 1.3541,
"mean_token_accuracy": 0.6136380732059479,
"num_tokens": 1121372.0,
"step": 579
},
{
"epoch": 0.7313997477931904,
"grad_norm": 4.561435699462891,
"learning_rate": 4.231155778894473e-06,
"loss": 1.5047,
"mean_token_accuracy": 0.5941650867462158,
"num_tokens": 1123316.0,
"step": 580
},
{
"epoch": 0.7326607818411097,
"grad_norm": 4.207332611083984,
"learning_rate": 4.221105527638191e-06,
"loss": 1.4151,
"mean_token_accuracy": 0.6092670261859894,
"num_tokens": 1125322.0,
"step": 581
},
{
"epoch": 0.733921815889029,
"grad_norm": 4.15023946762085,
"learning_rate": 4.21105527638191e-06,
"loss": 1.3431,
"mean_token_accuracy": 0.6442605555057526,
"num_tokens": 1127284.0,
"step": 582
},
{
"epoch": 0.7351828499369483,
"grad_norm": 4.073770046234131,
"learning_rate": 4.2010050251256285e-06,
"loss": 1.4278,
"mean_token_accuracy": 0.6028145551681519,
"num_tokens": 1129336.0,
"step": 583
},
{
"epoch": 0.7364438839848676,
"grad_norm": 4.155696392059326,
"learning_rate": 4.1909547738693475e-06,
"loss": 1.5242,
"mean_token_accuracy": 0.6014257669448853,
"num_tokens": 1131229.0,
"step": 584
},
{
"epoch": 0.7377049180327869,
"grad_norm": 4.599628925323486,
"learning_rate": 4.180904522613066e-06,
"loss": 1.4689,
"mean_token_accuracy": 0.6075104475021362,
"num_tokens": 1133138.0,
"step": 585
},
{
"epoch": 0.7389659520807061,
"grad_norm": 4.1771345138549805,
"learning_rate": 4.170854271356784e-06,
"loss": 1.4611,
"mean_token_accuracy": 0.6091577410697937,
"num_tokens": 1135144.0,
"step": 586
},
{
"epoch": 0.7402269861286255,
"grad_norm": 4.629430294036865,
"learning_rate": 4.160804020100503e-06,
"loss": 1.4212,
"mean_token_accuracy": 0.6028386056423187,
"num_tokens": 1136946.0,
"step": 587
},
{
"epoch": 0.7414880201765448,
"grad_norm": 4.687736988067627,
"learning_rate": 4.150753768844221e-06,
"loss": 1.6235,
"mean_token_accuracy": 0.5800973176956177,
"num_tokens": 1138786.0,
"step": 588
},
{
"epoch": 0.742749054224464,
"grad_norm": 4.503421783447266,
"learning_rate": 4.14070351758794e-06,
"loss": 1.3945,
"mean_token_accuracy": 0.602990061044693,
"num_tokens": 1140532.0,
"step": 589
},
{
"epoch": 0.7440100882723834,
"grad_norm": 4.563640117645264,
"learning_rate": 4.130653266331658e-06,
"loss": 1.4467,
"mean_token_accuracy": 0.595698207616806,
"num_tokens": 1142407.0,
"step": 590
},
{
"epoch": 0.7452711223203027,
"grad_norm": 4.475567817687988,
"learning_rate": 4.120603015075377e-06,
"loss": 1.4367,
"mean_token_accuracy": 0.6017897129058838,
"num_tokens": 1144273.0,
"step": 591
},
{
"epoch": 0.7465321563682219,
"grad_norm": 4.657721519470215,
"learning_rate": 4.110552763819096e-06,
"loss": 1.5447,
"mean_token_accuracy": 0.5949556827545166,
"num_tokens": 1146103.0,
"step": 592
},
{
"epoch": 0.7477931904161412,
"grad_norm": 4.46821403503418,
"learning_rate": 4.100502512562814e-06,
"loss": 1.5193,
"mean_token_accuracy": 0.5897854566574097,
"num_tokens": 1148055.0,
"step": 593
},
{
"epoch": 0.7490542244640606,
"grad_norm": 4.4468536376953125,
"learning_rate": 4.090452261306533e-06,
"loss": 1.5036,
"mean_token_accuracy": 0.594214141368866,
"num_tokens": 1149916.0,
"step": 594
},
{
"epoch": 0.7503152585119798,
"grad_norm": 4.743985652923584,
"learning_rate": 4.080402010050251e-06,
"loss": 1.3838,
"mean_token_accuracy": 0.6155570149421692,
"num_tokens": 1151952.0,
"step": 595
},
{
"epoch": 0.7515762925598991,
"grad_norm": 4.6654462814331055,
"learning_rate": 4.07035175879397e-06,
"loss": 1.4874,
"mean_token_accuracy": 0.5827921628952026,
"num_tokens": 1153786.0,
"step": 596
},
{
"epoch": 0.7528373266078184,
"grad_norm": 4.544785022735596,
"learning_rate": 4.060301507537689e-06,
"loss": 1.4062,
"mean_token_accuracy": 0.6182176172733307,
"num_tokens": 1155728.0,
"step": 597
},
{
"epoch": 0.7540983606557377,
"grad_norm": 4.02006196975708,
"learning_rate": 4.0502512562814074e-06,
"loss": 1.3816,
"mean_token_accuracy": 0.6215299069881439,
"num_tokens": 1157784.0,
"step": 598
},
{
"epoch": 0.755359394703657,
"grad_norm": 4.205069065093994,
"learning_rate": 4.040201005025126e-06,
"loss": 1.3984,
"mean_token_accuracy": 0.6212480962276459,
"num_tokens": 1159746.0,
"step": 599
},
{
"epoch": 0.7566204287515763,
"grad_norm": 4.572359561920166,
"learning_rate": 4.0301507537688446e-06,
"loss": 1.5142,
"mean_token_accuracy": 0.5926640629768372,
"num_tokens": 1161692.0,
"step": 600
},
{
"epoch": 0.7578814627994955,
"grad_norm": 4.29414701461792,
"learning_rate": 4.0201005025125635e-06,
"loss": 1.4277,
"mean_token_accuracy": 0.6287708878517151,
"num_tokens": 1163559.0,
"step": 601
},
{
"epoch": 0.7591424968474149,
"grad_norm": 4.80892276763916,
"learning_rate": 4.010050251256282e-06,
"loss": 1.5654,
"mean_token_accuracy": 0.575875848531723,
"num_tokens": 1165372.0,
"step": 602
},
{
"epoch": 0.7604035308953342,
"grad_norm": 4.304175853729248,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4001,
"mean_token_accuracy": 0.6016526520252228,
"num_tokens": 1167257.0,
"step": 603
},
{
"epoch": 0.7616645649432535,
"grad_norm": 4.059518337249756,
"learning_rate": 3.989949748743719e-06,
"loss": 1.2838,
"mean_token_accuracy": 0.647224634885788,
"num_tokens": 1169254.0,
"step": 604
},
{
"epoch": 0.7629255989911727,
"grad_norm": 4.799088001251221,
"learning_rate": 3.979899497487438e-06,
"loss": 1.4551,
"mean_token_accuracy": 0.6063340604305267,
"num_tokens": 1171048.0,
"step": 605
},
{
"epoch": 0.7641866330390921,
"grad_norm": 4.762022972106934,
"learning_rate": 3.969849246231156e-06,
"loss": 1.6099,
"mean_token_accuracy": 0.5841151475906372,
"num_tokens": 1172943.0,
"step": 606
},
{
"epoch": 0.7654476670870114,
"grad_norm": 4.426660537719727,
"learning_rate": 3.959798994974875e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.6199142932891846,
"num_tokens": 1174952.0,
"step": 607
},
{
"epoch": 0.7667087011349306,
"grad_norm": 4.9128923416137695,
"learning_rate": 3.949748743718593e-06,
"loss": 1.5432,
"mean_token_accuracy": 0.6037909686565399,
"num_tokens": 1176792.0,
"step": 608
},
{
"epoch": 0.7679697351828499,
"grad_norm": 4.370631694793701,
"learning_rate": 3.939698492462311e-06,
"loss": 1.4049,
"mean_token_accuracy": 0.6263740062713623,
"num_tokens": 1178856.0,
"step": 609
},
{
"epoch": 0.7692307692307693,
"grad_norm": 4.397915363311768,
"learning_rate": 3.92964824120603e-06,
"loss": 1.3521,
"mean_token_accuracy": 0.6210023164749146,
"num_tokens": 1180599.0,
"step": 610
},
{
"epoch": 0.7704918032786885,
"grad_norm": 4.44165563583374,
"learning_rate": 3.919597989949749e-06,
"loss": 1.488,
"mean_token_accuracy": 0.5845552682876587,
"num_tokens": 1182446.0,
"step": 611
},
{
"epoch": 0.7717528373266078,
"grad_norm": 4.571348190307617,
"learning_rate": 3.909547738693467e-06,
"loss": 1.3733,
"mean_token_accuracy": 0.5972480475902557,
"num_tokens": 1184361.0,
"step": 612
},
{
"epoch": 0.7730138713745272,
"grad_norm": 4.486395359039307,
"learning_rate": 3.899497487437186e-06,
"loss": 1.4094,
"mean_token_accuracy": 0.6077009737491608,
"num_tokens": 1186296.0,
"step": 613
},
{
"epoch": 0.7742749054224464,
"grad_norm": 4.390718936920166,
"learning_rate": 3.889447236180905e-06,
"loss": 1.4609,
"mean_token_accuracy": 0.6105667650699615,
"num_tokens": 1188264.0,
"step": 614
},
{
"epoch": 0.7755359394703657,
"grad_norm": 4.085479259490967,
"learning_rate": 3.8793969849246234e-06,
"loss": 1.3808,
"mean_token_accuracy": 0.6208994686603546,
"num_tokens": 1190225.0,
"step": 615
},
{
"epoch": 0.776796973518285,
"grad_norm": 4.256773471832275,
"learning_rate": 3.869346733668342e-06,
"loss": 1.4493,
"mean_token_accuracy": 0.6007210910320282,
"num_tokens": 1192274.0,
"step": 616
},
{
"epoch": 0.7780580075662042,
"grad_norm": 4.536936283111572,
"learning_rate": 3.8592964824120606e-06,
"loss": 1.5483,
"mean_token_accuracy": 0.5986529290676117,
"num_tokens": 1194327.0,
"step": 617
},
{
"epoch": 0.7793190416141236,
"grad_norm": 4.602484226226807,
"learning_rate": 3.8492462311557795e-06,
"loss": 1.4892,
"mean_token_accuracy": 0.5982283651828766,
"num_tokens": 1196227.0,
"step": 618
},
{
"epoch": 0.7805800756620429,
"grad_norm": 4.350205898284912,
"learning_rate": 3.839195979899498e-06,
"loss": 1.3968,
"mean_token_accuracy": 0.630781501531601,
"num_tokens": 1198122.0,
"step": 619
},
{
"epoch": 0.7818411097099621,
"grad_norm": 4.328808784484863,
"learning_rate": 3.829145728643217e-06,
"loss": 1.5386,
"mean_token_accuracy": 0.5927076935768127,
"num_tokens": 1200218.0,
"step": 620
},
{
"epoch": 0.7831021437578815,
"grad_norm": 4.0366597175598145,
"learning_rate": 3.819095477386935e-06,
"loss": 1.3376,
"mean_token_accuracy": 0.6228463053703308,
"num_tokens": 1202311.0,
"step": 621
},
{
"epoch": 0.7843631778058008,
"grad_norm": 4.308081150054932,
"learning_rate": 3.809045226130654e-06,
"loss": 1.3082,
"mean_token_accuracy": 0.6177918612957001,
"num_tokens": 1204308.0,
"step": 622
},
{
"epoch": 0.78562421185372,
"grad_norm": 4.21478271484375,
"learning_rate": 3.798994974874372e-06,
"loss": 1.3921,
"mean_token_accuracy": 0.6253844499588013,
"num_tokens": 1206385.0,
"step": 623
},
{
"epoch": 0.7868852459016393,
"grad_norm": 4.382959365844727,
"learning_rate": 3.788944723618091e-06,
"loss": 1.5071,
"mean_token_accuracy": 0.5961708128452301,
"num_tokens": 1208393.0,
"step": 624
},
{
"epoch": 0.7881462799495587,
"grad_norm": 4.760127067565918,
"learning_rate": 3.7788944723618095e-06,
"loss": 1.3599,
"mean_token_accuracy": 0.6244753301143646,
"num_tokens": 1210303.0,
"step": 625
},
{
"epoch": 0.7894073139974779,
"grad_norm": 4.651197910308838,
"learning_rate": 3.768844221105528e-06,
"loss": 1.3909,
"mean_token_accuracy": 0.6095870137214661,
"num_tokens": 1212284.0,
"step": 626
},
{
"epoch": 0.7906683480453972,
"grad_norm": 4.659881591796875,
"learning_rate": 3.7587939698492466e-06,
"loss": 1.4808,
"mean_token_accuracy": 0.5926066935062408,
"num_tokens": 1214147.0,
"step": 627
},
{
"epoch": 0.7919293820933165,
"grad_norm": 4.688325881958008,
"learning_rate": 3.748743718592965e-06,
"loss": 1.4284,
"mean_token_accuracy": 0.6090951859951019,
"num_tokens": 1216040.0,
"step": 628
},
{
"epoch": 0.7931904161412359,
"grad_norm": 4.855409622192383,
"learning_rate": 3.7386934673366837e-06,
"loss": 1.5242,
"mean_token_accuracy": 0.582084596157074,
"num_tokens": 1217799.0,
"step": 629
},
{
"epoch": 0.7944514501891551,
"grad_norm": 4.540473937988281,
"learning_rate": 3.7286432160804027e-06,
"loss": 1.3878,
"mean_token_accuracy": 0.5953293144702911,
"num_tokens": 1219712.0,
"step": 630
},
{
"epoch": 0.7957124842370744,
"grad_norm": 4.534702301025391,
"learning_rate": 3.718592964824121e-06,
"loss": 1.3955,
"mean_token_accuracy": 0.6180699467658997,
"num_tokens": 1221652.0,
"step": 631
},
{
"epoch": 0.7969735182849937,
"grad_norm": 4.40645694732666,
"learning_rate": 3.7085427135678394e-06,
"loss": 1.4383,
"mean_token_accuracy": 0.6130342781543732,
"num_tokens": 1223575.0,
"step": 632
},
{
"epoch": 0.798234552332913,
"grad_norm": 4.6681623458862305,
"learning_rate": 3.698492462311558e-06,
"loss": 1.427,
"mean_token_accuracy": 0.5881649851799011,
"num_tokens": 1225561.0,
"step": 633
},
{
"epoch": 0.7994955863808323,
"grad_norm": 4.4728474617004395,
"learning_rate": 3.6884422110552766e-06,
"loss": 1.4221,
"mean_token_accuracy": 0.6145466864109039,
"num_tokens": 1227379.0,
"step": 634
},
{
"epoch": 0.8007566204287516,
"grad_norm": 4.362063884735107,
"learning_rate": 3.678391959798995e-06,
"loss": 1.4629,
"mean_token_accuracy": 0.5976041853427887,
"num_tokens": 1229400.0,
"step": 635
},
{
"epoch": 0.8020176544766708,
"grad_norm": 4.435058116912842,
"learning_rate": 3.6683417085427137e-06,
"loss": 1.5734,
"mean_token_accuracy": 0.5893145501613617,
"num_tokens": 1231403.0,
"step": 636
},
{
"epoch": 0.8032786885245902,
"grad_norm": 4.274704456329346,
"learning_rate": 3.6582914572864327e-06,
"loss": 1.4007,
"mean_token_accuracy": 0.6151903867721558,
"num_tokens": 1233353.0,
"step": 637
},
{
"epoch": 0.8045397225725095,
"grad_norm": 4.3847975730896,
"learning_rate": 3.648241206030151e-06,
"loss": 1.3496,
"mean_token_accuracy": 0.6147129237651825,
"num_tokens": 1235288.0,
"step": 638
},
{
"epoch": 0.8058007566204287,
"grad_norm": 4.745240688323975,
"learning_rate": 3.63819095477387e-06,
"loss": 1.5088,
"mean_token_accuracy": 0.5913102626800537,
"num_tokens": 1237251.0,
"step": 639
},
{
"epoch": 0.807061790668348,
"grad_norm": 4.760833740234375,
"learning_rate": 3.628140703517588e-06,
"loss": 1.4202,
"mean_token_accuracy": 0.6121359467506409,
"num_tokens": 1239082.0,
"step": 640
},
{
"epoch": 0.8083228247162674,
"grad_norm": 4.281674861907959,
"learning_rate": 3.618090452261307e-06,
"loss": 1.3884,
"mean_token_accuracy": 0.6337145864963531,
"num_tokens": 1241059.0,
"step": 641
},
{
"epoch": 0.8095838587641866,
"grad_norm": 4.366747856140137,
"learning_rate": 3.608040201005025e-06,
"loss": 1.3475,
"mean_token_accuracy": 0.6259979605674744,
"num_tokens": 1242914.0,
"step": 642
},
{
"epoch": 0.8108448928121059,
"grad_norm": 4.431673526763916,
"learning_rate": 3.597989949748744e-06,
"loss": 1.5013,
"mean_token_accuracy": 0.5821367800235748,
"num_tokens": 1244902.0,
"step": 643
},
{
"epoch": 0.8121059268600253,
"grad_norm": 4.428189754486084,
"learning_rate": 3.5879396984924626e-06,
"loss": 1.4184,
"mean_token_accuracy": 0.6101392805576324,
"num_tokens": 1246868.0,
"step": 644
},
{
"epoch": 0.8133669609079445,
"grad_norm": 4.415472507476807,
"learning_rate": 3.577889447236181e-06,
"loss": 1.4531,
"mean_token_accuracy": 0.6098997592926025,
"num_tokens": 1248753.0,
"step": 645
},
{
"epoch": 0.8146279949558638,
"grad_norm": 4.518283843994141,
"learning_rate": 3.5678391959798997e-06,
"loss": 1.4393,
"mean_token_accuracy": 0.6256670653820038,
"num_tokens": 1250582.0,
"step": 646
},
{
"epoch": 0.8158890290037831,
"grad_norm": 4.306365489959717,
"learning_rate": 3.5577889447236187e-06,
"loss": 1.5138,
"mean_token_accuracy": 0.5980395376682281,
"num_tokens": 1252644.0,
"step": 647
},
{
"epoch": 0.8171500630517023,
"grad_norm": 5.037056922912598,
"learning_rate": 3.547738693467337e-06,
"loss": 1.5757,
"mean_token_accuracy": 0.5927419364452362,
"num_tokens": 1254494.0,
"step": 648
},
{
"epoch": 0.8184110970996217,
"grad_norm": 4.363806247711182,
"learning_rate": 3.537688442211056e-06,
"loss": 1.452,
"mean_token_accuracy": 0.6023588478565216,
"num_tokens": 1256495.0,
"step": 649
},
{
"epoch": 0.819672131147541,
"grad_norm": 4.607044696807861,
"learning_rate": 3.527638190954774e-06,
"loss": 1.4529,
"mean_token_accuracy": 0.6298587918281555,
"num_tokens": 1258309.0,
"step": 650
},
{
"epoch": 0.8209331651954602,
"grad_norm": 4.5787482261657715,
"learning_rate": 3.517587939698493e-06,
"loss": 1.4734,
"mean_token_accuracy": 0.6061556339263916,
"num_tokens": 1260298.0,
"step": 651
},
{
"epoch": 0.8221941992433796,
"grad_norm": 4.130124568939209,
"learning_rate": 3.507537688442211e-06,
"loss": 1.3836,
"mean_token_accuracy": 0.6267567276954651,
"num_tokens": 1262427.0,
"step": 652
},
{
"epoch": 0.8234552332912989,
"grad_norm": 4.490095615386963,
"learning_rate": 3.49748743718593e-06,
"loss": 1.3904,
"mean_token_accuracy": 0.6128627061843872,
"num_tokens": 1264349.0,
"step": 653
},
{
"epoch": 0.8247162673392182,
"grad_norm": 4.538565635681152,
"learning_rate": 3.4874371859296487e-06,
"loss": 1.4888,
"mean_token_accuracy": 0.6063677072525024,
"num_tokens": 1266212.0,
"step": 654
},
{
"epoch": 0.8259773013871374,
"grad_norm": 4.711320877075195,
"learning_rate": 3.477386934673367e-06,
"loss": 1.5548,
"mean_token_accuracy": 0.59187251329422,
"num_tokens": 1267999.0,
"step": 655
},
{
"epoch": 0.8272383354350568,
"grad_norm": 4.422387599945068,
"learning_rate": 3.467336683417086e-06,
"loss": 1.3466,
"mean_token_accuracy": 0.6210170686244965,
"num_tokens": 1269937.0,
"step": 656
},
{
"epoch": 0.8284993694829761,
"grad_norm": 4.336258888244629,
"learning_rate": 3.457286432160804e-06,
"loss": 1.3277,
"mean_token_accuracy": 0.629963606595993,
"num_tokens": 1271810.0,
"step": 657
},
{
"epoch": 0.8297604035308953,
"grad_norm": 4.430080413818359,
"learning_rate": 3.447236180904523e-06,
"loss": 1.4157,
"mean_token_accuracy": 0.6269070208072662,
"num_tokens": 1273694.0,
"step": 658
},
{
"epoch": 0.8310214375788146,
"grad_norm": 4.436978816986084,
"learning_rate": 3.437185929648241e-06,
"loss": 1.4499,
"mean_token_accuracy": 0.6197161674499512,
"num_tokens": 1275670.0,
"step": 659
},
{
"epoch": 0.832282471626734,
"grad_norm": 4.394755840301514,
"learning_rate": 3.42713567839196e-06,
"loss": 1.5152,
"mean_token_accuracy": 0.6004717350006104,
"num_tokens": 1277686.0,
"step": 660
},
{
"epoch": 0.8335435056746532,
"grad_norm": 4.80107307434082,
"learning_rate": 3.4170854271356786e-06,
"loss": 1.6007,
"mean_token_accuracy": 0.574835330247879,
"num_tokens": 1279613.0,
"step": 661
},
{
"epoch": 0.8348045397225725,
"grad_norm": 4.431844234466553,
"learning_rate": 3.407035175879397e-06,
"loss": 1.4222,
"mean_token_accuracy": 0.6128553450107574,
"num_tokens": 1281595.0,
"step": 662
},
{
"epoch": 0.8360655737704918,
"grad_norm": 4.554701328277588,
"learning_rate": 3.3969849246231158e-06,
"loss": 1.301,
"mean_token_accuracy": 0.6317387223243713,
"num_tokens": 1283596.0,
"step": 663
},
{
"epoch": 0.8373266078184111,
"grad_norm": 4.815081596374512,
"learning_rate": 3.3869346733668347e-06,
"loss": 1.5612,
"mean_token_accuracy": 0.5957655310630798,
"num_tokens": 1285528.0,
"step": 664
},
{
"epoch": 0.8385876418663304,
"grad_norm": 4.433903694152832,
"learning_rate": 3.376884422110553e-06,
"loss": 1.3869,
"mean_token_accuracy": 0.6024475693702698,
"num_tokens": 1287456.0,
"step": 665
},
{
"epoch": 0.8398486759142497,
"grad_norm": 4.512532711029053,
"learning_rate": 3.366834170854272e-06,
"loss": 1.3795,
"mean_token_accuracy": 0.630597710609436,
"num_tokens": 1289330.0,
"step": 666
},
{
"epoch": 0.8411097099621689,
"grad_norm": 4.5717854499816895,
"learning_rate": 3.35678391959799e-06,
"loss": 1.455,
"mean_token_accuracy": 0.6052423119544983,
"num_tokens": 1291142.0,
"step": 667
},
{
"epoch": 0.8423707440100883,
"grad_norm": 4.499806880950928,
"learning_rate": 3.346733668341709e-06,
"loss": 1.3583,
"mean_token_accuracy": 0.6096299588680267,
"num_tokens": 1293029.0,
"step": 668
},
{
"epoch": 0.8436317780580076,
"grad_norm": 4.7041473388671875,
"learning_rate": 3.336683417085427e-06,
"loss": 1.393,
"mean_token_accuracy": 0.6280495524406433,
"num_tokens": 1294847.0,
"step": 669
},
{
"epoch": 0.8448928121059268,
"grad_norm": 4.718618869781494,
"learning_rate": 3.326633165829146e-06,
"loss": 1.5803,
"mean_token_accuracy": 0.589573085308075,
"num_tokens": 1296756.0,
"step": 670
},
{
"epoch": 0.8461538461538461,
"grad_norm": 4.51429557800293,
"learning_rate": 3.3165829145728647e-06,
"loss": 1.5713,
"mean_token_accuracy": 0.5885864496231079,
"num_tokens": 1298728.0,
"step": 671
},
{
"epoch": 0.8474148802017655,
"grad_norm": 5.319494724273682,
"learning_rate": 3.3065326633165833e-06,
"loss": 1.6109,
"mean_token_accuracy": 0.5749770998954773,
"num_tokens": 1300535.0,
"step": 672
},
{
"epoch": 0.8486759142496847,
"grad_norm": 4.48340368270874,
"learning_rate": 3.296482412060302e-06,
"loss": 1.5013,
"mean_token_accuracy": 0.5988208651542664,
"num_tokens": 1302597.0,
"step": 673
},
{
"epoch": 0.849936948297604,
"grad_norm": 4.54470157623291,
"learning_rate": 3.286432160804021e-06,
"loss": 1.4674,
"mean_token_accuracy": 0.6040069758892059,
"num_tokens": 1304445.0,
"step": 674
},
{
"epoch": 0.8511979823455234,
"grad_norm": 4.200180530548096,
"learning_rate": 3.276381909547739e-06,
"loss": 1.4606,
"mean_token_accuracy": 0.59771728515625,
"num_tokens": 1306592.0,
"step": 675
},
{
"epoch": 0.8524590163934426,
"grad_norm": 4.604515552520752,
"learning_rate": 3.266331658291458e-06,
"loss": 1.3684,
"mean_token_accuracy": 0.6281431317329407,
"num_tokens": 1308472.0,
"step": 676
},
{
"epoch": 0.8537200504413619,
"grad_norm": 4.382105350494385,
"learning_rate": 3.256281407035176e-06,
"loss": 1.3825,
"mean_token_accuracy": 0.6295027732849121,
"num_tokens": 1310610.0,
"step": 677
},
{
"epoch": 0.8549810844892812,
"grad_norm": 4.814754486083984,
"learning_rate": 3.2462311557788946e-06,
"loss": 1.5009,
"mean_token_accuracy": 0.6055155396461487,
"num_tokens": 1312453.0,
"step": 678
},
{
"epoch": 0.8562421185372006,
"grad_norm": 4.793276786804199,
"learning_rate": 3.236180904522613e-06,
"loss": 1.3342,
"mean_token_accuracy": 0.6250176429748535,
"num_tokens": 1314212.0,
"step": 679
},
{
"epoch": 0.8575031525851198,
"grad_norm": 4.565577507019043,
"learning_rate": 3.2261306532663318e-06,
"loss": 1.5052,
"mean_token_accuracy": 0.5999468564987183,
"num_tokens": 1316074.0,
"step": 680
},
{
"epoch": 0.8587641866330391,
"grad_norm": 4.617696285247803,
"learning_rate": 3.2160804020100507e-06,
"loss": 1.474,
"mean_token_accuracy": 0.6138873398303986,
"num_tokens": 1317957.0,
"step": 681
},
{
"epoch": 0.8600252206809584,
"grad_norm": 4.252396106719971,
"learning_rate": 3.206030150753769e-06,
"loss": 1.3459,
"mean_token_accuracy": 0.6233983039855957,
"num_tokens": 1319929.0,
"step": 682
},
{
"epoch": 0.8612862547288777,
"grad_norm": 4.807640552520752,
"learning_rate": 3.195979899497488e-06,
"loss": 1.5199,
"mean_token_accuracy": 0.5885966122150421,
"num_tokens": 1321796.0,
"step": 683
},
{
"epoch": 0.862547288776797,
"grad_norm": 4.889383792877197,
"learning_rate": 3.185929648241206e-06,
"loss": 1.5128,
"mean_token_accuracy": 0.5876397490501404,
"num_tokens": 1323666.0,
"step": 684
},
{
"epoch": 0.8638083228247163,
"grad_norm": 4.820505142211914,
"learning_rate": 3.175879396984925e-06,
"loss": 1.5802,
"mean_token_accuracy": 0.58903369307518,
"num_tokens": 1325573.0,
"step": 685
},
{
"epoch": 0.8650693568726355,
"grad_norm": 4.336293697357178,
"learning_rate": 3.165829145728643e-06,
"loss": 1.3696,
"mean_token_accuracy": 0.6184759736061096,
"num_tokens": 1327590.0,
"step": 686
},
{
"epoch": 0.8663303909205549,
"grad_norm": 4.746899604797363,
"learning_rate": 3.155778894472362e-06,
"loss": 1.5816,
"mean_token_accuracy": 0.5919223725795746,
"num_tokens": 1329387.0,
"step": 687
},
{
"epoch": 0.8675914249684742,
"grad_norm": 4.493013381958008,
"learning_rate": 3.1457286432160807e-06,
"loss": 1.2895,
"mean_token_accuracy": 0.6384941339492798,
"num_tokens": 1331261.0,
"step": 688
},
{
"epoch": 0.8688524590163934,
"grad_norm": 4.692387580871582,
"learning_rate": 3.1356783919597993e-06,
"loss": 1.3633,
"mean_token_accuracy": 0.6215213239192963,
"num_tokens": 1333061.0,
"step": 689
},
{
"epoch": 0.8701134930643127,
"grad_norm": 4.2351460456848145,
"learning_rate": 3.125628140703518e-06,
"loss": 1.3547,
"mean_token_accuracy": 0.6269940733909607,
"num_tokens": 1334999.0,
"step": 690
},
{
"epoch": 0.8713745271122321,
"grad_norm": 4.679457187652588,
"learning_rate": 3.115577889447237e-06,
"loss": 1.4769,
"mean_token_accuracy": 0.597771018743515,
"num_tokens": 1336957.0,
"step": 691
},
{
"epoch": 0.8726355611601513,
"grad_norm": 4.976491928100586,
"learning_rate": 3.105527638190955e-06,
"loss": 1.4483,
"mean_token_accuracy": 0.5990459024906158,
"num_tokens": 1338814.0,
"step": 692
},
{
"epoch": 0.8738965952080706,
"grad_norm": 4.698354721069336,
"learning_rate": 3.095477386934674e-06,
"loss": 1.5794,
"mean_token_accuracy": 0.5691917240619659,
"num_tokens": 1340791.0,
"step": 693
},
{
"epoch": 0.8751576292559899,
"grad_norm": 4.409573554992676,
"learning_rate": 3.085427135678392e-06,
"loss": 1.335,
"mean_token_accuracy": 0.6246625781059265,
"num_tokens": 1342748.0,
"step": 694
},
{
"epoch": 0.8764186633039092,
"grad_norm": 4.804214954376221,
"learning_rate": 3.075376884422111e-06,
"loss": 1.4505,
"mean_token_accuracy": 0.6022706627845764,
"num_tokens": 1344647.0,
"step": 695
},
{
"epoch": 0.8776796973518285,
"grad_norm": 4.49189567565918,
"learning_rate": 3.065326633165829e-06,
"loss": 1.36,
"mean_token_accuracy": 0.6258461177349091,
"num_tokens": 1346581.0,
"step": 696
},
{
"epoch": 0.8789407313997478,
"grad_norm": 4.685361862182617,
"learning_rate": 3.055276381909548e-06,
"loss": 1.4595,
"mean_token_accuracy": 0.6182919144630432,
"num_tokens": 1348444.0,
"step": 697
},
{
"epoch": 0.880201765447667,
"grad_norm": 4.615530967712402,
"learning_rate": 3.0452261306532668e-06,
"loss": 1.2957,
"mean_token_accuracy": 0.6101522147655487,
"num_tokens": 1350314.0,
"step": 698
},
{
"epoch": 0.8814627994955864,
"grad_norm": 5.159006595611572,
"learning_rate": 3.0351758793969853e-06,
"loss": 1.5901,
"mean_token_accuracy": 0.5731832981109619,
"num_tokens": 1352111.0,
"step": 699
},
{
"epoch": 0.8827238335435057,
"grad_norm": 4.202300071716309,
"learning_rate": 3.025125628140704e-06,
"loss": 1.4044,
"mean_token_accuracy": 0.6054458022117615,
"num_tokens": 1354114.0,
"step": 700
},
{
"epoch": 0.8839848675914249,
"grad_norm": 5.155057907104492,
"learning_rate": 3.015075376884422e-06,
"loss": 1.5627,
"mean_token_accuracy": 0.5730942487716675,
"num_tokens": 1356043.0,
"step": 701
},
{
"epoch": 0.8852459016393442,
"grad_norm": 4.509034156799316,
"learning_rate": 3.005025125628141e-06,
"loss": 1.4089,
"mean_token_accuracy": 0.6151978075504303,
"num_tokens": 1358118.0,
"step": 702
},
{
"epoch": 0.8865069356872636,
"grad_norm": 4.847167491912842,
"learning_rate": 2.994974874371859e-06,
"loss": 1.4613,
"mean_token_accuracy": 0.6082049012184143,
"num_tokens": 1360003.0,
"step": 703
},
{
"epoch": 0.8877679697351829,
"grad_norm": 4.529287338256836,
"learning_rate": 2.984924623115578e-06,
"loss": 1.4005,
"mean_token_accuracy": 0.6232767403125763,
"num_tokens": 1361950.0,
"step": 704
},
{
"epoch": 0.8890290037831021,
"grad_norm": 4.487297058105469,
"learning_rate": 2.9748743718592967e-06,
"loss": 1.3502,
"mean_token_accuracy": 0.6181976497173309,
"num_tokens": 1364018.0,
"step": 705
},
{
"epoch": 0.8902900378310215,
"grad_norm": 5.078628063201904,
"learning_rate": 2.9648241206030153e-06,
"loss": 1.5075,
"mean_token_accuracy": 0.5930404663085938,
"num_tokens": 1365836.0,
"step": 706
},
{
"epoch": 0.8915510718789408,
"grad_norm": 4.510313987731934,
"learning_rate": 2.954773869346734e-06,
"loss": 1.3429,
"mean_token_accuracy": 0.6260442435741425,
"num_tokens": 1367764.0,
"step": 707
},
{
"epoch": 0.89281210592686,
"grad_norm": 4.804049015045166,
"learning_rate": 2.9447236180904524e-06,
"loss": 1.5816,
"mean_token_accuracy": 0.5857201814651489,
"num_tokens": 1369711.0,
"step": 708
},
{
"epoch": 0.8940731399747793,
"grad_norm": 5.134244918823242,
"learning_rate": 2.934673366834171e-06,
"loss": 1.4602,
"mean_token_accuracy": 0.5985281467437744,
"num_tokens": 1371602.0,
"step": 709
},
{
"epoch": 0.8953341740226987,
"grad_norm": 4.867412567138672,
"learning_rate": 2.92462311557789e-06,
"loss": 1.4093,
"mean_token_accuracy": 0.6199544668197632,
"num_tokens": 1373503.0,
"step": 710
},
{
"epoch": 0.8965952080706179,
"grad_norm": 4.599572658538818,
"learning_rate": 2.914572864321608e-06,
"loss": 1.4213,
"mean_token_accuracy": 0.6173719167709351,
"num_tokens": 1375473.0,
"step": 711
},
{
"epoch": 0.8978562421185372,
"grad_norm": 4.485718250274658,
"learning_rate": 2.904522613065327e-06,
"loss": 1.3855,
"mean_token_accuracy": 0.6124942898750305,
"num_tokens": 1377487.0,
"step": 712
},
{
"epoch": 0.8991172761664565,
"grad_norm": 4.551990032196045,
"learning_rate": 2.894472361809045e-06,
"loss": 1.2479,
"mean_token_accuracy": 0.6429057419300079,
"num_tokens": 1379357.0,
"step": 713
},
{
"epoch": 0.9003783102143758,
"grad_norm": 4.8430352210998535,
"learning_rate": 2.884422110552764e-06,
"loss": 1.3304,
"mean_token_accuracy": 0.647765189409256,
"num_tokens": 1381137.0,
"step": 714
},
{
"epoch": 0.9016393442622951,
"grad_norm": 4.718348503112793,
"learning_rate": 2.8743718592964828e-06,
"loss": 1.4785,
"mean_token_accuracy": 0.5891905426979065,
"num_tokens": 1383021.0,
"step": 715
},
{
"epoch": 0.9029003783102144,
"grad_norm": 4.648007392883301,
"learning_rate": 2.8643216080402013e-06,
"loss": 1.3729,
"mean_token_accuracy": 0.6156680881977081,
"num_tokens": 1384944.0,
"step": 716
},
{
"epoch": 0.9041614123581336,
"grad_norm": 4.859986305236816,
"learning_rate": 2.85427135678392e-06,
"loss": 1.4065,
"mean_token_accuracy": 0.6208228170871735,
"num_tokens": 1386856.0,
"step": 717
},
{
"epoch": 0.905422446406053,
"grad_norm": 4.757213115692139,
"learning_rate": 2.8442211055276384e-06,
"loss": 1.4957,
"mean_token_accuracy": 0.6022606790065765,
"num_tokens": 1388890.0,
"step": 718
},
{
"epoch": 0.9066834804539723,
"grad_norm": 4.622291564941406,
"learning_rate": 2.834170854271357e-06,
"loss": 1.4246,
"mean_token_accuracy": 0.6103483438491821,
"num_tokens": 1390911.0,
"step": 719
},
{
"epoch": 0.9079445145018915,
"grad_norm": 4.5715484619140625,
"learning_rate": 2.824120603015076e-06,
"loss": 1.2987,
"mean_token_accuracy": 0.6245031952857971,
"num_tokens": 1392827.0,
"step": 720
},
{
"epoch": 0.9092055485498108,
"grad_norm": 4.618542194366455,
"learning_rate": 2.814070351758794e-06,
"loss": 1.3845,
"mean_token_accuracy": 0.610100269317627,
"num_tokens": 1394748.0,
"step": 721
},
{
"epoch": 0.9104665825977302,
"grad_norm": 4.695566654205322,
"learning_rate": 2.804020100502513e-06,
"loss": 1.3981,
"mean_token_accuracy": 0.6025079786777496,
"num_tokens": 1396738.0,
"step": 722
},
{
"epoch": 0.9117276166456494,
"grad_norm": 4.790529251098633,
"learning_rate": 2.7939698492462313e-06,
"loss": 1.4573,
"mean_token_accuracy": 0.5892115533351898,
"num_tokens": 1398750.0,
"step": 723
},
{
"epoch": 0.9129886506935687,
"grad_norm": 4.78794002532959,
"learning_rate": 2.78391959798995e-06,
"loss": 1.462,
"mean_token_accuracy": 0.6218155920505524,
"num_tokens": 1400699.0,
"step": 724
},
{
"epoch": 0.914249684741488,
"grad_norm": 4.759215354919434,
"learning_rate": 2.7738693467336684e-06,
"loss": 1.4259,
"mean_token_accuracy": 0.6136111617088318,
"num_tokens": 1402645.0,
"step": 725
},
{
"epoch": 0.9155107187894073,
"grad_norm": 4.643594741821289,
"learning_rate": 2.763819095477387e-06,
"loss": 1.427,
"mean_token_accuracy": 0.6215586960315704,
"num_tokens": 1404573.0,
"step": 726
},
{
"epoch": 0.9167717528373266,
"grad_norm": 4.983770370483398,
"learning_rate": 2.753768844221106e-06,
"loss": 1.4837,
"mean_token_accuracy": 0.6027411818504333,
"num_tokens": 1406370.0,
"step": 727
},
{
"epoch": 0.9180327868852459,
"grad_norm": 4.207434177398682,
"learning_rate": 2.743718592964824e-06,
"loss": 1.3002,
"mean_token_accuracy": 0.6143061518669128,
"num_tokens": 1408350.0,
"step": 728
},
{
"epoch": 0.9192938209331651,
"grad_norm": 4.515687465667725,
"learning_rate": 2.733668341708543e-06,
"loss": 1.35,
"mean_token_accuracy": 0.6112845540046692,
"num_tokens": 1410230.0,
"step": 729
},
{
"epoch": 0.9205548549810845,
"grad_norm": 4.651388645172119,
"learning_rate": 2.723618090452261e-06,
"loss": 1.2633,
"mean_token_accuracy": 0.662914365530014,
"num_tokens": 1412105.0,
"step": 730
},
{
"epoch": 0.9218158890290038,
"grad_norm": 4.572433948516846,
"learning_rate": 2.71356783919598e-06,
"loss": 1.4194,
"mean_token_accuracy": 0.6180647909641266,
"num_tokens": 1414080.0,
"step": 731
},
{
"epoch": 0.9230769230769231,
"grad_norm": 4.718845367431641,
"learning_rate": 2.7035175879396983e-06,
"loss": 1.4143,
"mean_token_accuracy": 0.6059090197086334,
"num_tokens": 1416061.0,
"step": 732
},
{
"epoch": 0.9243379571248423,
"grad_norm": 4.787539005279541,
"learning_rate": 2.6934673366834173e-06,
"loss": 1.4048,
"mean_token_accuracy": 0.6251717209815979,
"num_tokens": 1417916.0,
"step": 733
},
{
"epoch": 0.9255989911727617,
"grad_norm": 4.552307605743408,
"learning_rate": 2.683417085427136e-06,
"loss": 1.4179,
"mean_token_accuracy": 0.6130162477493286,
"num_tokens": 1419914.0,
"step": 734
},
{
"epoch": 0.926860025220681,
"grad_norm": 4.877958297729492,
"learning_rate": 2.6733668341708545e-06,
"loss": 1.4155,
"mean_token_accuracy": 0.6053019165992737,
"num_tokens": 1421723.0,
"step": 735
},
{
"epoch": 0.9281210592686002,
"grad_norm": 4.281277179718018,
"learning_rate": 2.663316582914573e-06,
"loss": 1.3621,
"mean_token_accuracy": 0.6375770568847656,
"num_tokens": 1423771.0,
"step": 736
},
{
"epoch": 0.9293820933165196,
"grad_norm": 4.664444446563721,
"learning_rate": 2.653266331658292e-06,
"loss": 1.4706,
"mean_token_accuracy": 0.6099792718887329,
"num_tokens": 1425701.0,
"step": 737
},
{
"epoch": 0.9306431273644389,
"grad_norm": 4.789531707763672,
"learning_rate": 2.64321608040201e-06,
"loss": 1.4989,
"mean_token_accuracy": 0.6028049290180206,
"num_tokens": 1427611.0,
"step": 738
},
{
"epoch": 0.9319041614123581,
"grad_norm": 4.176892280578613,
"learning_rate": 2.633165829145729e-06,
"loss": 1.2358,
"mean_token_accuracy": 0.6365175545215607,
"num_tokens": 1429695.0,
"step": 739
},
{
"epoch": 0.9331651954602774,
"grad_norm": 4.739315986633301,
"learning_rate": 2.6231155778894473e-06,
"loss": 1.3257,
"mean_token_accuracy": 0.6217921674251556,
"num_tokens": 1431842.0,
"step": 740
},
{
"epoch": 0.9344262295081968,
"grad_norm": 4.4287800788879395,
"learning_rate": 2.6130653266331663e-06,
"loss": 1.2712,
"mean_token_accuracy": 0.6399266123771667,
"num_tokens": 1433802.0,
"step": 741
},
{
"epoch": 0.935687263556116,
"grad_norm": 4.2787251472473145,
"learning_rate": 2.6030150753768844e-06,
"loss": 1.3577,
"mean_token_accuracy": 0.6182697415351868,
"num_tokens": 1435927.0,
"step": 742
},
{
"epoch": 0.9369482976040353,
"grad_norm": 4.666017532348633,
"learning_rate": 2.5929648241206034e-06,
"loss": 1.4217,
"mean_token_accuracy": 0.6318005323410034,
"num_tokens": 1437917.0,
"step": 743
},
{
"epoch": 0.9382093316519546,
"grad_norm": 4.566494941711426,
"learning_rate": 2.582914572864322e-06,
"loss": 1.3476,
"mean_token_accuracy": 0.6440158486366272,
"num_tokens": 1439816.0,
"step": 744
},
{
"epoch": 0.9394703656998739,
"grad_norm": 4.964987277984619,
"learning_rate": 2.5728643216080405e-06,
"loss": 1.5741,
"mean_token_accuracy": 0.5829296708106995,
"num_tokens": 1441693.0,
"step": 745
},
{
"epoch": 0.9407313997477932,
"grad_norm": 4.58892822265625,
"learning_rate": 2.562814070351759e-06,
"loss": 1.4067,
"mean_token_accuracy": 0.6247598826885223,
"num_tokens": 1443618.0,
"step": 746
},
{
"epoch": 0.9419924337957125,
"grad_norm": 4.80293083190918,
"learning_rate": 2.5527638190954772e-06,
"loss": 1.4583,
"mean_token_accuracy": 0.5979767441749573,
"num_tokens": 1445541.0,
"step": 747
},
{
"epoch": 0.9432534678436317,
"grad_norm": 4.42380952835083,
"learning_rate": 2.542713567839196e-06,
"loss": 1.3464,
"mean_token_accuracy": 0.6280787885189056,
"num_tokens": 1447530.0,
"step": 748
},
{
"epoch": 0.9445145018915511,
"grad_norm": 4.366753578186035,
"learning_rate": 2.5326633165829143e-06,
"loss": 1.2985,
"mean_token_accuracy": 0.6283706724643707,
"num_tokens": 1449581.0,
"step": 749
},
{
"epoch": 0.9457755359394704,
"grad_norm": 5.115789413452148,
"learning_rate": 2.5226130653266333e-06,
"loss": 1.468,
"mean_token_accuracy": 0.5940811038017273,
"num_tokens": 1451352.0,
"step": 750
},
{
"epoch": 0.9470365699873896,
"grad_norm": 5.07560920715332,
"learning_rate": 2.512562814070352e-06,
"loss": 1.4144,
"mean_token_accuracy": 0.6068819165229797,
"num_tokens": 1453291.0,
"step": 751
},
{
"epoch": 0.9482976040353089,
"grad_norm": 5.055481910705566,
"learning_rate": 2.5025125628140705e-06,
"loss": 1.5266,
"mean_token_accuracy": 0.5825677812099457,
"num_tokens": 1455216.0,
"step": 752
},
{
"epoch": 0.9495586380832283,
"grad_norm": 4.670107364654541,
"learning_rate": 2.492462311557789e-06,
"loss": 1.3108,
"mean_token_accuracy": 0.639474481344223,
"num_tokens": 1457181.0,
"step": 753
},
{
"epoch": 0.9508196721311475,
"grad_norm": 5.033635139465332,
"learning_rate": 2.482412060301508e-06,
"loss": 1.4711,
"mean_token_accuracy": 0.6088995933532715,
"num_tokens": 1459026.0,
"step": 754
},
{
"epoch": 0.9520807061790668,
"grad_norm": 4.375858783721924,
"learning_rate": 2.4723618090452266e-06,
"loss": 1.3317,
"mean_token_accuracy": 0.632611870765686,
"num_tokens": 1461001.0,
"step": 755
},
{
"epoch": 0.9533417402269861,
"grad_norm": 4.497524261474609,
"learning_rate": 2.462311557788945e-06,
"loss": 1.3779,
"mean_token_accuracy": 0.6088739335536957,
"num_tokens": 1463014.0,
"step": 756
},
{
"epoch": 0.9546027742749055,
"grad_norm": 5.146448135375977,
"learning_rate": 2.4522613065326637e-06,
"loss": 1.5071,
"mean_token_accuracy": 0.5854723751544952,
"num_tokens": 1464864.0,
"step": 757
},
{
"epoch": 0.9558638083228247,
"grad_norm": 4.540935516357422,
"learning_rate": 2.4422110552763823e-06,
"loss": 1.3522,
"mean_token_accuracy": 0.6171889901161194,
"num_tokens": 1466795.0,
"step": 758
},
{
"epoch": 0.957124842370744,
"grad_norm": 4.848739147186279,
"learning_rate": 2.4321608040201004e-06,
"loss": 1.4463,
"mean_token_accuracy": 0.5893578827381134,
"num_tokens": 1468671.0,
"step": 759
},
{
"epoch": 0.9583858764186634,
"grad_norm": 4.940702438354492,
"learning_rate": 2.422110552763819e-06,
"loss": 1.427,
"mean_token_accuracy": 0.578980565071106,
"num_tokens": 1470508.0,
"step": 760
},
{
"epoch": 0.9596469104665826,
"grad_norm": 4.815670490264893,
"learning_rate": 2.412060301507538e-06,
"loss": 1.3316,
"mean_token_accuracy": 0.6391838192939758,
"num_tokens": 1472378.0,
"step": 761
},
{
"epoch": 0.9609079445145019,
"grad_norm": 4.629350185394287,
"learning_rate": 2.4020100502512565e-06,
"loss": 1.4238,
"mean_token_accuracy": 0.6231440901756287,
"num_tokens": 1474409.0,
"step": 762
},
{
"epoch": 0.9621689785624212,
"grad_norm": 5.507829666137695,
"learning_rate": 2.391959798994975e-06,
"loss": 1.625,
"mean_token_accuracy": 0.5813977718353271,
"num_tokens": 1476222.0,
"step": 763
},
{
"epoch": 0.9634300126103404,
"grad_norm": 4.723374843597412,
"learning_rate": 2.3819095477386936e-06,
"loss": 1.3546,
"mean_token_accuracy": 0.6329129934310913,
"num_tokens": 1478087.0,
"step": 764
},
{
"epoch": 0.9646910466582598,
"grad_norm": 4.788589954376221,
"learning_rate": 2.371859296482412e-06,
"loss": 1.4588,
"mean_token_accuracy": 0.6007176041603088,
"num_tokens": 1479994.0,
"step": 765
},
{
"epoch": 0.9659520807061791,
"grad_norm": 4.644597053527832,
"learning_rate": 2.3618090452261308e-06,
"loss": 1.3886,
"mean_token_accuracy": 0.6195676028728485,
"num_tokens": 1481926.0,
"step": 766
},
{
"epoch": 0.9672131147540983,
"grad_norm": 4.900524616241455,
"learning_rate": 2.3517587939698493e-06,
"loss": 1.5086,
"mean_token_accuracy": 0.5896070003509521,
"num_tokens": 1483763.0,
"step": 767
},
{
"epoch": 0.9684741488020177,
"grad_norm": 4.506715774536133,
"learning_rate": 2.341708542713568e-06,
"loss": 1.3994,
"mean_token_accuracy": 0.6048212349414825,
"num_tokens": 1485638.0,
"step": 768
},
{
"epoch": 0.969735182849937,
"grad_norm": 4.923521518707275,
"learning_rate": 2.3316582914572865e-06,
"loss": 1.4353,
"mean_token_accuracy": 0.6291594505310059,
"num_tokens": 1487423.0,
"step": 769
},
{
"epoch": 0.9709962168978562,
"grad_norm": 4.444231986999512,
"learning_rate": 2.321608040201005e-06,
"loss": 1.3988,
"mean_token_accuracy": 0.6323827803134918,
"num_tokens": 1489443.0,
"step": 770
},
{
"epoch": 0.9722572509457755,
"grad_norm": 4.672366142272949,
"learning_rate": 2.311557788944724e-06,
"loss": 1.5099,
"mean_token_accuracy": 0.5915838181972504,
"num_tokens": 1491513.0,
"step": 771
},
{
"epoch": 0.9735182849936949,
"grad_norm": 4.950202465057373,
"learning_rate": 2.3015075376884426e-06,
"loss": 1.3394,
"mean_token_accuracy": 0.6242406666278839,
"num_tokens": 1493366.0,
"step": 772
},
{
"epoch": 0.9747793190416141,
"grad_norm": 4.696926116943359,
"learning_rate": 2.291457286432161e-06,
"loss": 1.388,
"mean_token_accuracy": 0.6303937137126923,
"num_tokens": 1495258.0,
"step": 773
},
{
"epoch": 0.9760403530895334,
"grad_norm": 4.909535884857178,
"learning_rate": 2.2814070351758797e-06,
"loss": 1.465,
"mean_token_accuracy": 0.5960476994514465,
"num_tokens": 1497097.0,
"step": 774
},
{
"epoch": 0.9773013871374527,
"grad_norm": 4.92638635635376,
"learning_rate": 2.2713567839195983e-06,
"loss": 1.598,
"mean_token_accuracy": 0.5790300369262695,
"num_tokens": 1499002.0,
"step": 775
},
{
"epoch": 0.978562421185372,
"grad_norm": 5.75028133392334,
"learning_rate": 2.261306532663317e-06,
"loss": 1.6262,
"mean_token_accuracy": 0.5684725046157837,
"num_tokens": 1500739.0,
"step": 776
},
{
"epoch": 0.9798234552332913,
"grad_norm": 4.659389972686768,
"learning_rate": 2.2512562814070354e-06,
"loss": 1.4997,
"mean_token_accuracy": 0.6040664911270142,
"num_tokens": 1502766.0,
"step": 777
},
{
"epoch": 0.9810844892812106,
"grad_norm": 4.826159954071045,
"learning_rate": 2.241206030150754e-06,
"loss": 1.3497,
"mean_token_accuracy": 0.6199261844158173,
"num_tokens": 1504543.0,
"step": 778
},
{
"epoch": 0.9823455233291298,
"grad_norm": 4.797970771789551,
"learning_rate": 2.2311557788944725e-06,
"loss": 1.4605,
"mean_token_accuracy": 0.6027579307556152,
"num_tokens": 1506581.0,
"step": 779
},
{
"epoch": 0.9836065573770492,
"grad_norm": 5.0854291915893555,
"learning_rate": 2.221105527638191e-06,
"loss": 1.4007,
"mean_token_accuracy": 0.6198207437992096,
"num_tokens": 1508414.0,
"step": 780
},
{
"epoch": 0.9848675914249685,
"grad_norm": 4.761533260345459,
"learning_rate": 2.21105527638191e-06,
"loss": 1.3869,
"mean_token_accuracy": 0.6144894063472748,
"num_tokens": 1510323.0,
"step": 781
},
{
"epoch": 0.9861286254728878,
"grad_norm": 4.917034149169922,
"learning_rate": 2.2010050251256282e-06,
"loss": 1.5707,
"mean_token_accuracy": 0.5760520994663239,
"num_tokens": 1512444.0,
"step": 782
},
{
"epoch": 0.987389659520807,
"grad_norm": 4.831483364105225,
"learning_rate": 2.1909547738693468e-06,
"loss": 1.376,
"mean_token_accuracy": 0.603929340839386,
"num_tokens": 1514339.0,
"step": 783
},
{
"epoch": 0.9886506935687264,
"grad_norm": 4.954286098480225,
"learning_rate": 2.1809045226130653e-06,
"loss": 1.5463,
"mean_token_accuracy": 0.5951487720012665,
"num_tokens": 1516264.0,
"step": 784
},
{
"epoch": 0.9899117276166457,
"grad_norm": 5.119381904602051,
"learning_rate": 2.170854271356784e-06,
"loss": 1.4713,
"mean_token_accuracy": 0.5979754626750946,
"num_tokens": 1518121.0,
"step": 785
},
{
"epoch": 0.9911727616645649,
"grad_norm": 4.790980339050293,
"learning_rate": 2.1608040201005025e-06,
"loss": 1.3491,
"mean_token_accuracy": 0.6425934433937073,
"num_tokens": 1519948.0,
"step": 786
},
{
"epoch": 0.9924337957124842,
"grad_norm": 4.7805495262146,
"learning_rate": 2.150753768844221e-06,
"loss": 1.2154,
"mean_token_accuracy": 0.6537107527256012,
"num_tokens": 1521854.0,
"step": 787
},
{
"epoch": 0.9936948297604036,
"grad_norm": 4.60954475402832,
"learning_rate": 2.14070351758794e-06,
"loss": 1.2991,
"mean_token_accuracy": 0.6362617313861847,
"num_tokens": 1523793.0,
"step": 788
},
{
"epoch": 0.9949558638083228,
"grad_norm": 4.655366897583008,
"learning_rate": 2.1306532663316586e-06,
"loss": 1.3924,
"mean_token_accuracy": 0.6305295526981354,
"num_tokens": 1525902.0,
"step": 789
},
{
"epoch": 0.9962168978562421,
"grad_norm": 5.052053451538086,
"learning_rate": 2.120603015075377e-06,
"loss": 1.392,
"mean_token_accuracy": 0.6316825747489929,
"num_tokens": 1527769.0,
"step": 790
},
{
"epoch": 0.9974779319041615,
"grad_norm": 4.823788166046143,
"learning_rate": 2.1105527638190957e-06,
"loss": 1.5008,
"mean_token_accuracy": 0.5882376730442047,
"num_tokens": 1530004.0,
"step": 791
},
{
"epoch": 0.9987389659520807,
"grad_norm": 5.215896129608154,
"learning_rate": 2.1005025125628143e-06,
"loss": 1.4967,
"mean_token_accuracy": 0.6016424000263214,
"num_tokens": 1531801.0,
"step": 792
},
{
"epoch": 1.0,
"grad_norm": 5.053985595703125,
"learning_rate": 2.090452261306533e-06,
"loss": 1.4931,
"mean_token_accuracy": 0.6124192774295807,
"num_tokens": 1533640.0,
"step": 793
},
{
"epoch": 1.0012610340479193,
"grad_norm": 4.859620094299316,
"learning_rate": 2.0804020100502514e-06,
"loss": 1.4346,
"mean_token_accuracy": 0.6008819937705994,
"num_tokens": 1535732.0,
"step": 794
},
{
"epoch": 1.0025220680958387,
"grad_norm": 4.924972057342529,
"learning_rate": 2.07035175879397e-06,
"loss": 1.3304,
"mean_token_accuracy": 0.6425945162773132,
"num_tokens": 1537619.0,
"step": 795
},
{
"epoch": 1.003783102143758,
"grad_norm": 4.552638053894043,
"learning_rate": 2.0603015075376885e-06,
"loss": 1.4359,
"mean_token_accuracy": 0.6067358255386353,
"num_tokens": 1539634.0,
"step": 796
},
{
"epoch": 1.005044136191677,
"grad_norm": 5.06874418258667,
"learning_rate": 2.050251256281407e-06,
"loss": 1.5327,
"mean_token_accuracy": 0.6117250621318817,
"num_tokens": 1541468.0,
"step": 797
},
{
"epoch": 1.0063051702395964,
"grad_norm": 5.16888427734375,
"learning_rate": 2.0402010050251257e-06,
"loss": 1.4799,
"mean_token_accuracy": 0.615227073431015,
"num_tokens": 1543264.0,
"step": 798
},
{
"epoch": 1.0075662042875158,
"grad_norm": 4.693305969238281,
"learning_rate": 2.0301507537688446e-06,
"loss": 1.2996,
"mean_token_accuracy": 0.6404381990432739,
"num_tokens": 1545137.0,
"step": 799
},
{
"epoch": 1.008827238335435,
"grad_norm": 4.820840835571289,
"learning_rate": 2.020100502512563e-06,
"loss": 1.4245,
"mean_token_accuracy": 0.6199676990509033,
"num_tokens": 1547213.0,
"step": 800
},
{
"epoch": 1.0100882723833544,
"grad_norm": 5.359424591064453,
"learning_rate": 2.0100502512562818e-06,
"loss": 1.5496,
"mean_token_accuracy": 0.5789197981357574,
"num_tokens": 1548973.0,
"step": 801
},
{
"epoch": 1.0113493064312737,
"grad_norm": 5.1563262939453125,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4537,
"mean_token_accuracy": 0.5936600565910339,
"num_tokens": 1550755.0,
"step": 802
},
{
"epoch": 1.0126103404791928,
"grad_norm": 4.531444072723389,
"learning_rate": 1.989949748743719e-06,
"loss": 1.2974,
"mean_token_accuracy": 0.6466964781284332,
"num_tokens": 1552830.0,
"step": 803
},
{
"epoch": 1.0138713745271122,
"grad_norm": 4.835750579833984,
"learning_rate": 1.9798994974874375e-06,
"loss": 1.477,
"mean_token_accuracy": 0.5901938676834106,
"num_tokens": 1554850.0,
"step": 804
},
{
"epoch": 1.0151324085750315,
"grad_norm": 4.399245262145996,
"learning_rate": 1.9698492462311556e-06,
"loss": 1.4017,
"mean_token_accuracy": 0.626316249370575,
"num_tokens": 1556904.0,
"step": 805
},
{
"epoch": 1.0163934426229508,
"grad_norm": 5.141709327697754,
"learning_rate": 1.9597989949748746e-06,
"loss": 1.3327,
"mean_token_accuracy": 0.6160070896148682,
"num_tokens": 1558816.0,
"step": 806
},
{
"epoch": 1.0176544766708702,
"grad_norm": 4.775214672088623,
"learning_rate": 1.949748743718593e-06,
"loss": 1.3999,
"mean_token_accuracy": 0.6168196499347687,
"num_tokens": 1560702.0,
"step": 807
},
{
"epoch": 1.0189155107187895,
"grad_norm": 5.105827331542969,
"learning_rate": 1.9396984924623117e-06,
"loss": 1.4751,
"mean_token_accuracy": 0.6149435937404633,
"num_tokens": 1562529.0,
"step": 808
},
{
"epoch": 1.0201765447667086,
"grad_norm": 4.85412073135376,
"learning_rate": 1.9296482412060303e-06,
"loss": 1.3805,
"mean_token_accuracy": 0.6386726498603821,
"num_tokens": 1564418.0,
"step": 809
},
{
"epoch": 1.021437578814628,
"grad_norm": 4.5922064781188965,
"learning_rate": 1.919597989949749e-06,
"loss": 1.404,
"mean_token_accuracy": 0.6304318010807037,
"num_tokens": 1566411.0,
"step": 810
},
{
"epoch": 1.0226986128625473,
"grad_norm": 4.626622676849365,
"learning_rate": 1.9095477386934674e-06,
"loss": 1.2757,
"mean_token_accuracy": 0.6312589943408966,
"num_tokens": 1568268.0,
"step": 811
},
{
"epoch": 1.0239596469104666,
"grad_norm": 4.924299716949463,
"learning_rate": 1.899497487437186e-06,
"loss": 1.4809,
"mean_token_accuracy": 0.5968372523784637,
"num_tokens": 1570241.0,
"step": 812
},
{
"epoch": 1.025220680958386,
"grad_norm": 4.954929828643799,
"learning_rate": 1.8894472361809047e-06,
"loss": 1.4648,
"mean_token_accuracy": 0.5949355065822601,
"num_tokens": 1572138.0,
"step": 813
},
{
"epoch": 1.0264817150063053,
"grad_norm": 4.762870788574219,
"learning_rate": 1.8793969849246233e-06,
"loss": 1.3555,
"mean_token_accuracy": 0.6090791821479797,
"num_tokens": 1574119.0,
"step": 814
},
{
"epoch": 1.0277427490542244,
"grad_norm": 4.991880893707275,
"learning_rate": 1.8693467336683419e-06,
"loss": 1.5048,
"mean_token_accuracy": 0.5935323536396027,
"num_tokens": 1576022.0,
"step": 815
},
{
"epoch": 1.0290037831021437,
"grad_norm": 4.592222213745117,
"learning_rate": 1.8592964824120604e-06,
"loss": 1.3832,
"mean_token_accuracy": 0.6328803896903992,
"num_tokens": 1578031.0,
"step": 816
},
{
"epoch": 1.030264817150063,
"grad_norm": 4.974219799041748,
"learning_rate": 1.849246231155779e-06,
"loss": 1.4417,
"mean_token_accuracy": 0.6144815683364868,
"num_tokens": 1579953.0,
"step": 817
},
{
"epoch": 1.0315258511979823,
"grad_norm": 4.911769390106201,
"learning_rate": 1.8391959798994976e-06,
"loss": 1.3348,
"mean_token_accuracy": 0.6101964116096497,
"num_tokens": 1581942.0,
"step": 818
},
{
"epoch": 1.0327868852459017,
"grad_norm": 4.82524299621582,
"learning_rate": 1.8291457286432163e-06,
"loss": 1.4193,
"mean_token_accuracy": 0.6024172008037567,
"num_tokens": 1583802.0,
"step": 819
},
{
"epoch": 1.034047919293821,
"grad_norm": 5.037111282348633,
"learning_rate": 1.819095477386935e-06,
"loss": 1.4243,
"mean_token_accuracy": 0.5954607129096985,
"num_tokens": 1585699.0,
"step": 820
},
{
"epoch": 1.0353089533417403,
"grad_norm": 5.0072174072265625,
"learning_rate": 1.8090452261306535e-06,
"loss": 1.5389,
"mean_token_accuracy": 0.5851928889751434,
"num_tokens": 1587730.0,
"step": 821
},
{
"epoch": 1.0365699873896594,
"grad_norm": 5.133855819702148,
"learning_rate": 1.798994974874372e-06,
"loss": 1.4712,
"mean_token_accuracy": 0.604417622089386,
"num_tokens": 1589520.0,
"step": 822
},
{
"epoch": 1.0378310214375788,
"grad_norm": 4.941107749938965,
"learning_rate": 1.7889447236180906e-06,
"loss": 1.2498,
"mean_token_accuracy": 0.6343302428722382,
"num_tokens": 1591363.0,
"step": 823
},
{
"epoch": 1.039092055485498,
"grad_norm": 4.669431209564209,
"learning_rate": 1.7788944723618094e-06,
"loss": 1.4315,
"mean_token_accuracy": 0.6125501692295074,
"num_tokens": 1593345.0,
"step": 824
},
{
"epoch": 1.0403530895334174,
"grad_norm": 4.921899795532227,
"learning_rate": 1.768844221105528e-06,
"loss": 1.4912,
"mean_token_accuracy": 0.6139432787895203,
"num_tokens": 1595264.0,
"step": 825
},
{
"epoch": 1.0416141235813368,
"grad_norm": 4.989323616027832,
"learning_rate": 1.7587939698492465e-06,
"loss": 1.4128,
"mean_token_accuracy": 0.6252074241638184,
"num_tokens": 1597177.0,
"step": 826
},
{
"epoch": 1.042875157629256,
"grad_norm": 4.729038238525391,
"learning_rate": 1.748743718592965e-06,
"loss": 1.3916,
"mean_token_accuracy": 0.6020833551883698,
"num_tokens": 1599154.0,
"step": 827
},
{
"epoch": 1.0441361916771752,
"grad_norm": 5.2605299949646,
"learning_rate": 1.7386934673366834e-06,
"loss": 1.451,
"mean_token_accuracy": 0.6022846102714539,
"num_tokens": 1600975.0,
"step": 828
},
{
"epoch": 1.0453972257250945,
"grad_norm": 4.817141056060791,
"learning_rate": 1.728643216080402e-06,
"loss": 1.3798,
"mean_token_accuracy": 0.6150883734226227,
"num_tokens": 1602854.0,
"step": 829
},
{
"epoch": 1.0466582597730139,
"grad_norm": 4.8491106033325195,
"learning_rate": 1.7185929648241205e-06,
"loss": 1.364,
"mean_token_accuracy": 0.6204774081707001,
"num_tokens": 1604765.0,
"step": 830
},
{
"epoch": 1.0479192938209332,
"grad_norm": 4.595874309539795,
"learning_rate": 1.7085427135678393e-06,
"loss": 1.4007,
"mean_token_accuracy": 0.6227355897426605,
"num_tokens": 1606739.0,
"step": 831
},
{
"epoch": 1.0491803278688525,
"grad_norm": 5.359395980834961,
"learning_rate": 1.6984924623115579e-06,
"loss": 1.5305,
"mean_token_accuracy": 0.5770443975925446,
"num_tokens": 1608545.0,
"step": 832
},
{
"epoch": 1.0504413619167718,
"grad_norm": 4.606545925140381,
"learning_rate": 1.6884422110552764e-06,
"loss": 1.3824,
"mean_token_accuracy": 0.6272301375865936,
"num_tokens": 1610675.0,
"step": 833
},
{
"epoch": 1.051702395964691,
"grad_norm": 4.8444719314575195,
"learning_rate": 1.678391959798995e-06,
"loss": 1.4138,
"mean_token_accuracy": 0.6037014126777649,
"num_tokens": 1612584.0,
"step": 834
},
{
"epoch": 1.0529634300126103,
"grad_norm": 5.316982746124268,
"learning_rate": 1.6683417085427136e-06,
"loss": 1.4045,
"mean_token_accuracy": 0.5941404402256012,
"num_tokens": 1614490.0,
"step": 835
},
{
"epoch": 1.0542244640605296,
"grad_norm": 4.479865550994873,
"learning_rate": 1.6582914572864323e-06,
"loss": 1.1823,
"mean_token_accuracy": 0.657404899597168,
"num_tokens": 1616480.0,
"step": 836
},
{
"epoch": 1.055485498108449,
"grad_norm": 5.054855823516846,
"learning_rate": 1.648241206030151e-06,
"loss": 1.4334,
"mean_token_accuracy": 0.6027690470218658,
"num_tokens": 1618429.0,
"step": 837
},
{
"epoch": 1.0567465321563683,
"grad_norm": 4.986600875854492,
"learning_rate": 1.6381909547738695e-06,
"loss": 1.4179,
"mean_token_accuracy": 0.6209523677825928,
"num_tokens": 1620432.0,
"step": 838
},
{
"epoch": 1.0580075662042876,
"grad_norm": 5.486791610717773,
"learning_rate": 1.628140703517588e-06,
"loss": 1.511,
"mean_token_accuracy": 0.6114962697029114,
"num_tokens": 1622308.0,
"step": 839
},
{
"epoch": 1.0592686002522067,
"grad_norm": 4.979884147644043,
"learning_rate": 1.6180904522613066e-06,
"loss": 1.4186,
"mean_token_accuracy": 0.6098051369190216,
"num_tokens": 1624344.0,
"step": 840
},
{
"epoch": 1.060529634300126,
"grad_norm": 5.088829517364502,
"learning_rate": 1.6080402010050254e-06,
"loss": 1.4596,
"mean_token_accuracy": 0.5979678332805634,
"num_tokens": 1626206.0,
"step": 841
},
{
"epoch": 1.0617906683480454,
"grad_norm": 4.898533344268799,
"learning_rate": 1.597989949748744e-06,
"loss": 1.3975,
"mean_token_accuracy": 0.6214229166507721,
"num_tokens": 1628109.0,
"step": 842
},
{
"epoch": 1.0630517023959647,
"grad_norm": 4.962382793426514,
"learning_rate": 1.5879396984924625e-06,
"loss": 1.3319,
"mean_token_accuracy": 0.6575819253921509,
"num_tokens": 1629969.0,
"step": 843
},
{
"epoch": 1.064312736443884,
"grad_norm": 5.184306621551514,
"learning_rate": 1.577889447236181e-06,
"loss": 1.4438,
"mean_token_accuracy": 0.5859068334102631,
"num_tokens": 1631913.0,
"step": 844
},
{
"epoch": 1.0655737704918034,
"grad_norm": 5.116397857666016,
"learning_rate": 1.5678391959798996e-06,
"loss": 1.3804,
"mean_token_accuracy": 0.6171029508113861,
"num_tokens": 1633748.0,
"step": 845
},
{
"epoch": 1.0668348045397225,
"grad_norm": 5.0195722579956055,
"learning_rate": 1.5577889447236184e-06,
"loss": 1.2668,
"mean_token_accuracy": 0.6535399854183197,
"num_tokens": 1635488.0,
"step": 846
},
{
"epoch": 1.0680958385876418,
"grad_norm": 4.750542640686035,
"learning_rate": 1.547738693467337e-06,
"loss": 1.2745,
"mean_token_accuracy": 0.6426283717155457,
"num_tokens": 1637407.0,
"step": 847
},
{
"epoch": 1.0693568726355611,
"grad_norm": 5.365578651428223,
"learning_rate": 1.5376884422110555e-06,
"loss": 1.5784,
"mean_token_accuracy": 0.5733594000339508,
"num_tokens": 1639147.0,
"step": 848
},
{
"epoch": 1.0706179066834804,
"grad_norm": 5.130395412445068,
"learning_rate": 1.527638190954774e-06,
"loss": 1.4151,
"mean_token_accuracy": 0.621956080198288,
"num_tokens": 1640943.0,
"step": 849
},
{
"epoch": 1.0718789407313998,
"grad_norm": 5.161646842956543,
"learning_rate": 1.5175879396984927e-06,
"loss": 1.3837,
"mean_token_accuracy": 0.6152097582817078,
"num_tokens": 1642812.0,
"step": 850
},
{
"epoch": 1.073139974779319,
"grad_norm": 4.936598777770996,
"learning_rate": 1.507537688442211e-06,
"loss": 1.3275,
"mean_token_accuracy": 0.6371041238307953,
"num_tokens": 1644634.0,
"step": 851
},
{
"epoch": 1.0744010088272384,
"grad_norm": 5.145509243011475,
"learning_rate": 1.4974874371859296e-06,
"loss": 1.5006,
"mean_token_accuracy": 0.5886007249355316,
"num_tokens": 1646412.0,
"step": 852
},
{
"epoch": 1.0756620428751575,
"grad_norm": 4.668916702270508,
"learning_rate": 1.4874371859296483e-06,
"loss": 1.3728,
"mean_token_accuracy": 0.6374690234661102,
"num_tokens": 1648357.0,
"step": 853
},
{
"epoch": 1.0769230769230769,
"grad_norm": 5.067697525024414,
"learning_rate": 1.477386934673367e-06,
"loss": 1.3785,
"mean_token_accuracy": 0.616924911737442,
"num_tokens": 1650296.0,
"step": 854
},
{
"epoch": 1.0781841109709962,
"grad_norm": 4.902146816253662,
"learning_rate": 1.4673366834170855e-06,
"loss": 1.5745,
"mean_token_accuracy": 0.5848622024059296,
"num_tokens": 1652305.0,
"step": 855
},
{
"epoch": 1.0794451450189155,
"grad_norm": 4.2945942878723145,
"learning_rate": 1.457286432160804e-06,
"loss": 1.2442,
"mean_token_accuracy": 0.6502452194690704,
"num_tokens": 1654462.0,
"step": 856
},
{
"epoch": 1.0807061790668349,
"grad_norm": 5.339733123779297,
"learning_rate": 1.4472361809045226e-06,
"loss": 1.509,
"mean_token_accuracy": 0.5843194723129272,
"num_tokens": 1656294.0,
"step": 857
},
{
"epoch": 1.0819672131147542,
"grad_norm": 4.766898155212402,
"learning_rate": 1.4371859296482414e-06,
"loss": 1.309,
"mean_token_accuracy": 0.6335008442401886,
"num_tokens": 1658248.0,
"step": 858
},
{
"epoch": 1.0832282471626733,
"grad_norm": 4.7863359451293945,
"learning_rate": 1.42713567839196e-06,
"loss": 1.3785,
"mean_token_accuracy": 0.6025950014591217,
"num_tokens": 1660239.0,
"step": 859
},
{
"epoch": 1.0844892812105926,
"grad_norm": 4.966418743133545,
"learning_rate": 1.4170854271356785e-06,
"loss": 1.5104,
"mean_token_accuracy": 0.5946759879589081,
"num_tokens": 1662327.0,
"step": 860
},
{
"epoch": 1.085750315258512,
"grad_norm": 4.864983081817627,
"learning_rate": 1.407035175879397e-06,
"loss": 1.5377,
"mean_token_accuracy": 0.6015218496322632,
"num_tokens": 1664343.0,
"step": 861
},
{
"epoch": 1.0870113493064313,
"grad_norm": 4.3726654052734375,
"learning_rate": 1.3969849246231156e-06,
"loss": 1.2715,
"mean_token_accuracy": 0.660163015127182,
"num_tokens": 1666409.0,
"step": 862
},
{
"epoch": 1.0882723833543506,
"grad_norm": 5.346407890319824,
"learning_rate": 1.3869346733668342e-06,
"loss": 1.5033,
"mean_token_accuracy": 0.5938428044319153,
"num_tokens": 1668169.0,
"step": 863
},
{
"epoch": 1.08953341740227,
"grad_norm": 5.046968936920166,
"learning_rate": 1.376884422110553e-06,
"loss": 1.4499,
"mean_token_accuracy": 0.6061795651912689,
"num_tokens": 1670117.0,
"step": 864
},
{
"epoch": 1.090794451450189,
"grad_norm": 5.136297702789307,
"learning_rate": 1.3668341708542715e-06,
"loss": 1.3553,
"mean_token_accuracy": 0.6270474493503571,
"num_tokens": 1672035.0,
"step": 865
},
{
"epoch": 1.0920554854981084,
"grad_norm": 4.889704704284668,
"learning_rate": 1.35678391959799e-06,
"loss": 1.4321,
"mean_token_accuracy": 0.6256860196590424,
"num_tokens": 1673881.0,
"step": 866
},
{
"epoch": 1.0933165195460277,
"grad_norm": 4.772144794464111,
"learning_rate": 1.3467336683417087e-06,
"loss": 1.2987,
"mean_token_accuracy": 0.6305316090583801,
"num_tokens": 1675786.0,
"step": 867
},
{
"epoch": 1.094577553593947,
"grad_norm": 4.752320289611816,
"learning_rate": 1.3366834170854272e-06,
"loss": 1.3301,
"mean_token_accuracy": 0.6470165550708771,
"num_tokens": 1677834.0,
"step": 868
},
{
"epoch": 1.0958385876418664,
"grad_norm": 5.081295013427734,
"learning_rate": 1.326633165829146e-06,
"loss": 1.3346,
"mean_token_accuracy": 0.6263269782066345,
"num_tokens": 1679707.0,
"step": 869
},
{
"epoch": 1.0970996216897857,
"grad_norm": 4.907476902008057,
"learning_rate": 1.3165829145728646e-06,
"loss": 1.3672,
"mean_token_accuracy": 0.6208414137363434,
"num_tokens": 1681629.0,
"step": 870
},
{
"epoch": 1.098360655737705,
"grad_norm": 4.639259338378906,
"learning_rate": 1.3065326633165831e-06,
"loss": 1.295,
"mean_token_accuracy": 0.6292709112167358,
"num_tokens": 1683723.0,
"step": 871
},
{
"epoch": 1.0996216897856241,
"grad_norm": 5.0603461265563965,
"learning_rate": 1.2964824120603017e-06,
"loss": 1.3148,
"mean_token_accuracy": 0.6325615048408508,
"num_tokens": 1685518.0,
"step": 872
},
{
"epoch": 1.1008827238335435,
"grad_norm": 4.925965785980225,
"learning_rate": 1.2864321608040203e-06,
"loss": 1.4408,
"mean_token_accuracy": 0.6115374267101288,
"num_tokens": 1687519.0,
"step": 873
},
{
"epoch": 1.1021437578814628,
"grad_norm": 4.810086250305176,
"learning_rate": 1.2763819095477386e-06,
"loss": 1.2673,
"mean_token_accuracy": 0.642699807882309,
"num_tokens": 1689596.0,
"step": 874
},
{
"epoch": 1.1034047919293821,
"grad_norm": 5.053249359130859,
"learning_rate": 1.2663316582914572e-06,
"loss": 1.3439,
"mean_token_accuracy": 0.6248357594013214,
"num_tokens": 1691524.0,
"step": 875
},
{
"epoch": 1.1046658259773015,
"grad_norm": 4.817336559295654,
"learning_rate": 1.256281407035176e-06,
"loss": 1.4219,
"mean_token_accuracy": 0.62397900223732,
"num_tokens": 1693448.0,
"step": 876
},
{
"epoch": 1.1059268600252208,
"grad_norm": 5.001347541809082,
"learning_rate": 1.2462311557788945e-06,
"loss": 1.5118,
"mean_token_accuracy": 0.5976948142051697,
"num_tokens": 1695362.0,
"step": 877
},
{
"epoch": 1.1071878940731399,
"grad_norm": 5.020344257354736,
"learning_rate": 1.2361809045226133e-06,
"loss": 1.4163,
"mean_token_accuracy": 0.608789473772049,
"num_tokens": 1697349.0,
"step": 878
},
{
"epoch": 1.1084489281210592,
"grad_norm": 4.469849586486816,
"learning_rate": 1.2261306532663318e-06,
"loss": 1.2971,
"mean_token_accuracy": 0.6297315359115601,
"num_tokens": 1699484.0,
"step": 879
},
{
"epoch": 1.1097099621689785,
"grad_norm": 5.064508438110352,
"learning_rate": 1.2160804020100502e-06,
"loss": 1.3802,
"mean_token_accuracy": 0.6266240477561951,
"num_tokens": 1701370.0,
"step": 880
},
{
"epoch": 1.1109709962168979,
"grad_norm": 5.334554672241211,
"learning_rate": 1.206030150753769e-06,
"loss": 1.5268,
"mean_token_accuracy": 0.590501219034195,
"num_tokens": 1703236.0,
"step": 881
},
{
"epoch": 1.1122320302648172,
"grad_norm": 4.9287590980529785,
"learning_rate": 1.1959798994974875e-06,
"loss": 1.3817,
"mean_token_accuracy": 0.6426202356815338,
"num_tokens": 1705181.0,
"step": 882
},
{
"epoch": 1.1134930643127365,
"grad_norm": 4.90025520324707,
"learning_rate": 1.185929648241206e-06,
"loss": 1.3757,
"mean_token_accuracy": 0.6233894228935242,
"num_tokens": 1707138.0,
"step": 883
},
{
"epoch": 1.1147540983606556,
"grad_norm": 4.745693683624268,
"learning_rate": 1.1758793969849247e-06,
"loss": 1.3317,
"mean_token_accuracy": 0.6354628205299377,
"num_tokens": 1709038.0,
"step": 884
},
{
"epoch": 1.116015132408575,
"grad_norm": 4.981682300567627,
"learning_rate": 1.1658291457286432e-06,
"loss": 1.3259,
"mean_token_accuracy": 0.6297134160995483,
"num_tokens": 1710879.0,
"step": 885
},
{
"epoch": 1.1172761664564943,
"grad_norm": 5.087985038757324,
"learning_rate": 1.155778894472362e-06,
"loss": 1.4851,
"mean_token_accuracy": 0.5866974294185638,
"num_tokens": 1712758.0,
"step": 886
},
{
"epoch": 1.1185372005044136,
"grad_norm": 4.942793846130371,
"learning_rate": 1.1457286432160806e-06,
"loss": 1.4352,
"mean_token_accuracy": 0.5849944055080414,
"num_tokens": 1714695.0,
"step": 887
},
{
"epoch": 1.119798234552333,
"grad_norm": 4.774109840393066,
"learning_rate": 1.1356783919597991e-06,
"loss": 1.4464,
"mean_token_accuracy": 0.6079711019992828,
"num_tokens": 1716756.0,
"step": 888
},
{
"epoch": 1.1210592686002523,
"grad_norm": 4.673050880432129,
"learning_rate": 1.1256281407035177e-06,
"loss": 1.2345,
"mean_token_accuracy": 0.6406926512718201,
"num_tokens": 1718637.0,
"step": 889
},
{
"epoch": 1.1223203026481714,
"grad_norm": 4.597579479217529,
"learning_rate": 1.1155778894472363e-06,
"loss": 1.2355,
"mean_token_accuracy": 0.6603090167045593,
"num_tokens": 1720597.0,
"step": 890
},
{
"epoch": 1.1235813366960907,
"grad_norm": 5.344587802886963,
"learning_rate": 1.105527638190955e-06,
"loss": 1.459,
"mean_token_accuracy": 0.6139001846313477,
"num_tokens": 1722426.0,
"step": 891
},
{
"epoch": 1.12484237074401,
"grad_norm": 5.077244758605957,
"learning_rate": 1.0954773869346734e-06,
"loss": 1.4039,
"mean_token_accuracy": 0.6174876689910889,
"num_tokens": 1724298.0,
"step": 892
},
{
"epoch": 1.1261034047919294,
"grad_norm": 4.649603366851807,
"learning_rate": 1.085427135678392e-06,
"loss": 1.3202,
"mean_token_accuracy": 0.6394246816635132,
"num_tokens": 1726155.0,
"step": 893
},
{
"epoch": 1.1273644388398487,
"grad_norm": 4.656422138214111,
"learning_rate": 1.0753768844221105e-06,
"loss": 1.3173,
"mean_token_accuracy": 0.6460046172142029,
"num_tokens": 1728121.0,
"step": 894
},
{
"epoch": 1.128625472887768,
"grad_norm": 5.245925426483154,
"learning_rate": 1.0653266331658293e-06,
"loss": 1.5032,
"mean_token_accuracy": 0.6162103414535522,
"num_tokens": 1729979.0,
"step": 895
},
{
"epoch": 1.1298865069356872,
"grad_norm": 5.046647548675537,
"learning_rate": 1.0552763819095479e-06,
"loss": 1.5197,
"mean_token_accuracy": 0.5832743048667908,
"num_tokens": 1731906.0,
"step": 896
},
{
"epoch": 1.1311475409836065,
"grad_norm": 5.067801475524902,
"learning_rate": 1.0452261306532664e-06,
"loss": 1.4258,
"mean_token_accuracy": 0.6073438227176666,
"num_tokens": 1733737.0,
"step": 897
},
{
"epoch": 1.1324085750315258,
"grad_norm": 4.663293361663818,
"learning_rate": 1.035175879396985e-06,
"loss": 1.3173,
"mean_token_accuracy": 0.64065220952034,
"num_tokens": 1735752.0,
"step": 898
},
{
"epoch": 1.1336696090794451,
"grad_norm": 5.36110782623291,
"learning_rate": 1.0251256281407035e-06,
"loss": 1.4858,
"mean_token_accuracy": 0.5942147076129913,
"num_tokens": 1737597.0,
"step": 899
},
{
"epoch": 1.1349306431273645,
"grad_norm": 5.239461421966553,
"learning_rate": 1.0150753768844223e-06,
"loss": 1.3483,
"mean_token_accuracy": 0.6205393671989441,
"num_tokens": 1739348.0,
"step": 900
},
{
"epoch": 1.1361916771752838,
"grad_norm": 5.277314186096191,
"learning_rate": 1.0050251256281409e-06,
"loss": 1.3441,
"mean_token_accuracy": 0.6316133439540863,
"num_tokens": 1741230.0,
"step": 901
},
{
"epoch": 1.1374527112232031,
"grad_norm": 5.051476955413818,
"learning_rate": 9.949748743718594e-07,
"loss": 1.4172,
"mean_token_accuracy": 0.6009290218353271,
"num_tokens": 1743140.0,
"step": 902
},
{
"epoch": 1.1387137452711222,
"grad_norm": 4.778693199157715,
"learning_rate": 9.849246231155778e-07,
"loss": 1.3541,
"mean_token_accuracy": 0.6240184009075165,
"num_tokens": 1745096.0,
"step": 903
},
{
"epoch": 1.1399747793190416,
"grad_norm": 5.342038631439209,
"learning_rate": 9.748743718592966e-07,
"loss": 1.3844,
"mean_token_accuracy": 0.6123272478580475,
"num_tokens": 1746956.0,
"step": 904
},
{
"epoch": 1.141235813366961,
"grad_norm": 5.211719989776611,
"learning_rate": 9.648241206030151e-07,
"loss": 1.4345,
"mean_token_accuracy": 0.6103442311286926,
"num_tokens": 1748802.0,
"step": 905
},
{
"epoch": 1.1424968474148802,
"grad_norm": 4.845167636871338,
"learning_rate": 9.547738693467337e-07,
"loss": 1.3347,
"mean_token_accuracy": 0.6275226771831512,
"num_tokens": 1750816.0,
"step": 906
},
{
"epoch": 1.1437578814627996,
"grad_norm": 4.847137451171875,
"learning_rate": 9.447236180904524e-07,
"loss": 1.3872,
"mean_token_accuracy": 0.61635622382164,
"num_tokens": 1752796.0,
"step": 907
},
{
"epoch": 1.1450189155107189,
"grad_norm": 5.0400166511535645,
"learning_rate": 9.346733668341709e-07,
"loss": 1.449,
"mean_token_accuracy": 0.601254791021347,
"num_tokens": 1754805.0,
"step": 908
},
{
"epoch": 1.146279949558638,
"grad_norm": 4.840874195098877,
"learning_rate": 9.246231155778895e-07,
"loss": 1.3805,
"mean_token_accuracy": 0.6148854196071625,
"num_tokens": 1756877.0,
"step": 909
},
{
"epoch": 1.1475409836065573,
"grad_norm": 5.118767738342285,
"learning_rate": 9.145728643216082e-07,
"loss": 1.4999,
"mean_token_accuracy": 0.5984103679656982,
"num_tokens": 1758852.0,
"step": 910
},
{
"epoch": 1.1488020176544766,
"grad_norm": 4.932574272155762,
"learning_rate": 9.045226130653267e-07,
"loss": 1.4006,
"mean_token_accuracy": 0.6041274070739746,
"num_tokens": 1760856.0,
"step": 911
},
{
"epoch": 1.150063051702396,
"grad_norm": 4.902439117431641,
"learning_rate": 8.944723618090453e-07,
"loss": 1.2811,
"mean_token_accuracy": 0.6541339159011841,
"num_tokens": 1762797.0,
"step": 912
},
{
"epoch": 1.1513240857503153,
"grad_norm": 5.102466106414795,
"learning_rate": 8.84422110552764e-07,
"loss": 1.4599,
"mean_token_accuracy": 0.5948452055454254,
"num_tokens": 1764710.0,
"step": 913
},
{
"epoch": 1.1525851197982346,
"grad_norm": 5.040976047515869,
"learning_rate": 8.743718592964825e-07,
"loss": 1.4398,
"mean_token_accuracy": 0.6058560907840729,
"num_tokens": 1766649.0,
"step": 914
},
{
"epoch": 1.1538461538461537,
"grad_norm": 4.775660514831543,
"learning_rate": 8.64321608040201e-07,
"loss": 1.2688,
"mean_token_accuracy": 0.6359184980392456,
"num_tokens": 1768822.0,
"step": 915
},
{
"epoch": 1.155107187894073,
"grad_norm": 5.1789703369140625,
"learning_rate": 8.542713567839197e-07,
"loss": 1.4468,
"mean_token_accuracy": 0.5935891568660736,
"num_tokens": 1770754.0,
"step": 916
},
{
"epoch": 1.1563682219419924,
"grad_norm": 4.609385967254639,
"learning_rate": 8.442211055276382e-07,
"loss": 1.2928,
"mean_token_accuracy": 0.6313645839691162,
"num_tokens": 1772740.0,
"step": 917
},
{
"epoch": 1.1576292559899117,
"grad_norm": 5.162957668304443,
"learning_rate": 8.341708542713568e-07,
"loss": 1.3787,
"mean_token_accuracy": 0.6098227500915527,
"num_tokens": 1774706.0,
"step": 918
},
{
"epoch": 1.158890290037831,
"grad_norm": 4.727216720581055,
"learning_rate": 8.241206030150755e-07,
"loss": 1.3411,
"mean_token_accuracy": 0.6247797906398773,
"num_tokens": 1776761.0,
"step": 919
},
{
"epoch": 1.1601513240857504,
"grad_norm": 5.466963768005371,
"learning_rate": 8.14070351758794e-07,
"loss": 1.5878,
"mean_token_accuracy": 0.5861708521842957,
"num_tokens": 1778643.0,
"step": 920
},
{
"epoch": 1.1614123581336697,
"grad_norm": 5.906630516052246,
"learning_rate": 8.040201005025127e-07,
"loss": 1.628,
"mean_token_accuracy": 0.5795406103134155,
"num_tokens": 1780398.0,
"step": 921
},
{
"epoch": 1.1626733921815888,
"grad_norm": 4.632744789123535,
"learning_rate": 7.939698492462313e-07,
"loss": 1.3589,
"mean_token_accuracy": 0.6283348500728607,
"num_tokens": 1782437.0,
"step": 922
},
{
"epoch": 1.1639344262295082,
"grad_norm": 5.072994232177734,
"learning_rate": 7.839195979899498e-07,
"loss": 1.3792,
"mean_token_accuracy": 0.6125733852386475,
"num_tokens": 1784407.0,
"step": 923
},
{
"epoch": 1.1651954602774275,
"grad_norm": 5.171667575836182,
"learning_rate": 7.738693467336685e-07,
"loss": 1.4334,
"mean_token_accuracy": 0.617727667093277,
"num_tokens": 1786360.0,
"step": 924
},
{
"epoch": 1.1664564943253468,
"grad_norm": 5.152869701385498,
"learning_rate": 7.63819095477387e-07,
"loss": 1.4553,
"mean_token_accuracy": 0.5993034839630127,
"num_tokens": 1788397.0,
"step": 925
},
{
"epoch": 1.1677175283732661,
"grad_norm": 4.626251697540283,
"learning_rate": 7.537688442211055e-07,
"loss": 1.2551,
"mean_token_accuracy": 0.6582286357879639,
"num_tokens": 1790522.0,
"step": 926
},
{
"epoch": 1.1689785624211853,
"grad_norm": 4.900967121124268,
"learning_rate": 7.437185929648242e-07,
"loss": 1.286,
"mean_token_accuracy": 0.6288881003856659,
"num_tokens": 1792448.0,
"step": 927
},
{
"epoch": 1.1702395964691046,
"grad_norm": 4.809275150299072,
"learning_rate": 7.336683417085427e-07,
"loss": 1.3884,
"mean_token_accuracy": 0.6122523248195648,
"num_tokens": 1794566.0,
"step": 928
},
{
"epoch": 1.171500630517024,
"grad_norm": 5.039896488189697,
"learning_rate": 7.236180904522613e-07,
"loss": 1.3066,
"mean_token_accuracy": 0.6312415897846222,
"num_tokens": 1796559.0,
"step": 929
},
{
"epoch": 1.1727616645649432,
"grad_norm": 5.1657538414001465,
"learning_rate": 7.1356783919598e-07,
"loss": 1.5675,
"mean_token_accuracy": 0.5969944596290588,
"num_tokens": 1798499.0,
"step": 930
},
{
"epoch": 1.1740226986128626,
"grad_norm": 4.674160003662109,
"learning_rate": 7.035175879396985e-07,
"loss": 1.2363,
"mean_token_accuracy": 0.6646978855133057,
"num_tokens": 1800436.0,
"step": 931
},
{
"epoch": 1.175283732660782,
"grad_norm": 4.792122840881348,
"learning_rate": 6.934673366834171e-07,
"loss": 1.3855,
"mean_token_accuracy": 0.6197312772274017,
"num_tokens": 1802460.0,
"step": 932
},
{
"epoch": 1.1765447667087012,
"grad_norm": 5.117883682250977,
"learning_rate": 6.834170854271358e-07,
"loss": 1.3949,
"mean_token_accuracy": 0.5948143601417542,
"num_tokens": 1804295.0,
"step": 933
},
{
"epoch": 1.1778058007566203,
"grad_norm": 5.0117387771606445,
"learning_rate": 6.733668341708543e-07,
"loss": 1.5329,
"mean_token_accuracy": 0.5969884097576141,
"num_tokens": 1806314.0,
"step": 934
},
{
"epoch": 1.1790668348045397,
"grad_norm": 4.953305721282959,
"learning_rate": 6.63316582914573e-07,
"loss": 1.3986,
"mean_token_accuracy": 0.6202296614646912,
"num_tokens": 1808343.0,
"step": 935
},
{
"epoch": 1.180327868852459,
"grad_norm": 5.259175777435303,
"learning_rate": 6.532663316582916e-07,
"loss": 1.4476,
"mean_token_accuracy": 0.6117253601551056,
"num_tokens": 1810135.0,
"step": 936
},
{
"epoch": 1.1815889029003783,
"grad_norm": 4.881397724151611,
"learning_rate": 6.432160804020101e-07,
"loss": 1.186,
"mean_token_accuracy": 0.6482206284999847,
"num_tokens": 1812003.0,
"step": 937
},
{
"epoch": 1.1828499369482977,
"grad_norm": 5.129254341125488,
"learning_rate": 6.331658291457286e-07,
"loss": 1.4479,
"mean_token_accuracy": 0.6047539412975311,
"num_tokens": 1813913.0,
"step": 938
},
{
"epoch": 1.184110970996217,
"grad_norm": 5.123323917388916,
"learning_rate": 6.231155778894473e-07,
"loss": 1.4659,
"mean_token_accuracy": 0.6227533519268036,
"num_tokens": 1815758.0,
"step": 939
},
{
"epoch": 1.1853720050441363,
"grad_norm": 5.076217174530029,
"learning_rate": 6.130653266331659e-07,
"loss": 1.2311,
"mean_token_accuracy": 0.6610271036624908,
"num_tokens": 1817639.0,
"step": 940
},
{
"epoch": 1.1866330390920554,
"grad_norm": 5.267666816711426,
"learning_rate": 6.030150753768845e-07,
"loss": 1.4227,
"mean_token_accuracy": 0.6167608201503754,
"num_tokens": 1819582.0,
"step": 941
},
{
"epoch": 1.1878940731399747,
"grad_norm": 5.258944034576416,
"learning_rate": 5.92964824120603e-07,
"loss": 1.4665,
"mean_token_accuracy": 0.5967157781124115,
"num_tokens": 1821342.0,
"step": 942
},
{
"epoch": 1.189155107187894,
"grad_norm": 5.271570682525635,
"learning_rate": 5.829145728643216e-07,
"loss": 1.3629,
"mean_token_accuracy": 0.6277696192264557,
"num_tokens": 1823287.0,
"step": 943
},
{
"epoch": 1.1904161412358134,
"grad_norm": 5.216139793395996,
"learning_rate": 5.728643216080403e-07,
"loss": 1.4321,
"mean_token_accuracy": 0.6034521460533142,
"num_tokens": 1825173.0,
"step": 944
},
{
"epoch": 1.1916771752837327,
"grad_norm": 4.663553714752197,
"learning_rate": 5.628140703517588e-07,
"loss": 1.3356,
"mean_token_accuracy": 0.6389399468898773,
"num_tokens": 1827152.0,
"step": 945
},
{
"epoch": 1.1929382093316518,
"grad_norm": 5.13004732131958,
"learning_rate": 5.527638190954775e-07,
"loss": 1.4534,
"mean_token_accuracy": 0.6107485592365265,
"num_tokens": 1829095.0,
"step": 946
},
{
"epoch": 1.1941992433795712,
"grad_norm": 5.074164390563965,
"learning_rate": 5.42713567839196e-07,
"loss": 1.4438,
"mean_token_accuracy": 0.6243781745433807,
"num_tokens": 1830919.0,
"step": 947
},
{
"epoch": 1.1954602774274905,
"grad_norm": 5.084892272949219,
"learning_rate": 5.326633165829146e-07,
"loss": 1.3777,
"mean_token_accuracy": 0.6200196444988251,
"num_tokens": 1832886.0,
"step": 948
},
{
"epoch": 1.1967213114754098,
"grad_norm": 5.34892463684082,
"learning_rate": 5.226130653266332e-07,
"loss": 1.5094,
"mean_token_accuracy": 0.586660623550415,
"num_tokens": 1834806.0,
"step": 949
},
{
"epoch": 1.1979823455233292,
"grad_norm": 5.087215423583984,
"learning_rate": 5.125628140703518e-07,
"loss": 1.3344,
"mean_token_accuracy": 0.6292024254798889,
"num_tokens": 1836645.0,
"step": 950
},
{
"epoch": 1.1992433795712485,
"grad_norm": 5.228329181671143,
"learning_rate": 5.025125628140704e-07,
"loss": 1.466,
"mean_token_accuracy": 0.5961521863937378,
"num_tokens": 1838667.0,
"step": 951
},
{
"epoch": 1.2005044136191678,
"grad_norm": 5.482524871826172,
"learning_rate": 4.924623115577889e-07,
"loss": 1.4509,
"mean_token_accuracy": 0.59737628698349,
"num_tokens": 1840592.0,
"step": 952
},
{
"epoch": 1.201765447667087,
"grad_norm": 5.0047407150268555,
"learning_rate": 4.824120603015076e-07,
"loss": 1.5227,
"mean_token_accuracy": 0.589676022529602,
"num_tokens": 1842572.0,
"step": 953
},
{
"epoch": 1.2030264817150063,
"grad_norm": 5.240270614624023,
"learning_rate": 4.723618090452262e-07,
"loss": 1.4232,
"mean_token_accuracy": 0.602595180273056,
"num_tokens": 1844510.0,
"step": 954
},
{
"epoch": 1.2042875157629256,
"grad_norm": 5.139934539794922,
"learning_rate": 4.6231155778894475e-07,
"loss": 1.3838,
"mean_token_accuracy": 0.6217158138751984,
"num_tokens": 1846434.0,
"step": 955
},
{
"epoch": 1.205548549810845,
"grad_norm": 5.630468845367432,
"learning_rate": 4.5226130653266337e-07,
"loss": 1.4834,
"mean_token_accuracy": 0.5865810513496399,
"num_tokens": 1848310.0,
"step": 956
},
{
"epoch": 1.2068095838587642,
"grad_norm": 4.692196369171143,
"learning_rate": 4.42211055276382e-07,
"loss": 1.3122,
"mean_token_accuracy": 0.6437986791133881,
"num_tokens": 1850320.0,
"step": 957
},
{
"epoch": 1.2080706179066836,
"grad_norm": 4.836750507354736,
"learning_rate": 4.321608040201005e-07,
"loss": 1.4254,
"mean_token_accuracy": 0.6103801727294922,
"num_tokens": 1852240.0,
"step": 958
},
{
"epoch": 1.2093316519546027,
"grad_norm": 5.327441215515137,
"learning_rate": 4.221105527638191e-07,
"loss": 1.5109,
"mean_token_accuracy": 0.5911853909492493,
"num_tokens": 1854188.0,
"step": 959
},
{
"epoch": 1.210592686002522,
"grad_norm": 4.50408935546875,
"learning_rate": 4.1206030150753773e-07,
"loss": 1.3162,
"mean_token_accuracy": 0.6251911222934723,
"num_tokens": 1856242.0,
"step": 960
},
{
"epoch": 1.2118537200504413,
"grad_norm": 4.922251224517822,
"learning_rate": 4.0201005025125634e-07,
"loss": 1.4668,
"mean_token_accuracy": 0.6030009984970093,
"num_tokens": 1858306.0,
"step": 961
},
{
"epoch": 1.2131147540983607,
"grad_norm": 5.415045738220215,
"learning_rate": 3.919597989949749e-07,
"loss": 1.4913,
"mean_token_accuracy": 0.5978050827980042,
"num_tokens": 1860313.0,
"step": 962
},
{
"epoch": 1.21437578814628,
"grad_norm": 5.077240943908691,
"learning_rate": 3.819095477386935e-07,
"loss": 1.3712,
"mean_token_accuracy": 0.6228812038898468,
"num_tokens": 1862220.0,
"step": 963
},
{
"epoch": 1.2156368221941993,
"grad_norm": 5.192144870758057,
"learning_rate": 3.718592964824121e-07,
"loss": 1.3479,
"mean_token_accuracy": 0.625325471162796,
"num_tokens": 1864092.0,
"step": 964
},
{
"epoch": 1.2168978562421184,
"grad_norm": 4.991547584533691,
"learning_rate": 3.6180904522613065e-07,
"loss": 1.333,
"mean_token_accuracy": 0.6345547437667847,
"num_tokens": 1866153.0,
"step": 965
},
{
"epoch": 1.2181588902900378,
"grad_norm": 5.291337490081787,
"learning_rate": 3.5175879396984927e-07,
"loss": 1.3972,
"mean_token_accuracy": 0.6194877922534943,
"num_tokens": 1868053.0,
"step": 966
},
{
"epoch": 1.219419924337957,
"grad_norm": 5.237453937530518,
"learning_rate": 3.417085427135679e-07,
"loss": 1.311,
"mean_token_accuracy": 0.6315875053405762,
"num_tokens": 1869990.0,
"step": 967
},
{
"epoch": 1.2206809583858764,
"grad_norm": 5.804203987121582,
"learning_rate": 3.316582914572865e-07,
"loss": 1.4085,
"mean_token_accuracy": 0.6040690243244171,
"num_tokens": 1871716.0,
"step": 968
},
{
"epoch": 1.2219419924337958,
"grad_norm": 5.937376022338867,
"learning_rate": 3.2160804020100506e-07,
"loss": 1.5021,
"mean_token_accuracy": 0.6007244288921356,
"num_tokens": 1873622.0,
"step": 969
},
{
"epoch": 1.223203026481715,
"grad_norm": 5.428589344024658,
"learning_rate": 3.1155778894472363e-07,
"loss": 1.4529,
"mean_token_accuracy": 0.6070205569267273,
"num_tokens": 1875500.0,
"step": 970
},
{
"epoch": 1.2244640605296344,
"grad_norm": 5.018257141113281,
"learning_rate": 3.0150753768844224e-07,
"loss": 1.2837,
"mean_token_accuracy": 0.642302542924881,
"num_tokens": 1877351.0,
"step": 971
},
{
"epoch": 1.2257250945775535,
"grad_norm": 4.901374816894531,
"learning_rate": 2.914572864321608e-07,
"loss": 1.3173,
"mean_token_accuracy": 0.6497815549373627,
"num_tokens": 1879291.0,
"step": 972
},
{
"epoch": 1.2269861286254728,
"grad_norm": 4.900482654571533,
"learning_rate": 2.814070351758794e-07,
"loss": 1.3306,
"mean_token_accuracy": 0.6035780906677246,
"num_tokens": 1881288.0,
"step": 973
},
{
"epoch": 1.2282471626733922,
"grad_norm": 4.8116774559021,
"learning_rate": 2.71356783919598e-07,
"loss": 1.2566,
"mean_token_accuracy": 0.6475660502910614,
"num_tokens": 1883149.0,
"step": 974
},
{
"epoch": 1.2295081967213115,
"grad_norm": 4.605260848999023,
"learning_rate": 2.613065326633166e-07,
"loss": 1.414,
"mean_token_accuracy": 0.621475875377655,
"num_tokens": 1885183.0,
"step": 975
},
{
"epoch": 1.2307692307692308,
"grad_norm": 4.634363651275635,
"learning_rate": 2.512562814070352e-07,
"loss": 1.3724,
"mean_token_accuracy": 0.6205690801143646,
"num_tokens": 1887171.0,
"step": 976
},
{
"epoch": 1.23203026481715,
"grad_norm": 5.679710865020752,
"learning_rate": 2.412060301507538e-07,
"loss": 1.5362,
"mean_token_accuracy": 0.5842530131340027,
"num_tokens": 1889014.0,
"step": 977
},
{
"epoch": 1.2332912988650693,
"grad_norm": 5.509294509887695,
"learning_rate": 2.3115577889447237e-07,
"loss": 1.4568,
"mean_token_accuracy": 0.6119529604911804,
"num_tokens": 1890746.0,
"step": 978
},
{
"epoch": 1.2345523329129886,
"grad_norm": 4.818939208984375,
"learning_rate": 2.21105527638191e-07,
"loss": 1.212,
"mean_token_accuracy": 0.6457622051239014,
"num_tokens": 1892570.0,
"step": 979
},
{
"epoch": 1.235813366960908,
"grad_norm": 5.132483959197998,
"learning_rate": 2.1105527638190956e-07,
"loss": 1.4684,
"mean_token_accuracy": 0.6010770797729492,
"num_tokens": 1894591.0,
"step": 980
},
{
"epoch": 1.2370744010088273,
"grad_norm": 5.0741167068481445,
"learning_rate": 2.0100502512562817e-07,
"loss": 1.4341,
"mean_token_accuracy": 0.6012249886989594,
"num_tokens": 1896483.0,
"step": 981
},
{
"epoch": 1.2383354350567466,
"grad_norm": 5.2892746925354,
"learning_rate": 1.9095477386934676e-07,
"loss": 1.4243,
"mean_token_accuracy": 0.5856616497039795,
"num_tokens": 1898341.0,
"step": 982
},
{
"epoch": 1.239596469104666,
"grad_norm": 5.422547817230225,
"learning_rate": 1.8090452261306533e-07,
"loss": 1.4239,
"mean_token_accuracy": 0.6171863377094269,
"num_tokens": 1900203.0,
"step": 983
},
{
"epoch": 1.240857503152585,
"grad_norm": 5.478043079376221,
"learning_rate": 1.7085427135678394e-07,
"loss": 1.5671,
"mean_token_accuracy": 0.5849802196025848,
"num_tokens": 1902029.0,
"step": 984
},
{
"epoch": 1.2421185372005044,
"grad_norm": 5.046949863433838,
"learning_rate": 1.6080402010050253e-07,
"loss": 1.4521,
"mean_token_accuracy": 0.6011047959327698,
"num_tokens": 1903898.0,
"step": 985
},
{
"epoch": 1.2433795712484237,
"grad_norm": 4.796839237213135,
"learning_rate": 1.5075376884422112e-07,
"loss": 1.3994,
"mean_token_accuracy": 0.6067076921463013,
"num_tokens": 1905807.0,
"step": 986
},
{
"epoch": 1.244640605296343,
"grad_norm": 5.78264045715332,
"learning_rate": 1.407035175879397e-07,
"loss": 1.71,
"mean_token_accuracy": 0.5604110062122345,
"num_tokens": 1907818.0,
"step": 987
},
{
"epoch": 1.2459016393442623,
"grad_norm": 4.714611530303955,
"learning_rate": 1.306532663316583e-07,
"loss": 1.315,
"mean_token_accuracy": 0.6363774240016937,
"num_tokens": 1909809.0,
"step": 988
},
{
"epoch": 1.2471626733921817,
"grad_norm": 5.528034210205078,
"learning_rate": 1.206030150753769e-07,
"loss": 1.5308,
"mean_token_accuracy": 0.5902082026004791,
"num_tokens": 1911626.0,
"step": 989
},
{
"epoch": 1.248423707440101,
"grad_norm": 5.443125247955322,
"learning_rate": 1.105527638190955e-07,
"loss": 1.4753,
"mean_token_accuracy": 0.6086395978927612,
"num_tokens": 1913529.0,
"step": 990
},
{
"epoch": 1.2496847414880201,
"grad_norm": 5.464473247528076,
"learning_rate": 1.0050251256281409e-07,
"loss": 1.4429,
"mean_token_accuracy": 0.6182594001293182,
"num_tokens": 1915369.0,
"step": 991
},
{
"epoch": 1.2509457755359394,
"grad_norm": 5.163119316101074,
"learning_rate": 9.045226130653266e-08,
"loss": 1.3722,
"mean_token_accuracy": 0.6328320205211639,
"num_tokens": 1917246.0,
"step": 992
},
{
"epoch": 1.2522068095838588,
"grad_norm": 5.1116743087768555,
"learning_rate": 8.040201005025127e-08,
"loss": 1.4695,
"mean_token_accuracy": 0.6038559377193451,
"num_tokens": 1919135.0,
"step": 993
},
{
"epoch": 1.253467843631778,
"grad_norm": 5.0618462562561035,
"learning_rate": 7.035175879396986e-08,
"loss": 1.3716,
"mean_token_accuracy": 0.5951160490512848,
"num_tokens": 1920964.0,
"step": 994
},
{
"epoch": 1.2547288776796974,
"grad_norm": 4.8035759925842285,
"learning_rate": 6.030150753768845e-08,
"loss": 1.3621,
"mean_token_accuracy": 0.6313849091529846,
"num_tokens": 1922859.0,
"step": 995
},
{
"epoch": 1.2559899117276165,
"grad_norm": 4.914729118347168,
"learning_rate": 5.025125628140704e-08,
"loss": 1.239,
"mean_token_accuracy": 0.6349127888679504,
"num_tokens": 1924713.0,
"step": 996
},
{
"epoch": 1.2572509457755359,
"grad_norm": 4.952082633972168,
"learning_rate": 4.020100502512563e-08,
"loss": 1.3765,
"mean_token_accuracy": 0.6257254481315613,
"num_tokens": 1926640.0,
"step": 997
},
{
"epoch": 1.2585119798234552,
"grad_norm": 5.086965084075928,
"learning_rate": 3.015075376884422e-08,
"loss": 1.4267,
"mean_token_accuracy": 0.6163321137428284,
"num_tokens": 1928611.0,
"step": 998
},
{
"epoch": 1.2597730138713745,
"grad_norm": 5.323322296142578,
"learning_rate": 2.0100502512562817e-08,
"loss": 1.4194,
"mean_token_accuracy": 0.6186275482177734,
"num_tokens": 1930441.0,
"step": 999
},
{
"epoch": 1.2610340479192939,
"grad_norm": 4.54163932800293,
"learning_rate": 1.0050251256281408e-08,
"loss": 1.2804,
"mean_token_accuracy": 0.6409396231174469,
"num_tokens": 1932506.0,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.758847907620454e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}