DiffuCoder-7B-cpGRPO / trainer_state.json
yizheapple's picture
Upload folder using huggingface_hub
36651e2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999711843242724,
"eval_steps": 500,
"global_step": 5204,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 220.77500915527344,
"epoch": 0.00019210450485063874,
"grad_norm": 2.5577025413513184,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.038,
"reward": 0.37062498927116394,
"reward_std": 0.34713491797447205,
"rewards/code_format_reward": 0.26875001192092896,
"rewards/code_reward": 0.11812499910593033,
"step": 1,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.01640424354829722,
"clip_ratio/high_mean": 0.003707133045989192,
"clip_ratio/low_mean": 0.0004983297904901621,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004205462749167863,
"completion_length": 164.34375381469727,
"epoch": 0.0019210450485063874,
"grad_norm": 2.2875964641571045,
"kl": 0.13929970601263145,
"learning_rate": 9.999947520846931e-07,
"loss": 0.0575,
"reward": 0.655464380979538,
"reward_std": 0.6216425597667694,
"rewards/code_format_reward": 0.5078125074505806,
"rewards/code_reward": 0.20077905245125294,
"step": 10,
"zero_std_ratio": 0.125
},
{
"clip_ratio/high_max": 0.04116484243422747,
"clip_ratio/high_mean": 0.007335515914019197,
"clip_ratio/low_mean": 0.00010183055419474841,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007437346538063138,
"completion_length": 100.84750213623047,
"epoch": 0.003842090097012775,
"grad_norm": 2.409867286682129,
"kl": 1.1695969879627228,
"learning_rate": 9.999734326385416e-07,
"loss": -0.0111,
"reward": 0.9829235672950745,
"reward_std": 0.5127422153949738,
"rewards/code_format_reward": 0.84375,
"rewards/code_reward": 0.2805242508649826,
"step": 20,
"zero_std_ratio": 0.075
},
{
"clip_ratio/high_max": 0.039504543878138065,
"clip_ratio/high_mean": 0.0051267803879454735,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0051267803879454735,
"completion_length": 97.57750091552734,
"epoch": 0.005763135145519163,
"grad_norm": 4.608696460723877,
"kl": 2.0701700329780577,
"learning_rate": 9.99935714443203e-07,
"loss": -0.019,
"reward": 1.1568554759025573,
"reward_std": 0.6407819569110871,
"rewards/code_format_reward": 0.8674999952316285,
"rewards/code_reward": 0.3615527212619781,
"step": 30,
"zero_std_ratio": 0.025
},
{
"clip_ratio/high_max": 0.005034898268058896,
"clip_ratio/high_mean": 0.0006811309984186664,
"clip_ratio/low_mean": 0.00013664715661434456,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008177781579433941,
"completion_length": 83.10500030517578,
"epoch": 0.00768418019402555,
"grad_norm": 4.833131313323975,
"kl": 2.2019619703292848,
"learning_rate": 9.99881598873272e-07,
"loss": -0.02,
"reward": 1.1795239448547363,
"reward_std": 0.7194581270217896,
"rewards/code_format_reward": 0.8987499952316285,
"rewards/code_reward": 0.36507447361946105,
"step": 40,
"zero_std_ratio": 0.05
},
{
"clip_ratio/high_max": 0.00872214906848967,
"clip_ratio/high_mean": 0.0010902686335612088,
"clip_ratio/low_mean": 0.00040422612219117584,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014944947557523846,
"completion_length": 88.22500152587891,
"epoch": 0.009605225242531937,
"grad_norm": 2.778585433959961,
"kl": 2.4498987793922424,
"learning_rate": 9.998110879009265e-07,
"loss": -0.0035,
"reward": 1.2663686752319336,
"reward_std": 0.6244019389152526,
"rewards/code_format_reward": 0.918750011920929,
"rewards/code_reward": 0.40349680185317993,
"step": 50,
"zero_std_ratio": 0.075
},
{
"clip_ratio/high_max": 0.016367838624864815,
"clip_ratio/high_mean": 0.002741052128840238,
"clip_ratio/low_mean": 0.0008432979579083621,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003584350028540939,
"completion_length": 91.3375,
"epoch": 0.011526270291038325,
"grad_norm": 2.5201120376586914,
"kl": 2.7947509050369264,
"learning_rate": 9.997241840958557e-07,
"loss": 0.005,
"reward": 1.0697558522224426,
"reward_std": 0.49940577149391174,
"rewards/code_format_reward": 0.9200000047683716,
"rewards/code_reward": 0.30487790107727053,
"step": 60,
"zero_std_ratio": 0.025
},
{
"clip_ratio/high_max": 0.031629907339811324,
"clip_ratio/high_mean": 0.005140213097911328,
"clip_ratio/low_mean": 0.003656612744089216,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008796826144680381,
"completion_length": 84.79750213623046,
"epoch": 0.013447315339544713,
"grad_norm": 7.281564712524414,
"kl": 1.7218781247735024,
"learning_rate": 9.99620890625166e-07,
"loss": -0.0261,
"reward": 1.1421246886253358,
"reward_std": 0.5977877795696258,
"rewards/code_format_reward": 0.9275000095367432,
"rewards/code_reward": 0.33918734490871427,
"step": 70,
"zero_std_ratio": 0.05
},
{
"clip_ratio/high_max": 0.10261552361771464,
"clip_ratio/high_mean": 0.014289343578275293,
"clip_ratio/low_mean": 0.0031720689148642123,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017461412807460875,
"completion_length": 75.67250061035156,
"epoch": 0.0153683603880511,
"grad_norm": 3.359511137008667,
"kl": 0.3687619216740131,
"learning_rate": 9.995012112532654e-07,
"loss": -0.0037,
"reward": 1.2640612244606018,
"reward_std": 0.5189764618873596,
"rewards/code_format_reward": 0.9087499976158142,
"rewards/code_reward": 0.40484309792518614,
"step": 80,
"zero_std_ratio": 0.075
},
{
"clip_ratio/high_max": 0.053048994287382814,
"clip_ratio/high_mean": 0.0092369354548282,
"clip_ratio/low_mean": 0.00010360952001065016,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009340544970473274,
"completion_length": 84.56500091552735,
"epoch": 0.01728940543655749,
"grad_norm": 2.177191734313965,
"kl": 0.5693678379058837,
"learning_rate": 9.993651503417269e-07,
"loss": -0.008,
"reward": 1.1986377000808717,
"reward_std": 0.49277395009994507,
"rewards/code_format_reward": 0.9112500071525573,
"rewards/code_reward": 0.3715063512325287,
"step": 90,
"zero_std_ratio": 0.1
},
{
"clip_ratio/high_max": 0.04136249013245106,
"clip_ratio/high_mean": 0.00551328391302377,
"clip_ratio/low_mean": 0.001408070686738938,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006921354681253433,
"completion_length": 80.8550033569336,
"epoch": 0.019210450485063875,
"grad_norm": 2.0033416748046875,
"kl": 0.8493028253316879,
"learning_rate": 9.992127128491296e-07,
"loss": 0.0027,
"reward": 1.1780336141586303,
"reward_std": 0.4479735493659973,
"rewards/code_format_reward": 0.9275000095367432,
"rewards/code_reward": 0.3571417987346649,
"step": 100,
"zero_std_ratio": 0.125
},
{
"clip_ratio/high_max": 0.0585523322224617,
"clip_ratio/high_mean": 0.008032404945697635,
"clip_ratio/low_mean": 0.006125411042012275,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014157815964426845,
"completion_length": 74.19000091552735,
"epoch": 0.02113149553357026,
"grad_norm": 2.267624855041504,
"kl": 1.1039492040872574,
"learning_rate": 9.990439043308776e-07,
"loss": -0.0238,
"reward": 1.2784739494323731,
"reward_std": 0.49057124853134154,
"rewards/code_format_reward": 0.9475000023841857,
"rewards/code_reward": 0.40236196517944334,
"step": 110,
"zero_std_ratio": 0.175
},
{
"clip_ratio/high_max": 0.07124514738097787,
"clip_ratio/high_mean": 0.01569047374650836,
"clip_ratio/low_mean": 0.0004420768018462695,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01613254987169057,
"completion_length": 68.35750045776368,
"epoch": 0.02305254058207665,
"grad_norm": 4.1564249992370605,
"kl": 1.4338344126939773,
"learning_rate": 9.988587309389975e-07,
"loss": -0.0026,
"reward": 1.1606964468955994,
"reward_std": 0.46601226925849915,
"rewards/code_format_reward": 0.9475000023841857,
"rewards/code_reward": 0.34347322285175325,
"step": 120,
"zero_std_ratio": 0.175
},
{
"clip_ratio/high_max": 0.07592196827754379,
"clip_ratio/high_mean": 0.014454811741597951,
"clip_ratio/low_mean": 0.0013599038298707455,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015814715722808615,
"completion_length": 72.15750122070312,
"epoch": 0.024973585630583037,
"grad_norm": 3.9662575721740723,
"kl": 1.5312897458672523,
"learning_rate": 9.98657199421914e-07,
"loss": -0.0024,
"reward": 1.1610160946846009,
"reward_std": 0.3773229032754898,
"rewards/code_format_reward": 0.9587499976158143,
"rewards/code_reward": 0.3408205330371857,
"step": 130,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.07943324451334774,
"clip_ratio/high_mean": 0.014111382194096222,
"clip_ratio/low_mean": 0.0036704083904623985,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017781790602020918,
"completion_length": 83.8675033569336,
"epoch": 0.026894630679089426,
"grad_norm": 9.37182331085205,
"kl": 0.5293755233287811,
"learning_rate": 9.984393171242054e-07,
"loss": -0.0045,
"reward": 1.3634901762008667,
"reward_std": 0.5678210258483887,
"rewards/code_format_reward": 0.9512500047683716,
"rewards/code_reward": 0.4439325869083405,
"step": 140,
"zero_std_ratio": 0.175
},
{
"clip_ratio/high_max": 0.13982175141572953,
"clip_ratio/high_mean": 0.018845621962100267,
"clip_ratio/low_mean": 0.0009358229042845778,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019781444873660802,
"completion_length": 79.41999969482421,
"epoch": 0.028815675727595812,
"grad_norm": 3.3437957763671875,
"kl": 1.0034890450537204,
"learning_rate": 9.982050919863332e-07,
"loss": -0.0003,
"reward": 1.332119607925415,
"reward_std": 0.4401752531528473,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.4241847813129425,
"step": 150,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.08667803611606359,
"clip_ratio/high_mean": 0.01204416286200285,
"clip_ratio/low_mean": 0.0012475995084969328,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013291762379230932,
"completion_length": 80.43250122070313,
"epoch": 0.0307367207761022,
"grad_norm": 3.763737678527832,
"kl": 0.9024959966540337,
"learning_rate": 9.979545325443564e-07,
"loss": -0.0043,
"reward": 1.3518987059593202,
"reward_std": 0.46767728328704833,
"rewards/code_format_reward": 0.9450000047683715,
"rewards/code_reward": 0.4396993488073349,
"step": 160,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.08449154160916805,
"clip_ratio/high_mean": 0.012011481402441859,
"clip_ratio/low_mean": 0.00172541297506541,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013736894307658076,
"completion_length": 78.61250152587891,
"epoch": 0.03265776582460859,
"grad_norm": 7.203779220581055,
"kl": 0.9198675453662872,
"learning_rate": 9.976876479296167e-07,
"loss": -0.0013,
"reward": 1.3803849458694457,
"reward_std": 0.4038102596998215,
"rewards/code_format_reward": 0.9587499976158143,
"rewards/code_reward": 0.4505049705505371,
"step": 170,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.07188423536717892,
"clip_ratio/high_mean": 0.013125935778953135,
"clip_ratio/low_mean": 0.0036661239922977985,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016792060085572304,
"completion_length": 77.1875,
"epoch": 0.03457881087311498,
"grad_norm": 4.487454414367676,
"kl": 1.7507148087024689,
"learning_rate": 9.974044478684084e-07,
"loss": 0.0129,
"reward": 1.3845421075820923,
"reward_std": 0.5211645245552063,
"rewards/code_format_reward": 0.9325000047683716,
"rewards/code_reward": 0.4591460168361664,
"step": 180,
"zero_std_ratio": 0.175
},
{
"clip_ratio/high_max": 0.03638382372446358,
"clip_ratio/high_mean": 0.005234482995001599,
"clip_ratio/low_mean": 0.0020637288223952057,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007298211794113741,
"completion_length": 72.73499984741211,
"epoch": 0.03649985592162136,
"grad_norm": 1.9644516706466675,
"kl": 1.5947209149599075,
"learning_rate": 9.97104942681622e-07,
"loss": -0.0015,
"reward": 1.5394827842712402,
"reward_std": 0.42243914008140565,
"rewards/code_format_reward": 0.9625,
"rewards/code_reward": 0.5291163563728333,
"step": 190,
"zero_std_ratio": 0.225
},
{
"clip_ratio/high_max": 0.2028519107028842,
"clip_ratio/high_mean": 0.031164265819825232,
"clip_ratio/low_mean": 0.00270410452503711,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03386837020516396,
"completion_length": 69.81000061035157,
"epoch": 0.03842090097012775,
"grad_norm": 3.170088052749634,
"kl": 1.0117133632302284,
"learning_rate": 9.9678914328437e-07,
"loss": 0.0113,
"reward": 1.4108091354370118,
"reward_std": 0.43394198417663576,
"rewards/code_format_reward": 0.9675000071525574,
"rewards/code_reward": 0.46352959871292115,
"step": 200,
"zero_std_ratio": 0.225
},
{
"clip_ratio/high_max": 0.053923821565695106,
"clip_ratio/high_mean": 0.009883182743215002,
"clip_ratio/low_mean": 0.0038779765891376883,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013761159335263073,
"completion_length": 69.04750213623046,
"epoch": 0.04034194601863414,
"grad_norm": 2.5729293823242188,
"kl": 1.1861489608883857,
"learning_rate": 9.964570611855874e-07,
"loss": -0.007,
"reward": 1.4398113250732423,
"reward_std": 0.39351261258125303,
"rewards/code_format_reward": 0.9650000095367431,
"rewards/code_reward": 0.47865564227104185,
"step": 210,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.1575187448877841,
"clip_ratio/high_mean": 0.020484526228392495,
"clip_ratio/low_mean": 0.011988240911159664,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03247276756446808,
"completion_length": 61.67000122070313,
"epoch": 0.04226299106714052,
"grad_norm": 9.919574737548828,
"kl": 3.983895111083984,
"learning_rate": 9.961087084876135e-07,
"loss": 0.0076,
"reward": 1.2202381372451783,
"reward_std": 0.26475468575954436,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.36793155074119566,
"step": 220,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.141914052516222,
"clip_ratio/high_mean": 0.023408634401857854,
"clip_ratio/low_mean": 0.004240041392040439,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027648675863747484,
"completion_length": 67.37250213623047,
"epoch": 0.04418403611564691,
"grad_norm": 106.46134185791016,
"kl": 2.121386268734932,
"learning_rate": 9.957440978857498e-07,
"loss": -0.0021,
"reward": 1.3681801557540894,
"reward_std": 0.37111111879348757,
"rewards/code_format_reward": 0.9675000071525574,
"rewards/code_reward": 0.4422150731086731,
"step": 230,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.07315623210743069,
"clip_ratio/high_mean": 0.01155225959373638,
"clip_ratio/low_mean": 0.005734288269013632,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01728654802427627,
"completion_length": 72.65750198364258,
"epoch": 0.0461050811641533,
"grad_norm": 3.1017534732818604,
"kl": 0.882834991812706,
"learning_rate": 9.953632426677983e-07,
"loss": -0.0093,
"reward": 1.484795618057251,
"reward_std": 0.4526777356863022,
"rewards/code_format_reward": 0.9662500023841858,
"rewards/code_reward": 0.5008352994918823,
"step": 240,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.06419091664720326,
"clip_ratio/high_mean": 0.008793376400717534,
"clip_ratio/low_mean": 0.0021902987034991385,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010983675080933609,
"completion_length": 88.21500091552734,
"epoch": 0.048026126212659684,
"grad_norm": 5.3243279457092285,
"kl": 2.7226425796747207,
"learning_rate": 9.94966156713577e-07,
"loss": -0.0127,
"reward": 1.455380654335022,
"reward_std": 0.4675000965595245,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.4842528164386749,
"step": 250,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.0741606397787109,
"clip_ratio/high_mean": 0.012071207936969586,
"clip_ratio/low_mean": 0.003062122967094183,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015133331064134836,
"completion_length": 88.98250122070313,
"epoch": 0.04994717126116607,
"grad_norm": 2.7804369926452637,
"kl": 0.6169008180499077,
"learning_rate": 9.94552854494413e-07,
"loss": 0.0033,
"reward": 1.427869987487793,
"reward_std": 0.4851543098688126,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.4717474699020386,
"step": 260,
"zero_std_ratio": 0.1
},
{
"clip_ratio/high_max": 0.03937563952058554,
"clip_ratio/high_mean": 0.0065028761862777175,
"clip_ratio/low_mean": 0.004409579199273139,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01091245551360771,
"completion_length": 87.42750091552735,
"epoch": 0.05186821630967246,
"grad_norm": 6.559643745422363,
"kl": 0.4433484449982643,
"learning_rate": 9.941233510726168e-07,
"loss": -0.0018,
"reward": 1.4182387351989747,
"reward_std": 0.4612067699432373,
"rewards/code_format_reward": 0.9412499904632569,
"rewards/code_reward": 0.4738068819046021,
"step": 270,
"zero_std_ratio": 0.175
},
{
"clip_ratio/high_max": 0.057588514033705,
"clip_ratio/high_mean": 0.008462971181143076,
"clip_ratio/low_mean": 0.007865038787713274,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016328010114375503,
"completion_length": 79.69500122070312,
"epoch": 0.05378926135817885,
"grad_norm": 6.077131271362305,
"kl": 0.6961165189743042,
"learning_rate": 9.936776621009322e-07,
"loss": 0.0038,
"reward": 1.5715951919555664,
"reward_std": 0.4179812580347061,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5420475661754608,
"step": 280,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.025062982086092235,
"clip_ratio/high_mean": 0.004761367203900591,
"clip_ratio/low_mean": 0.0028623046877328307,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007623671973124147,
"completion_length": 83.21750183105469,
"epoch": 0.055710306406685235,
"grad_norm": 6.066061019897461,
"kl": 0.7484225794672966,
"learning_rate": 9.932158038219662e-07,
"loss": -0.0052,
"reward": 1.1587857127189636,
"reward_std": 0.39943512678146365,
"rewards/code_format_reward": 0.9637500047683716,
"rewards/code_reward": 0.3384553253650665,
"step": 290,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.10117955654859542,
"clip_ratio/high_mean": 0.013649052195250987,
"clip_ratio/low_mean": 0.0008885912131518126,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014537643361836671,
"completion_length": 84.00750122070312,
"epoch": 0.057631351455191625,
"grad_norm": 3.23887038230896,
"kl": 0.8064253896474838,
"learning_rate": 9.92737793067597e-07,
"loss": -0.0034,
"reward": 1.3393104553222657,
"reward_std": 0.4101540923118591,
"rewards/code_format_reward": 0.9549999952316284,
"rewards/code_reward": 0.43090522289276123,
"step": 300,
"zero_std_ratio": 0.15
},
{
"clip_ratio/high_max": 0.04703736044466496,
"clip_ratio/high_mean": 0.007716302154585719,
"clip_ratio/low_mean": 0.0006432932626921683,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008359595513320528,
"completion_length": 77.70500030517579,
"epoch": 0.059552396503698014,
"grad_norm": 3.357680320739746,
"kl": 0.6727996915578842,
"learning_rate": 9.922436472583614e-07,
"loss": 0.0013,
"reward": 1.6670202493667603,
"reward_std": 0.4320096135139465,
"rewards/code_format_reward": 0.9712500095367431,
"rewards/code_reward": 0.5906976163387299,
"step": 310,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.16380154211074113,
"clip_ratio/high_mean": 0.03262772373855114,
"clip_ratio/low_mean": 0.0011754593724617735,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03380318162962794,
"completion_length": 72.78750152587891,
"epoch": 0.0614734415522044,
"grad_norm": 3.652451992034912,
"kl": 1.8953835844993592,
"learning_rate": 9.91733384402818e-07,
"loss": -0.005,
"reward": 1.4837595462799071,
"reward_std": 0.45500350296497344,
"rewards/code_format_reward": 0.9662500023841858,
"rewards/code_reward": 0.5003172576427459,
"step": 320,
"zero_std_ratio": 0.225
},
{
"clip_ratio/high_max": 0.034284231485798955,
"clip_ratio/high_mean": 0.005935872689587995,
"clip_ratio/low_mean": 0.000911827472737059,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0068477002554573115,
"completion_length": 74.17000274658203,
"epoch": 0.06339448660071079,
"grad_norm": 1.5065704584121704,
"kl": 0.40838020071387293,
"learning_rate": 9.912070230968928e-07,
"loss": -0.0054,
"reward": 1.3848075151443482,
"reward_std": 0.3038723856210709,
"rewards/code_format_reward": 0.9612499952316285,
"rewards/code_reward": 0.45209125280380247,
"step": 330,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.05724322898313403,
"clip_ratio/high_mean": 0.009350239217747002,
"clip_ratio/low_mean": 0.0077414238592609765,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0170916625414975,
"completion_length": 80.06500091552735,
"epoch": 0.06531553164921718,
"grad_norm": 3.77842116355896,
"kl": 0.8782595857977867,
"learning_rate": 9.906645825232008e-07,
"loss": -0.0023,
"reward": 1.294193172454834,
"reward_std": 0.3676457226276398,
"rewards/code_format_reward": 0.9549999952316284,
"rewards/code_reward": 0.4083465874195099,
"step": 340,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.10199148450046777,
"clip_ratio/high_mean": 0.018657304299995302,
"clip_ratio/low_mean": 0.004165191331412643,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022822496155276893,
"completion_length": 86.5000015258789,
"epoch": 0.06723657669772357,
"grad_norm": 3.2845616340637207,
"kl": 0.9463568836450577,
"learning_rate": 9.901060824503463e-07,
"loss": -0.0115,
"reward": 1.485135293006897,
"reward_std": 0.48840407729148866,
"rewards/code_format_reward": 0.9487499833106995,
"rewards/code_reward": 0.5053801357746124,
"step": 350,
"zero_std_ratio": 0.225
},
{
"clip_ratio/high_max": 0.07233364712446928,
"clip_ratio/high_mean": 0.009769158461131156,
"clip_ratio/low_mean": 0.019356250233249737,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029125408595427872,
"completion_length": 80.54000091552734,
"epoch": 0.06915762174622996,
"grad_norm": 19.32016944885254,
"kl": 1.1565445899963378,
"learning_rate": 9.89531543232204e-07,
"loss": 0.0045,
"reward": 1.3412477493286132,
"reward_std": 0.49785757064819336,
"rewards/code_format_reward": 0.9599999904632568,
"rewards/code_reward": 0.43062385320663454,
"step": 360,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.11471173651516438,
"clip_ratio/high_mean": 0.02246011425741017,
"clip_ratio/low_mean": 0.00892345790634863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031383572798222306,
"completion_length": 74.02000274658204,
"epoch": 0.07107866679473633,
"grad_norm": 2.2520923614501953,
"kl": 1.074078917503357,
"learning_rate": 9.889409858071753e-07,
"loss": -0.0059,
"reward": 1.5273491621017456,
"reward_std": 0.414175683259964,
"rewards/code_format_reward": 0.9775000095367432,
"rewards/code_reward": 0.519299578666687,
"step": 370,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.06336253914050757,
"clip_ratio/high_mean": 0.01199121386744082,
"clip_ratio/low_mean": 0.009130357182584703,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021121570840477943,
"completion_length": 86.4000015258789,
"epoch": 0.07299971184324272,
"grad_norm": 4.1052961349487305,
"kl": 1.3110491752624511,
"learning_rate": 9.883344316974266e-07,
"loss": -0.0079,
"reward": 1.5908024072647096,
"reward_std": 0.47413656711578367,
"rewards/code_format_reward": 0.9600000023841858,
"rewards/code_reward": 0.555401211977005,
"step": 380,
"zero_std_ratio": 0.2
},
{
"clip_ratio/high_max": 0.04433182019274682,
"clip_ratio/high_mean": 0.008989717412623577,
"clip_ratio/low_mean": 0.006074265367351473,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015063982826541178,
"completion_length": 86.175,
"epoch": 0.07492075689174911,
"grad_norm": 4.5202155113220215,
"kl": 0.830048742890358,
"learning_rate": 9.877119030081048e-07,
"loss": -0.0051,
"reward": 1.492829155921936,
"reward_std": 0.3874175697565079,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.5007895469665528,
"step": 390,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.1522485612425953,
"clip_ratio/high_mean": 0.0220908185117878,
"clip_ratio/low_mean": 0.012701757764443756,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03479257607832551,
"completion_length": 78.91000137329101,
"epoch": 0.0768418019402555,
"grad_norm": 2.6146676540374756,
"kl": 0.8627120085060597,
"learning_rate": 9.870734224265308e-07,
"loss": -0.0059,
"reward": 1.5748756647109985,
"reward_std": 0.3048340857028961,
"rewards/code_format_reward": 0.987500011920929,
"rewards/code_reward": 0.5405627965927124,
"step": 400,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.16312104668468236,
"clip_ratio/high_mean": 0.025311203207820654,
"clip_ratio/low_mean": 0.008227485651150345,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03353868862614036,
"completion_length": 77.27750091552734,
"epoch": 0.07876284698876189,
"grad_norm": 1.7234841585159302,
"kl": 0.8750749856233597,
"learning_rate": 9.864190132213742e-07,
"loss": -0.0062,
"reward": 1.6338460445404053,
"reward_std": 0.3537067860364914,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.570673018693924,
"step": 410,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.0936438184697181,
"clip_ratio/high_mean": 0.014423616812564433,
"clip_ratio/low_mean": 0.010347768076462672,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024771385360509157,
"completion_length": 75.69749908447265,
"epoch": 0.08068389203726828,
"grad_norm": 2.0902154445648193,
"kl": 1.264050543308258,
"learning_rate": 9.857486992418036e-07,
"loss": 0.0048,
"reward": 1.644848608970642,
"reward_std": 0.277804034948349,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5774242997169494,
"step": 420,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.05532362968660891,
"clip_ratio/high_mean": 0.00992250678827986,
"clip_ratio/low_mean": 0.004125738283619285,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014048245223239064,
"completion_length": 69.60749969482421,
"epoch": 0.08260493708577465,
"grad_norm": 3.702075481414795,
"kl": 1.7400359451770782,
"learning_rate": 9.850625049166189e-07,
"loss": -0.0008,
"reward": 1.5316168069839478,
"reward_std": 0.275749945640564,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.5223708748817444,
"step": 430,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.15841160174459218,
"clip_ratio/high_mean": 0.02351265251636505,
"clip_ratio/low_mean": 0.010281538363778963,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033794190967455506,
"completion_length": 74.51000061035157,
"epoch": 0.08452598213428104,
"grad_norm": 3.4808361530303955,
"kl": 1.2856003642082214,
"learning_rate": 9.8436045525336e-07,
"loss": -0.0035,
"reward": 1.5067368984222411,
"reward_std": 0.28293364942073823,
"rewards/code_format_reward": 0.9737499833106995,
"rewards/code_reward": 0.5099309325218201,
"step": 440,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.06043836465105414,
"clip_ratio/high_mean": 0.009103650611359626,
"clip_ratio/low_mean": 0.002932069695089012,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012035720515996218,
"completion_length": 76.08250122070312,
"epoch": 0.08644702718278743,
"grad_norm": 3.665134906768799,
"kl": 1.0338351279497147,
"learning_rate": 9.836425758373958e-07,
"loss": 0.0011,
"reward": 1.4822889804840087,
"reward_std": 0.18996141627430915,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.49926944375038146,
"step": 450,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.21641009524464608,
"clip_ratio/high_mean": 0.03260216782800853,
"clip_ratio/low_mean": 0.007402116784942336,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.040004284400492904,
"completion_length": 73.13500213623047,
"epoch": 0.08836807223129382,
"grad_norm": 3.1982343196868896,
"kl": 0.6477661892771721,
"learning_rate": 9.829088928309923e-07,
"loss": -0.0043,
"reward": 1.7202057361602783,
"reward_std": 0.25773381292819975,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.6163528442382813,
"step": 460,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.09453173456713557,
"clip_ratio/high_mean": 0.015337946941144764,
"clip_ratio/low_mean": 0.005975433619460091,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02131338034523651,
"completion_length": 81.9000015258789,
"epoch": 0.09028911727980021,
"grad_norm": 1.441091775894165,
"kl": 0.6155861958861351,
"learning_rate": 9.82159432972358e-07,
"loss": -0.0063,
"reward": 1.4617766380310058,
"reward_std": 0.24772228300571442,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.48651331663131714,
"step": 470,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.16705528497695923,
"clip_ratio/high_mean": 0.026639112271368504,
"clip_ratio/low_mean": 0.0035399875399889425,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03017909936606884,
"completion_length": 77.79500274658203,
"epoch": 0.0922101623283066,
"grad_norm": 47.74139404296875,
"kl": 1.360982394218445,
"learning_rate": 9.813942235746705e-07,
"loss": 0.0034,
"reward": 1.5168325901031494,
"reward_std": 0.3997103154659271,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5149787843227387,
"step": 480,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.26956315375864504,
"clip_ratio/high_mean": 0.04211876043118536,
"clip_ratio/low_mean": 0.002336682367604226,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04445544336922467,
"completion_length": 86.21500091552734,
"epoch": 0.09413120737681299,
"grad_norm": 3.7244272232055664,
"kl": 2.59437358379364,
"learning_rate": 9.80613292525081e-07,
"loss": 0.0038,
"reward": 1.6131777048110962,
"reward_std": 0.32231712639331817,
"rewards/code_format_reward": 0.9799999833106995,
"rewards/code_reward": 0.5615888297557831,
"step": 490,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.22240130547434092,
"clip_ratio/high_mean": 0.044074146053753795,
"clip_ratio/low_mean": 0.012573283386882395,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0566474299877882,
"completion_length": 72.23500061035156,
"epoch": 0.09605225242531937,
"grad_norm": 2.852999687194824,
"kl": 1.615745335817337,
"learning_rate": 9.79816668283697e-07,
"loss": 0.0017,
"reward": 1.5203128576278686,
"reward_std": 0.3012717217206955,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.517031443119049,
"step": 500,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.15330625362694264,
"clip_ratio/high_mean": 0.02403738833963871,
"clip_ratio/low_mean": 0.004583830677438528,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028621218353509902,
"completion_length": 74.30000076293945,
"epoch": 0.09797329747382576,
"grad_norm": 2.484840154647827,
"kl": 2.1540999174118043,
"learning_rate": 9.790043798825458e-07,
"loss": 0.0073,
"reward": 1.5013367414474488,
"reward_std": 0.24206546545028687,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.508168363571167,
"step": 510,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.15383050357922912,
"clip_ratio/high_mean": 0.027125787048134953,
"clip_ratio/low_mean": 0.002593657124089077,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029719442850910126,
"completion_length": 65.0400016784668,
"epoch": 0.09989434252233215,
"grad_norm": 7.2150959968566895,
"kl": 1.1968895211815833,
"learning_rate": 9.781764569245178e-07,
"loss": -0.006,
"reward": 1.510750651359558,
"reward_std": 0.41533524394035337,
"rewards/code_format_reward": 0.9712500095367431,
"rewards/code_reward": 0.5125628054141999,
"step": 520,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.109321213606745,
"clip_ratio/high_mean": 0.018354640086181463,
"clip_ratio/low_mean": 0.011131488461978733,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029486127989366652,
"completion_length": 74.52250213623047,
"epoch": 0.10181538757083854,
"grad_norm": 1.8456060886383057,
"kl": 0.7155197218060494,
"learning_rate": 9.773329295822844e-07,
"loss": 0.0073,
"reward": 1.5899319171905517,
"reward_std": 0.3179755389690399,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5512159705162049,
"step": 530,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.04905872759409249,
"clip_ratio/high_mean": 0.008021075790748,
"clip_ratio/low_mean": 0.004390958754811436,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012412034533917904,
"completion_length": 67.07500076293945,
"epoch": 0.10373643261934493,
"grad_norm": 4.641266345977783,
"kl": 0.7290919035673141,
"learning_rate": 9.764738285972015e-07,
"loss": 0.0008,
"reward": 1.300760817527771,
"reward_std": 0.3361863404512405,
"rewards/code_format_reward": 0.9537500143051147,
"rewards/code_reward": 0.4119428813457489,
"step": 540,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.1825170351192355,
"clip_ratio/high_mean": 0.027253909036517143,
"clip_ratio/low_mean": 0.0015975978298229166,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028851506439968942,
"completion_length": 73.99250030517578,
"epoch": 0.10565747766785132,
"grad_norm": 1.1770566701889038,
"kl": 1.328820213675499,
"learning_rate": 9.755991852781876e-07,
"loss": -0.0023,
"reward": 1.5671115159988402,
"reward_std": 0.34309983551502227,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5401182293891906,
"step": 550,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.12550847120583059,
"clip_ratio/high_mean": 0.025771993771195413,
"clip_ratio/low_mean": 0.0035689805867150427,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029340974800288678,
"completion_length": 71.76750030517579,
"epoch": 0.1075785227163577,
"grad_norm": 0.3435879647731781,
"kl": 2.12383970618248,
"learning_rate": 9.747090315005836e-07,
"loss": 0.0024,
"reward": 1.5273173809051515,
"reward_std": 0.2889336168766022,
"rewards/code_format_reward": 0.9649999976158142,
"rewards/code_reward": 0.5224087119102478,
"step": 560,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.0834595168940723,
"clip_ratio/high_mean": 0.015262311231344939,
"clip_ratio/low_mean": 0.021645180485211312,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03690749178640544,
"completion_length": 79.53250122070312,
"epoch": 0.10949956776486408,
"grad_norm": 1.7026695013046265,
"kl": 1.6705755025148392,
"learning_rate": 9.738033997049902e-07,
"loss": 0.1708,
"reward": 1.5908133745193482,
"reward_std": 0.3691225051879883,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.5475941836833954,
"step": 570,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.18124623028561473,
"clip_ratio/high_mean": 0.02496154889231548,
"clip_ratio/low_mean": 0.020611650816863402,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04557319916784763,
"completion_length": 85.51750183105469,
"epoch": 0.11142061281337047,
"grad_norm": 18.138025283813477,
"kl": 4.237766814231873,
"learning_rate": 9.728823228960862e-07,
"loss": -0.0051,
"reward": 1.5469601631164551,
"reward_std": 0.37420718297362326,
"rewards/code_format_reward": 0.975000011920929,
"rewards/code_reward": 0.5297300696372986,
"step": 580,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.014268473512493074,
"clip_ratio/high_mean": 0.0028658110386459157,
"clip_ratio/low_mean": 0.0056301898322999476,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008496000757440924,
"completion_length": 80.10750274658203,
"epoch": 0.11334165786187686,
"grad_norm": 5.16138219833374,
"kl": 0.6609396353363991,
"learning_rate": 9.71945834641426e-07,
"loss": -0.004,
"reward": 1.4476024627685546,
"reward_std": 0.3472218900918961,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.4813012361526489,
"step": 590,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.17693078136071563,
"clip_ratio/high_mean": 0.02441923434380442,
"clip_ratio/low_mean": 0.012987980741309002,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037407214660197495,
"completion_length": 83.96500091552734,
"epoch": 0.11526270291038325,
"grad_norm": 1.7465465068817139,
"kl": 1.0383819937705994,
"learning_rate": 9.709939690702158e-07,
"loss": -0.0078,
"reward": 1.4550770282745362,
"reward_std": 0.3056318134069443,
"rewards/code_format_reward": 0.9587500095367432,
"rewards/code_reward": 0.48785099387168884,
"step": 600,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.1888352295383811,
"clip_ratio/high_mean": 0.026437551854178308,
"clip_ratio/low_mean": 0.0054486555512994524,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031886206939816475,
"completion_length": 79.63500213623047,
"epoch": 0.11718374795888964,
"grad_norm": 5.674210548400879,
"kl": 1.2073093384504319,
"learning_rate": 9.700267608720692e-07,
"loss": -0.0021,
"reward": 1.4424492359161376,
"reward_std": 0.3397494524717331,
"rewards/code_format_reward": 0.9725000143051148,
"rewards/code_reward": 0.4780996203422546,
"step": 610,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.09671425293199717,
"clip_ratio/high_mean": 0.020163473271531986,
"clip_ratio/low_mean": 0.006395513273309917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02655898590455763,
"completion_length": 75.22750091552734,
"epoch": 0.11910479300739603,
"grad_norm": 5.531320571899414,
"kl": 2.2407817423343657,
"learning_rate": 9.690442452957448e-07,
"loss": -0.0021,
"reward": 1.5595922470092773,
"reward_std": 0.28165863305330274,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.5351086378097534,
"step": 620,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.11810005996376276,
"clip_ratio/high_mean": 0.02500568316318095,
"clip_ratio/low_mean": 0.00357620443101041,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028581888042390348,
"completion_length": 80.09000091552734,
"epoch": 0.1210258380559024,
"grad_norm": 2.165558338165283,
"kl": 1.546025463938713,
"learning_rate": 9.680464581478594e-07,
"loss": -0.0037,
"reward": 1.51439368724823,
"reward_std": 0.3320598304271698,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.5140718221664429,
"step": 630,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.10313799739815295,
"clip_ratio/high_mean": 0.017414161982014776,
"clip_ratio/low_mean": 0.009596780824358575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027010941854678096,
"completion_length": 76.15749969482422,
"epoch": 0.1229468831044088,
"grad_norm": 5.05511999130249,
"kl": 1.6615911841392517,
"learning_rate": 9.670334357915852e-07,
"loss": 0.0033,
"reward": 1.5930729150772094,
"reward_std": 0.3864523351192474,
"rewards/code_format_reward": 0.9662500023841858,
"rewards/code_reward": 0.554973942041397,
"step": 640,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.1653188370168209,
"clip_ratio/high_mean": 0.027094300370663404,
"clip_ratio/low_mean": 0.0033949258620850744,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03048922661691904,
"completion_length": 74.23250274658203,
"epoch": 0.12486792815291518,
"grad_norm": 1.1590094566345215,
"kl": 0.39487394616007804,
"learning_rate": 9.660052151453228e-07,
"loss": -0.006,
"reward": 1.7198987245559691,
"reward_std": 0.3215783953666687,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.613699346780777,
"step": 650,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.2655821519903839,
"clip_ratio/high_mean": 0.03813204998150468,
"clip_ratio/low_mean": 0.017123100493336096,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05525515023618936,
"completion_length": 79.31999969482422,
"epoch": 0.12678897320142157,
"grad_norm": 2.8189809322357178,
"kl": 0.9924295842647552,
"learning_rate": 9.649618336813565e-07,
"loss": -0.0022,
"reward": 1.710445189476013,
"reward_std": 0.2906018912792206,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.6111600875854493,
"step": 660,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.10447313897311687,
"clip_ratio/high_mean": 0.017084641277324408,
"clip_ratio/low_mean": 0.018559307692339645,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03564394909190014,
"completion_length": 73.31750183105468,
"epoch": 0.12871001824992795,
"grad_norm": 7.561813831329346,
"kl": 1.0190230280160903,
"learning_rate": 9.639033294244894e-07,
"loss": -0.0059,
"reward": 1.4508479833602905,
"reward_std": 0.2639226779341698,
"rewards/code_format_reward": 0.9724999904632569,
"rewards/code_reward": 0.4822989523410797,
"step": 670,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.17416613902896644,
"clip_ratio/high_mean": 0.02931727101095021,
"clip_ratio/low_mean": 0.013709834642941131,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04302710462361574,
"completion_length": 75.30500183105468,
"epoch": 0.13063106329843435,
"grad_norm": 4.0138373374938965,
"kl": 1.8731355726718903,
"learning_rate": 9.628297409506558e-07,
"loss": 0.0038,
"reward": 1.5990655183792115,
"reward_std": 0.38845544308423996,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.5554702281951904,
"step": 680,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.14133978222962468,
"clip_ratio/high_mean": 0.025468734742025843,
"clip_ratio/low_mean": 0.0034107466402929277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028879481457988732,
"completion_length": 71.69250183105468,
"epoch": 0.13255210834694073,
"grad_norm": 2.7108314037323,
"kl": 1.0770379617810248,
"learning_rate": 9.61741107385517e-07,
"loss": 0.0015,
"reward": 1.357295000553131,
"reward_std": 0.16353759765625,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.43333501517772677,
"step": 690,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.2215075224637985,
"clip_ratio/high_mean": 0.03973329542204738,
"clip_ratio/low_mean": 0.021483630378497764,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06121692657470703,
"completion_length": 77.00250244140625,
"epoch": 0.13447315339544713,
"grad_norm": 3.874828338623047,
"kl": 1.798163938522339,
"learning_rate": 9.606374684030354e-07,
"loss": -0.0002,
"reward": 1.4897700071334838,
"reward_std": 0.3036611869931221,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.5023849844932556,
"step": 700,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.26057110670953987,
"clip_ratio/high_mean": 0.04422192363999784,
"clip_ratio/low_mean": 0.012507367390207946,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05672929054126143,
"completion_length": 68.01749954223632,
"epoch": 0.1363941984439535,
"grad_norm": 1.9008493423461914,
"kl": 1.1601522982120513,
"learning_rate": 9.595188642240268e-07,
"loss": -0.006,
"reward": 1.5408167839050293,
"reward_std": 0.23992418646812438,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5244708836078644,
"step": 710,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.11190514008048921,
"clip_ratio/high_mean": 0.022988432584679686,
"clip_ratio/low_mean": 0.003842631517909467,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02683106428885367,
"completion_length": 70.91749954223633,
"epoch": 0.1383152434924599,
"grad_norm": 2.230220317840576,
"kl": 0.6176944851875306,
"learning_rate": 9.58385335614697e-07,
"loss": -0.0038,
"reward": 1.474353313446045,
"reward_std": 0.22789922058582307,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.49092662930488584,
"step": 720,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.22790296860039233,
"clip_ratio/high_mean": 0.043722260277718306,
"clip_ratio/low_mean": 0.005503303511068225,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0492255637422204,
"completion_length": 70.33000183105469,
"epoch": 0.1402362885409663,
"grad_norm": 3.880234956741333,
"kl": 1.7978762328624724,
"learning_rate": 9.572369238851546e-07,
"loss": -0.01,
"reward": 1.7555195808410644,
"reward_std": 0.30654080510139464,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6311972856521606,
"step": 730,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.13005290240980685,
"clip_ratio/high_mean": 0.02253831790876575,
"clip_ratio/low_mean": 0.0076317260100040585,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030170044326223434,
"completion_length": 67.4625015258789,
"epoch": 0.14215733358947266,
"grad_norm": 31014.41015625,
"kl": 2.5802926242351534,
"learning_rate": 9.560736708879055e-07,
"loss": 4.1316,
"reward": 1.391554856300354,
"reward_std": 0.3107602626085281,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.4501524269580841,
"step": 740,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.21672796942293643,
"clip_ratio/high_mean": 0.03920850001741201,
"clip_ratio/low_mean": 0.0084746521897614,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.047683153115212915,
"completion_length": 71.03750076293946,
"epoch": 0.14407837863797907,
"grad_norm": 1.3094109296798706,
"kl": 4.56303431391716,
"learning_rate": 9.54895619016329e-07,
"loss": 0.0111,
"reward": 1.5939582109451294,
"reward_std": 0.2379148319363594,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.5547916054725647,
"step": 750,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.08126737037673593,
"clip_ratio/high_mean": 0.01269659586250782,
"clip_ratio/low_mean": 0.006480468995869159,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019177064718678593,
"completion_length": 74.09750213623047,
"epoch": 0.14599942368648544,
"grad_norm": 3.0267083644866943,
"kl": 1.5844107165932655,
"learning_rate": 9.53702811203131e-07,
"loss": 0.0048,
"reward": 1.4744285106658936,
"reward_std": 0.2754403457045555,
"rewards/code_format_reward": 0.9900000095367432,
"rewards/code_reward": 0.489714241027832,
"step": 760,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.22151243952102959,
"clip_ratio/high_mean": 0.038386100489879026,
"clip_ratio/low_mean": 0.001766498590586707,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04015259912703186,
"completion_length": 73.72750244140624,
"epoch": 0.14792046873499184,
"grad_norm": 3596482.75,
"kl": 0.6901701986789703,
"learning_rate": 9.524952909187801e-07,
"loss": 83.9443,
"reward": 1.4019340753555298,
"reward_std": 0.24908357337117196,
"rewards/code_format_reward": 0.9749999880790711,
"rewards/code_reward": 0.45721703171730044,
"step": 770,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.07684345319867134,
"clip_ratio/high_mean": 0.014277776470407844,
"clip_ratio/low_mean": 0.016169815976172685,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0304475924000144,
"completion_length": 79.24250183105468,
"epoch": 0.14984151378349822,
"grad_norm": 3.468223810195923,
"kl": 0.45489892959594724,
"learning_rate": 9.512731021699245e-07,
"loss": -0.0056,
"reward": 1.580666732788086,
"reward_std": 0.41472728848457335,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5459583520889282,
"step": 780,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.10067678079940379,
"clip_ratio/high_mean": 0.013439147116150707,
"clip_ratio/low_mean": 0.023053765966324136,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03649291144683957,
"completion_length": 72.04750137329101,
"epoch": 0.15176255883200462,
"grad_norm": 13.193933486938477,
"kl": 1.6161374658346177,
"learning_rate": 9.500362894977864e-07,
"loss": 0.0007,
"reward": 1.6252036333084106,
"reward_std": 0.3433967262506485,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5666643261909485,
"step": 790,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.11107501722872257,
"clip_ratio/high_mean": 0.01587685807608068,
"clip_ratio/low_mean": 0.001843169682251755,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017720027733594178,
"completion_length": 77.34250183105469,
"epoch": 0.153683603880511,
"grad_norm": 3.4086289405822754,
"kl": 0.735039034485817,
"learning_rate": 9.487848979765399e-07,
"loss": -0.0033,
"reward": 1.7214166164398192,
"reward_std": 0.3059865742921829,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6125832796096802,
"step": 800,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.06381021924316883,
"clip_ratio/high_mean": 0.012221441417932511,
"clip_ratio/low_mean": 0.002595777277019806,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014817218482494354,
"completion_length": 78.94500045776367,
"epoch": 0.15560464892901738,
"grad_norm": 2.894174098968506,
"kl": 0.9337424471974373,
"learning_rate": 9.475189732116677e-07,
"loss": -0.0074,
"reward": 1.5309076070785523,
"reward_std": 0.36832110285758973,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.5201413094997406,
"step": 810,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.0614451477304101,
"clip_ratio/high_mean": 0.011137601570226252,
"clip_ratio/low_mean": 0.015545779425883666,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02668338119983673,
"completion_length": 80.46750030517578,
"epoch": 0.15752569397752378,
"grad_norm": 1.5945316553115845,
"kl": 1.666656306385994,
"learning_rate": 9.462385613382997e-07,
"loss": -0.0138,
"reward": 1.4196115970611571,
"reward_std": 0.3273743912577629,
"rewards/code_format_reward": 0.9625,
"rewards/code_reward": 0.4691807866096497,
"step": 820,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.0729204102884978,
"clip_ratio/high_mean": 0.011435226618777961,
"clip_ratio/low_mean": 0.0035716916667297483,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015006918273866177,
"completion_length": 83.92250061035156,
"epoch": 0.15944673902603015,
"grad_norm": 3.7898244857788086,
"kl": 3.157607713341713,
"learning_rate": 9.449437090195312e-07,
"loss": 0.6488,
"reward": 1.5506922006607056,
"reward_std": 0.3165741294622421,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.532533586025238,
"step": 830,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.22014709915965797,
"clip_ratio/high_mean": 0.030960237560793757,
"clip_ratio/low_mean": 0.008386016800068318,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03934625396504998,
"completion_length": 79.59750213623047,
"epoch": 0.16136778407453656,
"grad_norm": 3.164461851119995,
"kl": 0.48004563301801684,
"learning_rate": 9.436344634447226e-07,
"loss": 0.0002,
"reward": 1.4315959692001343,
"reward_std": 0.2676436066627502,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.4714229583740234,
"step": 840,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.20247683776542544,
"clip_ratio/high_mean": 0.040387283614836636,
"clip_ratio/low_mean": 0.0031327656004577877,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04352005030959845,
"completion_length": 81.20750274658204,
"epoch": 0.16328882912304293,
"grad_norm": 3.2722160816192627,
"kl": 0.8405016213655472,
"learning_rate": 9.42310872327779e-07,
"loss": -0.0002,
"reward": 1.550826621055603,
"reward_std": 0.4091781198978424,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.5322882652282714,
"step": 850,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.061589781753718854,
"clip_ratio/high_mean": 0.011824411456473172,
"clip_ratio/low_mean": 0.011703617853345349,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023528029827866705,
"completion_length": 62.61500244140625,
"epoch": 0.1652098741715493,
"grad_norm": 0.2732953727245331,
"kl": 1.4307941138744353,
"learning_rate": 9.409729839054123e-07,
"loss": 0.0075,
"reward": 1.5864750623703003,
"reward_std": 0.2073097825050354,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5473000288009644,
"step": 860,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.1379000276327133,
"clip_ratio/high_mean": 0.02470994950272143,
"clip_ratio/low_mean": 0.004926441749557853,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029636391997337343,
"completion_length": 77.2400032043457,
"epoch": 0.1671309192200557,
"grad_norm": 3.488050699234009,
"kl": 0.9351878672838211,
"learning_rate": 9.396208469353826e-07,
"loss": -0.0059,
"reward": 1.5735363721847535,
"reward_std": 0.3392061233520508,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.5436432063579559,
"step": 870,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.07835716316476464,
"clip_ratio/high_mean": 0.014919109572656453,
"clip_ratio/low_mean": 0.006504692946327851,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021423802757635713,
"completion_length": 74.78000183105469,
"epoch": 0.1690519642685621,
"grad_norm": 5.493437767028809,
"kl": 1.060418888926506,
"learning_rate": 9.382545106947214e-07,
"loss": -0.0036,
"reward": 1.745260238647461,
"reward_std": 0.297343048453331,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.6254426181316376,
"step": 880,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.12418304020538926,
"clip_ratio/high_mean": 0.022332211420871318,
"clip_ratio/low_mean": 0.022319327194418294,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04465153906494379,
"completion_length": 84.53250122070312,
"epoch": 0.1709730093170685,
"grad_norm": 5.462327480316162,
"kl": 1.5445073664188385,
"learning_rate": 9.368740249779358e-07,
"loss": 0.0049,
"reward": 1.473905611038208,
"reward_std": 0.33463606536388396,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.49351527690887453,
"step": 890,
"zero_std_ratio": 0.25
},
{
"clip_ratio/high_max": 0.08285986992996186,
"clip_ratio/high_mean": 0.015584854045300744,
"clip_ratio/low_mean": 0.002020698119304143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017605551629094406,
"completion_length": 85.78250122070312,
"epoch": 0.17289405436557487,
"grad_norm": 3.7394657135009766,
"kl": 1.2308152213692665,
"learning_rate": 9.354794400951942e-07,
"loss": 0.0006,
"reward": 1.3064285874366761,
"reward_std": 0.3360040634870529,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.40852679312229156,
"step": 900,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.06636467641219497,
"clip_ratio/high_mean": 0.01088127460097894,
"clip_ratio/low_mean": 0.005357642179296818,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01623891657218337,
"completion_length": 86.17000122070313,
"epoch": 0.17481509941408127,
"grad_norm": 3.883023977279663,
"kl": 0.5634948700666428,
"learning_rate": 9.340708068704917e-07,
"loss": -0.0132,
"reward": 1.6946633338928223,
"reward_std": 0.2633577108383179,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6004566550254822,
"step": 910,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.12004003385081888,
"clip_ratio/high_mean": 0.01987670698435977,
"clip_ratio/low_mean": 0.00857236894662492,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028449075785465537,
"completion_length": 83.18000030517578,
"epoch": 0.17673614446258765,
"grad_norm": 5.860812187194824,
"kl": 1.0160879641771317,
"learning_rate": 9.326481766397991e-07,
"loss": -0.0011,
"reward": 1.5558514595031738,
"reward_std": 0.28839708790183066,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5344882309436798,
"step": 920,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.06109805963933468,
"clip_ratio/high_mean": 0.00847023066598922,
"clip_ratio/low_mean": 0.004858631710521877,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01332886223681271,
"completion_length": 85.00750122070312,
"epoch": 0.17865718951109402,
"grad_norm": 2.287473440170288,
"kl": 0.629003182053566,
"learning_rate": 9.312116012491916e-07,
"loss": -0.0155,
"reward": 1.3984088182449341,
"reward_std": 0.38690108954906466,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.45451690554618834,
"step": 930,
"zero_std_ratio": 0.275
},
{
"clip_ratio/high_max": 0.11440350348129869,
"clip_ratio/high_mean": 0.021249773760791867,
"clip_ratio/low_mean": 0.010212704542209395,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03146247826516628,
"completion_length": 85.56500244140625,
"epoch": 0.18057823455960043,
"grad_norm": 2.5915870666503906,
"kl": 0.6908730089664459,
"learning_rate": 9.297611330529588e-07,
"loss": -0.0019,
"reward": 1.5472615003585815,
"reward_std": 0.34995803236961365,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.529568213224411,
"step": 940,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.11480946252122522,
"clip_ratio/high_mean": 0.021491143060848115,
"clip_ratio/low_mean": 0.007519157652859576,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029010300803929568,
"completion_length": 72.10000152587891,
"epoch": 0.1824992796081068,
"grad_norm": 1.5689059495925903,
"kl": 0.7929495573043823,
"learning_rate": 9.282968249116975e-07,
"loss": -0.0054,
"reward": 1.8428637742996217,
"reward_std": 0.2614489495754242,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6745568513870239,
"step": 950,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.3971266824752092,
"clip_ratio/high_mean": 0.05282264268025756,
"clip_ratio/low_mean": 0.004530514683574438,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05735315615311265,
"completion_length": 70.86750030517578,
"epoch": 0.1844203246566132,
"grad_norm": 3.4463512897491455,
"kl": 0.8312035664916039,
"learning_rate": 9.268187301903852e-07,
"loss": 0.0003,
"reward": 1.6929683208465576,
"reward_std": 0.2562918782234192,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.600546681880951,
"step": 960,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.1673737466800958,
"clip_ratio/high_mean": 0.03157579629332759,
"clip_ratio/low_mean": 0.012752554472535848,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.044328349828720096,
"completion_length": 76.92749938964843,
"epoch": 0.18634136970511958,
"grad_norm": 3.0548853874206543,
"kl": 0.6291002959012986,
"learning_rate": 9.253269027564339e-07,
"loss": -0.005,
"reward": 1.4119353413581848,
"reward_std": 0.33177118599414823,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.46065517961978913,
"step": 970,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.15495819319039583,
"clip_ratio/high_mean": 0.022329012653790413,
"clip_ratio/low_mean": 0.006486268152366392,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02881528080906719,
"completion_length": 68.58250122070312,
"epoch": 0.18826241475362598,
"grad_norm": 7.065835952758789,
"kl": 1.0375685960054397,
"learning_rate": 9.238213969777292e-07,
"loss": -0.0046,
"reward": 1.6331373691558837,
"reward_std": 0.2626490265130997,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.5703186750411987,
"step": 980,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.10045178183354438,
"clip_ratio/high_mean": 0.020599483215482904,
"clip_ratio/low_mean": 0.007835417747264728,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02843490142840892,
"completion_length": 71.47500076293946,
"epoch": 0.19018345980213236,
"grad_norm": 4.533353328704834,
"kl": 2.011890631914139,
"learning_rate": 9.223022677206474e-07,
"loss": -0.0001,
"reward": 1.7676753044128417,
"reward_std": 0.25886805951595304,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6382126212120056,
"step": 990,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.12670890614390373,
"clip_ratio/high_mean": 0.022856980562210083,
"clip_ratio/low_mean": 0.016935013599868397,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.039791994355618955,
"completion_length": 70.40500106811524,
"epoch": 0.19210450485063874,
"grad_norm": 9.587749481201172,
"kl": 1.1125446915626527,
"learning_rate": 9.207695703480562e-07,
"loss": -0.0049,
"reward": 1.5464402914047242,
"reward_std": 0.30552313327789304,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5257201135158539,
"step": 1000,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.13173274043947458,
"clip_ratio/high_mean": 0.021644592471420764,
"clip_ratio/low_mean": 0.01016495683870744,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03180954959243536,
"completion_length": 81.64500122070312,
"epoch": 0.19402554989914514,
"grad_norm": 61.59896469116211,
"kl": 1.3899411320686341,
"learning_rate": 9.192233607172973e-07,
"loss": 0.0117,
"reward": 1.5586263418197632,
"reward_std": 0.32884465754032133,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5327506422996521,
"step": 1010,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.38223748579621314,
"clip_ratio/high_mean": 0.05293128285557032,
"clip_ratio/low_mean": 0.008536407171050087,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06146768992766738,
"completion_length": 75.7425033569336,
"epoch": 0.19594659494765151,
"grad_norm": 0.8699261546134949,
"kl": 2.267198386788368,
"learning_rate": 9.17663695178151e-07,
"loss": 0.0007,
"reward": 1.4393709778785706,
"reward_std": 0.19248414039611816,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.4724979490041733,
"step": 1020,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.05449964143335819,
"clip_ratio/high_mean": 0.008484689320903271,
"clip_ratio/low_mean": 0.0017167545520351268,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010201443906407804,
"completion_length": 74.80750045776367,
"epoch": 0.19786763999615792,
"grad_norm": 3.8721530437469482,
"kl": 1.034875027090311,
"learning_rate": 9.160906305707814e-07,
"loss": -0.0065,
"reward": 1.6229804277420044,
"reward_std": 0.21886643767356873,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5624276876449585,
"step": 1030,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.057783479290083054,
"clip_ratio/high_mean": 0.008794186974409968,
"clip_ratio/low_mean": 0.01261859169753734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02141277852933854,
"completion_length": 80.94500122070312,
"epoch": 0.1997886850446643,
"grad_norm": 2.0369646549224854,
"kl": 0.47016064152121545,
"learning_rate": 9.145042242236667e-07,
"loss": -0.0016,
"reward": 1.5200274467468262,
"reward_std": 0.2379522889852524,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.5147012054920197,
"step": 1040,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.05080003601033241,
"clip_ratio/high_mean": 0.0081847107532667,
"clip_ratio/low_mean": 0.003685746184783056,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011870456766337157,
"completion_length": 86.39750213623047,
"epoch": 0.2017097300931707,
"grad_norm": 1.86152184009552,
"kl": 0.9119557231664658,
"learning_rate": 9.129045339515085e-07,
"loss": -0.0025,
"reward": 1.338998556137085,
"reward_std": 0.29172809422016144,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.42481178045272827,
"step": 1050,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.11181259918957949,
"clip_ratio/high_mean": 0.01702371232677251,
"clip_ratio/low_mean": 0.003983464353950694,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021007176581770183,
"completion_length": 89.0250015258789,
"epoch": 0.20363077514167707,
"grad_norm": 1.664932370185852,
"kl": 1.7415984645485878,
"learning_rate": 9.112916180531254e-07,
"loss": -0.0009,
"reward": 1.6867451906204223,
"reward_std": 0.26216842532157897,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.5971225798130035,
"step": 1060,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.1619036693125963,
"clip_ratio/high_mean": 0.02605230761691928,
"clip_ratio/low_mean": 0.011786457896232606,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03783876644447446,
"completion_length": 80.52750091552734,
"epoch": 0.20555182019018345,
"grad_norm": 3.1480722427368164,
"kl": 2.3309426337480543,
"learning_rate": 9.096655353093286e-07,
"loss": -0.0108,
"reward": 1.7797099113464356,
"reward_std": 0.3243818938732147,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6429799437522888,
"step": 1070,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.145719656907022,
"clip_ratio/high_mean": 0.02472380215767771,
"clip_ratio/low_mean": 0.01881317695369944,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.043536979146301745,
"completion_length": 75.46750183105469,
"epoch": 0.20747286523868985,
"grad_norm": 4.7426347732543945,
"kl": 0.7767296731472015,
"learning_rate": 9.080263449807788e-07,
"loss": 0.0042,
"reward": 1.5128322124481202,
"reward_std": 0.26058112680912016,
"rewards/code_format_reward": 0.9662500023841858,
"rewards/code_reward": 0.514853572845459,
"step": 1080,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.04860758520662785,
"clip_ratio/high_mean": 0.00921072952914983,
"clip_ratio/low_mean": 0.013458288778201677,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022669017571024595,
"completion_length": 77.22750244140624,
"epoch": 0.20939391028719623,
"grad_norm": 2.2836835384368896,
"kl": 0.6794285923242569,
"learning_rate": 9.063741068058278e-07,
"loss": -0.0028,
"reward": 1.5665315628051757,
"reward_std": 0.23656646013259888,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5398283064365387,
"step": 1090,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.2074673067778349,
"clip_ratio/high_mean": 0.036228268034756185,
"clip_ratio/low_mean": 0.003734398238157155,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.039962667226791385,
"completion_length": 91.18000030517578,
"epoch": 0.21131495533570263,
"grad_norm": 7.916996002197266,
"kl": 1.0919141083955766,
"learning_rate": 9.0470888099834e-07,
"loss": 0.1666,
"reward": 1.68690767288208,
"reward_std": 0.32907233834266664,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5984537959098816,
"step": 1100,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.16652454435825348,
"clip_ratio/high_mean": 0.027045656740665436,
"clip_ratio/low_mean": 0.006342244842380751,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03338790265843272,
"completion_length": 80.02000122070312,
"epoch": 0.213236000384209,
"grad_norm": 24.34583282470703,
"kl": 1.00138920545578,
"learning_rate": 9.030307282454995e-07,
"loss": -0.0023,
"reward": 1.6111816883087158,
"reward_std": 0.24880893230438234,
"rewards/code_format_reward": 0.9724999904632569,
"rewards/code_reward": 0.5624658226966858,
"step": 1110,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.18198216175660492,
"clip_ratio/high_mean": 0.02493738690391183,
"clip_ratio/low_mean": 0.004894328210502863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029831714881584048,
"completion_length": 71.95,
"epoch": 0.2151570454327154,
"grad_norm": 2.7608304023742676,
"kl": 0.971074515581131,
"learning_rate": 9.013397097055971e-07,
"loss": -0.0022,
"reward": 1.6884326457977294,
"reward_std": 0.3369467526674271,
"rewards/code_format_reward": 0.9712499856948853,
"rewards/code_reward": 0.6014038324356079,
"step": 1120,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.16543546952307225,
"clip_ratio/high_mean": 0.02493141880258918,
"clip_ratio/low_mean": 0.007064808573340997,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03199622761458158,
"completion_length": 72.50500030517578,
"epoch": 0.21707809048122179,
"grad_norm": 7.147952556610107,
"kl": 6.163409499824047,
"learning_rate": 8.996358870058017e-07,
"loss": 0.0081,
"reward": 1.5753276348114014,
"reward_std": 0.2175431028008461,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.5395387947559357,
"step": 1130,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.05989155264105648,
"clip_ratio/high_mean": 0.009021314003621227,
"clip_ratio/low_mean": 0.014251881884410978,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023273196443915366,
"completion_length": 74.57750091552734,
"epoch": 0.21899913552972816,
"grad_norm": 17.58907699584961,
"kl": 0.9839092344045639,
"learning_rate": 8.979193222399154e-07,
"loss": -0.0006,
"reward": 1.570918822288513,
"reward_std": 0.27486068904399874,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5420219123363494,
"step": 1140,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.23600016683340072,
"clip_ratio/high_mean": 0.04525289600715041,
"clip_ratio/low_mean": 0.00799154011765495,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05324443739373237,
"completion_length": 71.89750061035156,
"epoch": 0.22092018057823457,
"grad_norm": 8.010896682739258,
"kl": 1.0768774889409543,
"learning_rate": 8.961900779661095e-07,
"loss": 0.0139,
"reward": 1.5848765134811402,
"reward_std": 0.21965934410691262,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5458757638931274,
"step": 1150,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.10782922431826591,
"clip_ratio/high_mean": 0.014393238560296595,
"clip_ratio/low_mean": 0.0046036563231609765,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018996895058080554,
"completion_length": 78.2925018310547,
"epoch": 0.22284122562674094,
"grad_norm": 3.7750465869903564,
"kl": 0.5210637584328651,
"learning_rate": 8.944482172046448e-07,
"loss": -0.0065,
"reward": 1.6227028608322143,
"reward_std": 0.2484603613615036,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.5660388946533204,
"step": 1160,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.13120641289278864,
"clip_ratio/high_mean": 0.019719564472325146,
"clip_ratio/low_mean": 0.00696407729992643,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026683641644194723,
"completion_length": 81.64000091552734,
"epoch": 0.22476227067524734,
"grad_norm": 1.1691230535507202,
"kl": 0.5908193171024323,
"learning_rate": 8.926938034355751e-07,
"loss": -0.0008,
"reward": 1.6598936080932618,
"reward_std": 0.3073273479938507,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.5830717980861664,
"step": 1170,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.26425148695707323,
"clip_ratio/high_mean": 0.03642228813841939,
"clip_ratio/low_mean": 0.0025068818649742752,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03892916943877935,
"completion_length": 83.06500244140625,
"epoch": 0.22668331572375372,
"grad_norm": 5.047176361083984,
"kl": 0.8601905956864357,
"learning_rate": 8.90926900596434e-07,
"loss": 0.019,
"reward": 1.6030859470367431,
"reward_std": 0.18358819633722306,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.5549804508686066,
"step": 1180,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.20188620835542678,
"clip_ratio/high_mean": 0.03365288833156228,
"clip_ratio/low_mean": 0.012162915989756584,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04581580460071564,
"completion_length": 80.93500061035157,
"epoch": 0.2286043607722601,
"grad_norm": 3.431043863296509,
"kl": 3.284740853309631,
"learning_rate": 8.891475730799039e-07,
"loss": -0.0024,
"reward": 1.719798493385315,
"reward_std": 0.2678588882088661,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.6127117216587067,
"step": 1190,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.13805483505129815,
"clip_ratio/high_mean": 0.02111883880570531,
"clip_ratio/low_mean": 0.002508872369071469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023627711273729802,
"completion_length": 87.33250274658204,
"epoch": 0.2305254058207665,
"grad_norm": 4.731442928314209,
"kl": 1.1696231275796891,
"learning_rate": 8.873558857314706e-07,
"loss": -0.0053,
"reward": 1.7580220222473144,
"reward_std": 0.28411929309368134,
"rewards/code_format_reward": 0.9900000095367432,
"rewards/code_reward": 0.6315110087394714,
"step": 1200,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.07043634681031108,
"clip_ratio/high_mean": 0.009235845855437219,
"clip_ratio/low_mean": 0.017453379271319135,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02668922524899244,
"completion_length": 86.74250030517578,
"epoch": 0.23244645086927287,
"grad_norm": 23.686250686645508,
"kl": 1.7613270074129104,
"learning_rate": 8.855519038470587e-07,
"loss": 0.91,
"reward": 1.8096629619598388,
"reward_std": 0.2700611263513565,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6592064738273621,
"step": 1210,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.1193816315382719,
"clip_ratio/high_mean": 0.01799508691765368,
"clip_ratio/low_mean": 0.0052341839407745285,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023229270869342143,
"completion_length": 91.73750152587891,
"epoch": 0.23436749591777928,
"grad_norm": 5.015241622924805,
"kl": 87723751.16166303,
"learning_rate": 8.83735693170653e-07,
"loss": 178666.875,
"reward": 1.5409840583801269,
"reward_std": 0.3586106300354004,
"rewards/code_format_reward": 0.9687500119209289,
"rewards/code_reward": 0.5283045113086701,
"step": 1220,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.15788686936721205,
"clip_ratio/high_mean": 0.02180835944600403,
"clip_ratio/low_mean": 0.004957044991897419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026765404315665365,
"completion_length": 83.70250091552734,
"epoch": 0.23628854096628565,
"grad_norm": 2.7140953540802,
"kl": 0.755669391900301,
"learning_rate": 8.81907319891902e-07,
"loss": -0.0099,
"reward": 1.8449480056762695,
"reward_std": 0.28006095588207247,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6755990028381348,
"step": 1230,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.12416752465069295,
"clip_ratio/high_mean": 0.01972346901893616,
"clip_ratio/low_mean": 0.01847981174942106,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0382032809779048,
"completion_length": 91.47000274658203,
"epoch": 0.23820958601479206,
"grad_norm": 10.781957626342773,
"kl": 1.0129390999674797,
"learning_rate": 8.800668506437059e-07,
"loss": 0.0011,
"reward": 1.6923505306243896,
"reward_std": 0.3265227422118187,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.6014877319335937,
"step": 1240,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.12042212830856443,
"clip_ratio/high_mean": 0.017916655144654216,
"clip_ratio/low_mean": 0.007017276567057707,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02493393179029226,
"completion_length": 76.6675018310547,
"epoch": 0.24013063106329843,
"grad_norm": 47.773136138916016,
"kl": 1.4071896970272064,
"learning_rate": 8.782143524997882e-07,
"loss": 0.0018,
"reward": 1.6722928285598755,
"reward_std": 0.25374017357826234,
"rewards/code_format_reward": 0.9824999809265137,
"rewards/code_reward": 0.5905213832855225,
"step": 1250,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.08169625541195273,
"clip_ratio/high_mean": 0.013112110400106758,
"clip_ratio/low_mean": 0.003914138658728916,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01702624891186133,
"completion_length": 78.23750152587891,
"epoch": 0.2420516761118048,
"grad_norm": 2688.99462890625,
"kl": 9.395949372649193,
"learning_rate": 8.76349892972251e-07,
"loss": 0.1943,
"reward": 1.5601455688476562,
"reward_std": 0.3348282665014267,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5372602701187134,
"step": 1260,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.13095853393897414,
"clip_ratio/high_mean": 0.018921413994394242,
"clip_ratio/low_mean": 0.018763081403449178,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03768449563067407,
"completion_length": 76.1500015258789,
"epoch": 0.2439727211603112,
"grad_norm": 3.0777931213378906,
"kl": 1.7352074533700943,
"learning_rate": 8.744735400091154e-07,
"loss": 0.0055,
"reward": 1.633968448638916,
"reward_std": 0.23277063071727752,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.5713592231273651,
"step": 1270,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.15694143967702984,
"clip_ratio/high_mean": 0.026766782545018943,
"clip_ratio/low_mean": 0.010570818380801938,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03733760174363852,
"completion_length": 76.13500213623047,
"epoch": 0.2458937662088176,
"grad_norm": 2.8748385906219482,
"kl": 3.007472372055054,
"learning_rate": 8.725853619918444e-07,
"loss": 0.0249,
"reward": 1.4643328666687012,
"reward_std": 0.2899716466665268,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.48716638684272767,
"step": 1280,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.07734788609668612,
"clip_ratio/high_mean": 0.013521577988285571,
"clip_ratio/low_mean": 0.002974278874171432,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01649585694540292,
"completion_length": 77.6050033569336,
"epoch": 0.247814811257324,
"grad_norm": 4.51137638092041,
"kl": 0.6521440967917442,
"learning_rate": 8.706854277328507e-07,
"loss": -0.0065,
"reward": 1.663088607788086,
"reward_std": 0.29463320076465604,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.5843567848205566,
"step": 1290,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.12926983460783958,
"clip_ratio/high_mean": 0.016393666993826626,
"clip_ratio/low_mean": 0.024948839796707034,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04134250609204173,
"completion_length": 74.63750305175782,
"epoch": 0.24973585630583037,
"grad_norm": 7.019649982452393,
"kl": 0.6837658904492855,
"learning_rate": 8.687738064729902e-07,
"loss": -0.0022,
"reward": 1.6927862167358398,
"reward_std": 0.14656674191355706,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5973306179046631,
"step": 1300,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.15073961750604212,
"clip_ratio/high_mean": 0.024888798157917336,
"clip_ratio/low_mean": 0.004707413475262001,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02959621128393337,
"completion_length": 79.17500152587891,
"epoch": 0.25165690135433677,
"grad_norm": 3.9428677558898926,
"kl": 1.0088127315044404,
"learning_rate": 8.668505678790368e-07,
"loss": 0.7445,
"reward": 1.5962260961532593,
"reward_std": 0.22741070687770842,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.5528005361557007,
"step": 1310,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.0862931152805686,
"clip_ratio/high_mean": 0.016994312894530593,
"clip_ratio/low_mean": 0.0031913593309582213,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020185671979561448,
"completion_length": 79.30500183105468,
"epoch": 0.25357794640284315,
"grad_norm": 2.810743808746338,
"kl": 2.0237294919788837,
"learning_rate": 8.649157820411451e-07,
"loss": -0.0028,
"reward": 1.6300202369689942,
"reward_std": 0.2859074264764786,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5712601006031036,
"step": 1320,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.14902311654295772,
"clip_ratio/high_mean": 0.02855427504691761,
"clip_ratio/low_mean": 0.012185945303644984,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.040740220062434676,
"completion_length": 70.88000030517578,
"epoch": 0.2554989914513495,
"grad_norm": 4.68557071685791,
"kl": 1.2288852274417876,
"learning_rate": 8.629695194702949e-07,
"loss": -0.0057,
"reward": 1.4114359855651855,
"reward_std": 0.2626632884144783,
"rewards/code_format_reward": 0.9625,
"rewards/code_reward": 0.46509301066398623,
"step": 1330,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.11323303133249282,
"clip_ratio/high_mean": 0.016216285666450857,
"clip_ratio/low_mean": 0.0045135776337701826,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02072986289858818,
"completion_length": 71.99250030517578,
"epoch": 0.2574200364998559,
"grad_norm": 43.944698333740234,
"kl": 1.446278090775013,
"learning_rate": 8.610118510957221e-07,
"loss": 0.0112,
"reward": 1.5807109117507934,
"reward_std": 0.23466840982437134,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5469179153442383,
"step": 1340,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.20504833161830902,
"clip_ratio/high_mean": 0.029384778672829272,
"clip_ratio/low_mean": 0.006734570109983906,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03611934892833233,
"completion_length": 69.60750198364258,
"epoch": 0.25934108154836233,
"grad_norm": 3.4515652656555176,
"kl": 1.288391387462616,
"learning_rate": 8.59042848262334e-07,
"loss": 0.0022,
"reward": 1.7648874998092652,
"reward_std": 0.29008678793907167,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.6340062260627747,
"step": 1350,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.18540791552513838,
"clip_ratio/high_mean": 0.030647353292442857,
"clip_ratio/low_mean": 0.0048290589373209515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03547641267068684,
"completion_length": 73.8150016784668,
"epoch": 0.2612621265968687,
"grad_norm": 24.974191665649414,
"kl": 1.361786951869726,
"learning_rate": 8.570625827281077e-07,
"loss": -0.0015,
"reward": 1.6352276086807251,
"reward_std": 0.20483867302536965,
"rewards/code_format_reward": 0.9712500095367431,
"rewards/code_reward": 0.5748012781143188,
"step": 1360,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.25138766765594484,
"clip_ratio/high_mean": 0.043486443860456345,
"clip_ratio/low_mean": 0.006613140180706978,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05009958455339074,
"completion_length": 85.41999969482421,
"epoch": 0.2631831716453751,
"grad_norm": 0.2826422452926636,
"kl": 1.1484392315149308,
"learning_rate": 8.550711266614774e-07,
"loss": -0.0015,
"reward": 1.5049166679382324,
"reward_std": 0.17118329852819442,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.5090208292007447,
"step": 1370,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.10418513733893633,
"clip_ratio/high_mean": 0.017387184244580568,
"clip_ratio/low_mean": 0.006483422458404675,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023870606115087865,
"completion_length": 78.00750274658203,
"epoch": 0.26510421669388146,
"grad_norm": 0.43826720118522644,
"kl": 0.5077251173555851,
"learning_rate": 8.530685526387023e-07,
"loss": 0.0071,
"reward": 1.5417476654052735,
"reward_std": 0.2806018695235252,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5271238267421723,
"step": 1380,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.12633447310654447,
"clip_ratio/high_mean": 0.01944113611098146,
"clip_ratio/low_mean": 0.02114583211950958,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04058696813735878,
"completion_length": 69.89499969482422,
"epoch": 0.26702526174238783,
"grad_norm": 3.222648859024048,
"kl": 0.8532382689416409,
"learning_rate": 8.510549336412227e-07,
"loss": 0.2832,
"reward": 1.4325429320335388,
"reward_std": 0.23379142954945564,
"rewards/code_format_reward": 0.95625,
"rewards/code_reward": 0.47720896899700166,
"step": 1390,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.10978957340121269,
"clip_ratio/high_mean": 0.015348212420940399,
"clip_ratio/low_mean": 0.00886362442979589,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024211836606264116,
"completion_length": 74.32000198364258,
"epoch": 0.26894630679089426,
"grad_norm": 511.98333740234375,
"kl": 6.762348529696465,
"learning_rate": 8.490303430529996e-07,
"loss": 0.0097,
"reward": 1.5433219909667968,
"reward_std": 0.3002948135137558,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.5269734919071197,
"step": 1400,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.021737607452087103,
"clip_ratio/high_mean": 0.004128801400656812,
"clip_ratio/low_mean": 0.008135353482794016,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012264154804870486,
"completion_length": 70.88250122070312,
"epoch": 0.27086735183940064,
"grad_norm": 4.558300018310547,
"kl": 1.0645984336733818,
"learning_rate": 8.469948546578406e-07,
"loss": -0.002,
"reward": 1.711915636062622,
"reward_std": 0.23479849100112915,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6090827941894531,
"step": 1410,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.31115832179784775,
"clip_ratio/high_mean": 0.04542893636971712,
"clip_ratio/low_mean": 0.004167796808178537,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04959673266857863,
"completion_length": 82.51750335693359,
"epoch": 0.272788396887907,
"grad_norm": 26.85635757446289,
"kl": 0.6633755072951317,
"learning_rate": 8.449485426367113e-07,
"loss": -0.0044,
"reward": 1.8086278200149537,
"reward_std": 0.25109012275934217,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6577514052391052,
"step": 1420,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.21433292645961047,
"clip_ratio/high_mean": 0.027504962938837706,
"clip_ratio/low_mean": 0.007746222103014589,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03525118476245552,
"completion_length": 69.12750015258788,
"epoch": 0.2747094419364134,
"grad_norm": 39.272727966308594,
"kl": 2.1152508199214934,
"learning_rate": 8.428914815650318e-07,
"loss": 56.6465,
"reward": 1.5950207233428955,
"reward_std": 0.25626782774925233,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.5518853664398193,
"step": 1430,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.14325670124962925,
"clip_ratio/high_mean": 0.02268084152601659,
"clip_ratio/low_mean": 0.006528474338119849,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029209316370543092,
"completion_length": 67.30000076293945,
"epoch": 0.2766304869849198,
"grad_norm": 4.287910461425781,
"kl": 1.2686308354139328,
"learning_rate": 8.408237464099576e-07,
"loss": 9.8201,
"reward": 1.6364605188369752,
"reward_std": 0.22813104093074799,
"rewards/code_format_reward": 0.9749999880790711,
"rewards/code_reward": 0.5744802415370941,
"step": 1440,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.18851536950096487,
"clip_ratio/high_mean": 0.024719347018981354,
"clip_ratio/low_mean": 0.013444452191470191,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.038163799053290856,
"completion_length": 82.13250274658203,
"epoch": 0.2785515320334262,
"grad_norm": 0.4786536991596222,
"kl": 8.468844538927078,
"learning_rate": 8.387454125276494e-07,
"loss": 0.0456,
"reward": 1.7758944988250733,
"reward_std": 0.1511917643249035,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6410722196102142,
"step": 1450,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.15984937213361264,
"clip_ratio/high_mean": 0.025054804515093565,
"clip_ratio/low_mean": 0.01257994698244147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03763475017622113,
"completion_length": 79.66000213623047,
"epoch": 0.2804725770819326,
"grad_norm": 3.223284959793091,
"kl": 1.7015444114804268,
"learning_rate": 8.366565556605258e-07,
"loss": 0.0276,
"reward": 1.5976650953292846,
"reward_std": 0.341750779747963,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.5566450238227845,
"step": 1460,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.27182711616624144,
"clip_ratio/high_mean": 0.040798351392732,
"clip_ratio/low_mean": 0.002200227712455671,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04299857785226777,
"completion_length": 79.22250213623047,
"epoch": 0.28239362213043895,
"grad_norm": 1.4845157861709595,
"kl": 1.693036738038063,
"learning_rate": 8.345572519345031e-07,
"loss": -0.0017,
"reward": 1.7161717653274535,
"reward_std": 0.2422049015760422,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.612460857629776,
"step": 1470,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.17126517184078693,
"clip_ratio/high_mean": 0.025960111571475864,
"clip_ratio/low_mean": 0.00444280517695006,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030402917158789934,
"completion_length": 83.31750183105468,
"epoch": 0.2843146671789453,
"grad_norm": 5.96829080581665,
"kl": 0.574289733916521,
"learning_rate": 8.324475778562209e-07,
"loss": -0.0061,
"reward": 1.7776363611221313,
"reward_std": 0.2358689785003662,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6428806602954864,
"step": 1480,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.2015096817165613,
"clip_ratio/high_mean": 0.03324723746627569,
"clip_ratio/low_mean": 0.00480144299363019,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.038048680778592824,
"completion_length": 73.28000106811524,
"epoch": 0.28623571222745176,
"grad_norm": 6.496949672698975,
"kl": 0.6653359919786453,
"learning_rate": 8.30327610310254e-07,
"loss": 0.0021,
"reward": 1.6191941976547242,
"reward_std": 0.31718442738056185,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.5639720797538758,
"step": 1490,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.15795932533219456,
"clip_ratio/high_mean": 0.02212390162749216,
"clip_ratio/low_mean": 0.00480329486890696,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026927196700125933,
"completion_length": 73.78000106811524,
"epoch": 0.28815675727595813,
"grad_norm": 5.75892972946167,
"kl": 0.46196936070919037,
"learning_rate": 8.281974265563108e-07,
"loss": -0.0045,
"reward": 1.7829506158828736,
"reward_std": 0.17953601479530334,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.642725282907486,
"step": 1500,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.24582542856223882,
"clip_ratio/high_mean": 0.030850262753665446,
"clip_ratio/low_mean": 0.005616182333324104,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03646644388791174,
"completion_length": 77.69500198364258,
"epoch": 0.2900778023244645,
"grad_norm": 326340576.0,
"kl": 0.605505321919918,
"learning_rate": 8.260571042264166e-07,
"loss": 8518.9961,
"reward": 1.7113344192504882,
"reward_std": 0.18693218380212784,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6087921977043151,
"step": 1510,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.22759323129430414,
"clip_ratio/high_mean": 0.03405714362161234,
"clip_ratio/low_mean": 0.0032101303557283247,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03726727362954989,
"completion_length": 75.53250122070312,
"epoch": 0.2919988473729709,
"grad_norm": 2.2893807888031006,
"kl": 0.5214515089988708,
"learning_rate": 8.23906721322086e-07,
"loss": 0.0027,
"reward": 1.6311777591705323,
"reward_std": 0.17696685791015626,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5690263509750366,
"step": 1520,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.06725322343409061,
"clip_ratio/high_mean": 0.010706762981135398,
"clip_ratio/low_mean": 0.0018884234530560206,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012595186498947442,
"completion_length": 78.90999908447266,
"epoch": 0.29391989242147726,
"grad_norm": 2.6211440563201904,
"kl": 0.5930808052420616,
"learning_rate": 8.217463562114786e-07,
"loss": -0.0035,
"reward": 1.7637510299682617,
"reward_std": 0.209340962767601,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.6365630030632019,
"step": 1530,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.06629493543878198,
"clip_ratio/high_mean": 0.012000571249518543,
"clip_ratio/low_mean": 0.010053297760896385,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022053868882358073,
"completion_length": 77.66250152587891,
"epoch": 0.2958409374699837,
"grad_norm": 0.5937472581863403,
"kl": 0.6556157968938351,
"learning_rate": 8.195760876265438e-07,
"loss": 0.0023,
"reward": 1.4144308805465697,
"reward_std": 0.12647379338741302,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.461590439081192,
"step": 1540,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.2611659773625433,
"clip_ratio/high_mean": 0.05069012229796499,
"clip_ratio/low_mean": 0.009917778367525897,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06060790033079684,
"completion_length": 80.60250091552734,
"epoch": 0.29776198251849006,
"grad_norm": 7.297484874725342,
"kl": 2.139972834289074,
"learning_rate": 8.173959946601519e-07,
"loss": 0.0662,
"reward": 1.6416264057159424,
"reward_std": 0.3118141442537308,
"rewards/code_format_reward": 0.9749999880790711,
"rewards/code_reward": 0.5770631790161133,
"step": 1550,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.14441705606877803,
"clip_ratio/high_mean": 0.023728324193507434,
"clip_ratio/low_mean": 0.005098688977886923,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028827012795954943,
"completion_length": 77.17750244140625,
"epoch": 0.29968302756699644,
"grad_norm": 5.614815711975098,
"kl": 0.5137595549225807,
"learning_rate": 8.152061567632108e-07,
"loss": -0.0057,
"reward": 1.5097593545913697,
"reward_std": 0.29559260606765747,
"rewards/code_format_reward": 0.9575000047683716,
"rewards/code_reward": 0.5155046641826629,
"step": 1560,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.03715956890955567,
"clip_ratio/high_mean": 0.006024846772197634,
"clip_ratio/low_mean": 0.009319488028995692,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015344334836117923,
"completion_length": 76.34000091552734,
"epoch": 0.3016040726155028,
"grad_norm": 5.059381008148193,
"kl": 0.8711868159472942,
"learning_rate": 8.130066537417707e-07,
"loss": -0.0003,
"reward": 1.4149085521697997,
"reward_std": 0.19155050422996284,
"rewards/code_format_reward": 0.9749999880790711,
"rewards/code_reward": 0.463704252243042,
"step": 1570,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.0935845285654068,
"clip_ratio/high_mean": 0.013573423656634987,
"clip_ratio/low_mean": 0.00990565216197865,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023479075590148567,
"completion_length": 83.97500152587891,
"epoch": 0.30352511766400925,
"grad_norm": 2.025956869125366,
"kl": 0.9980318561196327,
"learning_rate": 8.10797565754116e-07,
"loss": -0.0041,
"reward": 1.5444376945495606,
"reward_std": 0.19510383605957032,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.525031316280365,
"step": 1580,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.11659459788352251,
"clip_ratio/high_mean": 0.016526972700376064,
"clip_ratio/low_mean": 0.0030368489184184,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019563821679912507,
"completion_length": 90.33000335693359,
"epoch": 0.3054461627125156,
"grad_norm": 4.901747703552246,
"kl": 0.6650052145123482,
"learning_rate": 8.085789733078439e-07,
"loss": 0.9063,
"reward": 1.6000897407531738,
"reward_std": 0.20618843138217927,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5556698679924011,
"step": 1590,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.1246914654970169,
"clip_ratio/high_mean": 0.018419789243489505,
"clip_ratio/low_mean": 0.0033823222620412707,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021802110970020293,
"completion_length": 82.78250122070312,
"epoch": 0.307367207761022,
"grad_norm": 16365.4453125,
"kl": 83.84930176734925,
"learning_rate": 8.063509572569303e-07,
"loss": 0.4123,
"reward": 1.8164207458496093,
"reward_std": 0.25260339230298995,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6613353252410888,
"step": 1600,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.18187000900506972,
"clip_ratio/high_mean": 0.026620355295017363,
"clip_ratio/low_mean": 0.011157544914749452,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03777789976447821,
"completion_length": 72.65250244140626,
"epoch": 0.3092882528095284,
"grad_norm": 2.8136842250823975,
"kl": 0.9565572030842304,
"learning_rate": 8.041135987987831e-07,
"loss": 0.0037,
"reward": 1.7599462985992431,
"reward_std": 0.26825075447559354,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6324730753898621,
"step": 1610,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.03404317735694349,
"clip_ratio/high_mean": 0.006068735342705622,
"clip_ratio/low_mean": 0.010824382931605214,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016893118200823665,
"completion_length": 78.07500305175782,
"epoch": 0.31120929785803475,
"grad_norm": 31.179058074951172,
"kl": 0.560398967564106,
"learning_rate": 8.018669794712835e-07,
"loss": -0.0011,
"reward": 1.5130140781402588,
"reward_std": 0.2716240629553795,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5115070700645447,
"step": 1620,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.06218870538286865,
"clip_ratio/high_mean": 0.008549430634593591,
"clip_ratio/low_mean": 0.007052442076383158,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01560187318827957,
"completion_length": 83.56500091552735,
"epoch": 0.3131303429065412,
"grad_norm": 0.6899747252464294,
"kl": 0.7204694971442223,
"learning_rate": 7.996111811498138e-07,
"loss": 0.0031,
"reward": 1.687961721420288,
"reward_std": 0.19512347355484963,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.5958558440208435,
"step": 1630,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.17274489336414262,
"clip_ratio/high_mean": 0.021967002666497138,
"clip_ratio/low_mean": 0.009596503502689303,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03156350784411188,
"completion_length": 80.9175033569336,
"epoch": 0.31505138795504756,
"grad_norm": 2.105334758758545,
"kl": 0.8054538488388061,
"learning_rate": 7.97346286044274e-07,
"loss": -0.0058,
"reward": 1.3176400899887084,
"reward_std": 0.20478213280439378,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.41350752413272857,
"step": 1640,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.16534482885617763,
"clip_ratio/high_mean": 0.02735080250131432,
"clip_ratio/low_mean": 0.0035748321075516286,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030925634037703275,
"completion_length": 74.01250228881835,
"epoch": 0.31697243300355393,
"grad_norm": 184916.921875,
"kl": 28.671802641451357,
"learning_rate": 7.950723766960857e-07,
"loss": 5.579,
"reward": 1.6360910892486573,
"reward_std": 0.2874180316925049,
"rewards/code_format_reward": 0.9687500119209289,
"rewards/code_reward": 0.5758580267429352,
"step": 1650,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.10983106552157551,
"clip_ratio/high_mean": 0.016536441215430388,
"clip_ratio/low_mean": 0.011150279239518567,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027686719762277788,
"completion_length": 84.17750244140625,
"epoch": 0.3188934780520603,
"grad_norm": 219305424.0,
"kl": 106.82060827612877,
"learning_rate": 7.927895359751835e-07,
"loss": 5248.6121,
"reward": 1.5329812049865723,
"reward_std": 0.22349740117788314,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5221156060695649,
"step": 1660,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.13622083119116724,
"clip_ratio/high_mean": 0.01933064509066753,
"clip_ratio/low_mean": 0.005038347843219526,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024368993006646633,
"completion_length": 80.39500274658204,
"epoch": 0.3208145231005667,
"grad_norm": 9.519110679626465,
"kl": 0.7214748501777649,
"learning_rate": 7.904978470769959e-07,
"loss": -0.0025,
"reward": 1.6617871284484864,
"reward_std": 0.27498180270195005,
"rewards/code_format_reward": 0.95625,
"rewards/code_reward": 0.5918310403823852,
"step": 1670,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.09761472065001726,
"clip_ratio/high_mean": 0.01911984165199101,
"clip_ratio/low_mean": 0.010301339952275158,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02942118220962584,
"completion_length": 74.54750213623046,
"epoch": 0.3227355681490731,
"grad_norm": 6.143461227416992,
"kl": 0.7205829441547393,
"learning_rate": 7.881973935194124e-07,
"loss": 0.0015,
"reward": 1.4262179613113404,
"reward_std": 0.26740061640739443,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.4696714758872986,
"step": 1680,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.07396706650033594,
"clip_ratio/high_mean": 0.011737752065528184,
"clip_ratio/low_mean": 0.005250315659213811,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016988067945931107,
"completion_length": 75.27500228881836,
"epoch": 0.3246566131975795,
"grad_norm": 2.337491989135742,
"kl": 68.4789316162467,
"learning_rate": 7.858882591397403e-07,
"loss": 0.3045,
"reward": 1.527750849723816,
"reward_std": 0.26877219378948214,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5163754165172577,
"step": 1690,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.28693441725336016,
"clip_ratio/high_mean": 0.04205623795860447,
"clip_ratio/low_mean": 0.009473194915335626,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.051529432012466715,
"completion_length": 84.14500274658204,
"epoch": 0.32657765824608587,
"grad_norm": 20.964569091796875,
"kl": 0.5620399042963982,
"learning_rate": 7.835705280916488e-07,
"loss": -0.0051,
"reward": 1.615627408027649,
"reward_std": 0.2002291887998581,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.5590636849403381,
"step": 1700,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.2242162274196744,
"clip_ratio/high_mean": 0.036464582500047985,
"clip_ratio/low_mean": 0.010222097241785378,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046686679660342636,
"completion_length": 78.56000061035157,
"epoch": 0.32849870329459224,
"grad_norm": 3.2044875621795654,
"kl": 0.7747909784317016,
"learning_rate": 7.812442848421032e-07,
"loss": -0.0006,
"reward": 1.6169416427612304,
"reward_std": 0.24999960064888,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5612833142280579,
"step": 1710,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.10125078996643425,
"clip_ratio/high_mean": 0.019883562461473048,
"clip_ratio/low_mean": 0.014126901775307487,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.034010464209131896,
"completion_length": 73.05000152587891,
"epoch": 0.3304197483430986,
"grad_norm": 735.9865112304688,
"kl": 2.3181345582008364,
"learning_rate": 7.789096141682851e-07,
"loss": 0.1213,
"reward": 1.371981406211853,
"reward_std": 0.17790164202451705,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.44317818284034727,
"step": 1720,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.15698121464811265,
"clip_ratio/high_mean": 0.026607585436431692,
"clip_ratio/low_mean": 0.004372719774255529,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030980303999967873,
"completion_length": 78.5425033569336,
"epoch": 0.33234079339160505,
"grad_norm": 2.3281009197235107,
"kl": 1.7815167903900146,
"learning_rate": 7.765666011545045e-07,
"loss": 0.4359,
"reward": 1.669968068599701,
"reward_std": 0.18121034651994705,
"rewards/code_format_reward": 0.9737499833106995,
"rewards/code_reward": 0.5915465235710144,
"step": 1730,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.1189429596066475,
"clip_ratio/high_mean": 0.021151045989245176,
"clip_ratio/low_mean": 0.002452358941081911,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023603405337780714,
"completion_length": 69.71000289916992,
"epoch": 0.3342618384401114,
"grad_norm": 1720.8326416015625,
"kl": 0.7967777937650681,
"learning_rate": 7.742153311890971e-07,
"loss": 0.0982,
"reward": 1.5440645456314086,
"reward_std": 0.18595425188541412,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5292197823524475,
"step": 1740,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.08902034647762776,
"clip_ratio/high_mean": 0.012681722827255725,
"clip_ratio/low_mean": 0.00311334275174886,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015795065369457007,
"completion_length": 74.49249954223633,
"epoch": 0.3361828834886178,
"grad_norm": 0.09847641736268997,
"kl": 0.8014414094388485,
"learning_rate": 7.718558899613143e-07,
"loss": 0.0099,
"reward": 1.5567015647888183,
"reward_std": 0.14754890371114016,
"rewards/code_format_reward": 0.9649999976158142,
"rewards/code_reward": 0.5371007978916168,
"step": 1750,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.15779653917998077,
"clip_ratio/high_mean": 0.030520046106539668,
"clip_ratio/low_mean": 0.009007267560809851,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03952731299214065,
"completion_length": 77.64000091552734,
"epoch": 0.3381039285371242,
"grad_norm": 16.5263729095459,
"kl": 0.7359155111014843,
"learning_rate": 7.69488363458199e-07,
"loss": -0.0085,
"reward": 1.477712869644165,
"reward_std": 0.26145162880420686,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.49041891694068906,
"step": 1760,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.17542534926906228,
"clip_ratio/high_mean": 0.025472976046148687,
"clip_ratio/low_mean": 0.005083448148798198,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.030556425044778734,
"completion_length": 78.76000061035157,
"epoch": 0.3400249735856306,
"grad_norm": 2.440377950668335,
"kl": 1.2570879265666008,
"learning_rate": 7.671128379614524e-07,
"loss": -0.0029,
"reward": 1.697490382194519,
"reward_std": 0.21552397906780243,
"rewards/code_format_reward": 0.9887499809265137,
"rewards/code_reward": 0.6015576839447021,
"step": 1770,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.03777246242389083,
"clip_ratio/high_mean": 0.005805602658074349,
"clip_ratio/low_mean": 0.006219673785381019,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012025276734493672,
"completion_length": 78.01500091552734,
"epoch": 0.341946018634137,
"grad_norm": 3.58803129196167,
"kl": 1.3505164757370949,
"learning_rate": 7.647294000442899e-07,
"loss": -0.0008,
"reward": 1.3937680006027222,
"reward_std": 0.1832626909017563,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.44907149076461794,
"step": 1780,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.08561003021895885,
"clip_ratio/high_mean": 0.011109948102966883,
"clip_ratio/low_mean": 0.0035756964149186387,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014685644480050542,
"completion_length": 76.20749969482422,
"epoch": 0.34386706368264336,
"grad_norm": 10.503286361694336,
"kl": 0.552098847925663,
"learning_rate": 7.623381365682855e-07,
"loss": -0.0015,
"reward": 1.6644479036331177,
"reward_std": 0.22849067896604539,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5847239375114441,
"step": 1790,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.06243175007402897,
"clip_ratio/high_mean": 0.009089326043613255,
"clip_ratio/low_mean": 0.005161185140605084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014250511419959366,
"completion_length": 69.70000076293945,
"epoch": 0.34578810873114973,
"grad_norm": 4.685351371765137,
"kl": 0.3103115826845169,
"learning_rate": 7.599391346802063e-07,
"loss": -0.0003,
"reward": 1.8390909910202027,
"reward_std": 0.20207120031118392,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6726704835891724,
"step": 1800,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.046840774989686904,
"clip_ratio/high_mean": 0.007519985581166111,
"clip_ratio/low_mean": 0.004676173024927266,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01219615869631525,
"completion_length": 80.15500183105469,
"epoch": 0.3477091537796561,
"grad_norm": 21886460.0,
"kl": 0.48781016543507577,
"learning_rate": 7.575324818088367e-07,
"loss": 517.7405,
"reward": 1.6558839797973632,
"reward_std": 0.2796541228890419,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5845044732093811,
"step": 1810,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.18512438922189176,
"clip_ratio/high_mean": 0.0357341198658105,
"clip_ratio/low_mean": 0.0033004880184307694,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03903460723813623,
"completion_length": 78.84000091552734,
"epoch": 0.34963019882816254,
"grad_norm": 9.198795318603516,
"kl": 4.244446061551571,
"learning_rate": 7.551182656617924e-07,
"loss": 0.0031,
"reward": 1.5848650455474853,
"reward_std": 0.17606763169169426,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.5458700299263001,
"step": 1820,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.07551750033162534,
"clip_ratio/high_mean": 0.013169253122759983,
"clip_ratio/low_mean": 0.001537335959437769,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014706589409615844,
"completion_length": 82.8800033569336,
"epoch": 0.3515512438766689,
"grad_norm": 0.724766731262207,
"kl": 0.9274087265133858,
"learning_rate": 7.526965742223234e-07,
"loss": 0.0013,
"reward": 1.5606717586517334,
"reward_std": 0.2877893716096878,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5343983888626098,
"step": 1830,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.1388332260772586,
"clip_ratio/high_mean": 0.021653852658346295,
"clip_ratio/low_mean": 0.008576209528837354,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03023006208240986,
"completion_length": 74.26250076293945,
"epoch": 0.3534722889251753,
"grad_norm": 5.426670074462891,
"kl": 0.7045004338026046,
"learning_rate": 7.502674957461079e-07,
"loss": -0.007,
"reward": 1.5688656568527222,
"reward_std": 0.30554552264511586,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5384953856468201,
"step": 1840,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.07899991576559842,
"clip_ratio/high_mean": 0.013301478006178513,
"clip_ratio/low_mean": 0.01124582380289212,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024547301628626884,
"completion_length": 74.45500106811524,
"epoch": 0.35539333397368167,
"grad_norm": 2.5104761123657227,
"kl": 0.6198086604475975,
"learning_rate": 7.478311187580363e-07,
"loss": -0.0071,
"reward": 1.5550098896026612,
"reward_std": 0.21109988391399384,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.52937992811203,
"step": 1850,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.063601403683424,
"clip_ratio/high_mean": 0.010639100335538387,
"clip_ratio/low_mean": 0.00778028266504407,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018419382628053427,
"completion_length": 71.92500152587891,
"epoch": 0.35731437902218804,
"grad_norm": 3.805928945541382,
"kl": 1.6179959252476692,
"learning_rate": 7.453875320489842e-07,
"loss": 0.3,
"reward": 1.4410953760147094,
"reward_std": 0.19501519501209258,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.47523519992828367,
"step": 1860,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.10860318611375988,
"clip_ratio/high_mean": 0.018746975070098416,
"clip_ratio/low_mean": 0.008747255423804745,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02749423016794026,
"completion_length": 69.9375015258789,
"epoch": 0.3592354240706945,
"grad_norm": 2.388782501220703,
"kl": 0.5952992506325245,
"learning_rate": 7.429368246725772e-07,
"loss": 0.0443,
"reward": 1.6972971916198731,
"reward_std": 0.17401356399059295,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6008361041545868,
"step": 1870,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.08630747124552726,
"clip_ratio/high_mean": 0.012746809562668205,
"clip_ratio/low_mean": 0.010304910433478653,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02305172006599605,
"completion_length": 70.83000183105469,
"epoch": 0.36115646911920085,
"grad_norm": 16.255178451538086,
"kl": 0.8730347856879235,
"learning_rate": 7.40479085941945e-07,
"loss": 0.0036,
"reward": 1.467816424369812,
"reward_std": 0.17535984218120576,
"rewards/code_format_reward": 0.9925000071525574,
"rewards/code_reward": 0.48578319549560545,
"step": 1880,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.26251301234588026,
"clip_ratio/high_mean": 0.03827818045392632,
"clip_ratio/low_mean": 0.005526873719645664,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04380505495937541,
"completion_length": 64.74750213623047,
"epoch": 0.3630775141677072,
"grad_norm": 4.061140060424805,
"kl": 0.8530658036470413,
"learning_rate": 7.380144054264669e-07,
"loss": 0.0197,
"reward": 1.498781108856201,
"reward_std": 0.17463037073612214,
"rewards/code_format_reward": 0.9600000023841858,
"rewards/code_reward": 0.509390527009964,
"step": 1890,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.24850870491936802,
"clip_ratio/high_mean": 0.04144583061570302,
"clip_ratio/low_mean": 0.00702623330289498,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04847206423291937,
"completion_length": 75.8375015258789,
"epoch": 0.3649985592162136,
"grad_norm": 3.4472062587738037,
"kl": 1.6324397973716258,
"learning_rate": 7.355428729485071e-07,
"loss": -0.001,
"reward": 1.6619214057922362,
"reward_std": 0.18103656098246573,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.5840856909751893,
"step": 1900,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.09173821061849594,
"clip_ratio/high_mean": 0.014921509474515916,
"clip_ratio/low_mean": 0.002157307107700035,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017078816797584294,
"completion_length": 62.185000610351565,
"epoch": 0.36691960426472003,
"grad_norm": 2.0225422382354736,
"kl": 184.02759787738324,
"learning_rate": 7.330645785801417e-07,
"loss": 2.9496,
"reward": 1.7410502433776855,
"reward_std": 0.10668236091732979,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6217751204967499,
"step": 1910,
"zero_std_ratio": 0.75
},
{
"clip_ratio/high_max": 0.16933906488120556,
"clip_ratio/high_mean": 0.02619449864141643,
"clip_ratio/low_mean": 0.014137339405715465,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04033183753490448,
"completion_length": 79.30000152587891,
"epoch": 0.3688406493132264,
"grad_norm": 2.6208443641662598,
"kl": 1.235317513346672,
"learning_rate": 7.305796126398758e-07,
"loss": -0.0012,
"reward": 1.5036948204040528,
"reward_std": 0.20645264089107512,
"rewards/code_format_reward": 0.9762499928474426,
"rewards/code_reward": 0.5077848553657531,
"step": 1920,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.2661599090555683,
"clip_ratio/high_mean": 0.03600101897318382,
"clip_ratio/low_mean": 0.009155643907433841,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.045156662538647654,
"completion_length": 78.10000152587891,
"epoch": 0.3707616943617328,
"grad_norm": 8.953734397888184,
"kl": 0.6204134523868561,
"learning_rate": 7.280880656893518e-07,
"loss": 0.0025,
"reward": 1.4915935516357421,
"reward_std": 0.2376121073961258,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.501109266281128,
"step": 1930,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.15203024838119744,
"clip_ratio/high_mean": 0.023713350854814054,
"clip_ratio/low_mean": 0.004282052081543952,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02799540273845196,
"completion_length": 74.42500076293945,
"epoch": 0.37268273941023916,
"grad_norm": 11.845942497253418,
"kl": 0.5031724810600281,
"learning_rate": 7.255900285300496e-07,
"loss": 0.5255,
"reward": 1.6400779724121093,
"reward_std": 0.22267285138368606,
"rewards/code_format_reward": 0.9649999856948852,
"rewards/code_reward": 0.5787889719009399,
"step": 1940,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.09135808227583767,
"clip_ratio/high_mean": 0.012801296508405358,
"clip_ratio/low_mean": 0.01690869364247192,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02970999013632536,
"completion_length": 69.52000198364257,
"epoch": 0.37460378445874554,
"grad_norm": 6.7441229820251465,
"kl": 1.2024895504117012,
"learning_rate": 7.230855921999769e-07,
"loss": 44.3651,
"reward": 1.6912511348724366,
"reward_std": 0.17418113350868225,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5981255412101746,
"step": 1950,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.07453169417567551,
"clip_ratio/high_mean": 0.009913802641676739,
"clip_ratio/low_mean": 0.003736039294744842,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013649841863662004,
"completion_length": 74.01250228881835,
"epoch": 0.37652482950725197,
"grad_norm": 4.616723537445068,
"kl": 0.6156632959842682,
"learning_rate": 7.205748479703515e-07,
"loss": -0.0005,
"reward": 1.846400761604309,
"reward_std": 0.17167636156082153,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6757004141807557,
"step": 1960,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.09189570704475045,
"clip_ratio/high_mean": 0.013587052945513278,
"clip_ratio/low_mean": 0.004667519498616457,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0182545724324882,
"completion_length": 64.46750030517578,
"epoch": 0.37844587455575834,
"grad_norm": 0.17748567461967468,
"kl": 0.4286219261586666,
"learning_rate": 7.180578873422757e-07,
"loss": -0.0046,
"reward": 1.612094521522522,
"reward_std": 0.10822201184928418,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.5576097548007966,
"step": 1970,
"zero_std_ratio": 0.725
},
{
"clip_ratio/high_max": 0.2088342323899269,
"clip_ratio/high_mean": 0.028434151923283933,
"clip_ratio/low_mean": 0.005974846053868532,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.034408997558057305,
"completion_length": 69.26750106811524,
"epoch": 0.3803669196042647,
"grad_norm": 6.238914966583252,
"kl": 0.7256933867931366,
"learning_rate": 7.155348020434001e-07,
"loss": -0.0046,
"reward": 1.469704508781433,
"reward_std": 0.24035734832286834,
"rewards/code_format_reward": 0.9799999833106995,
"rewards/code_reward": 0.4898522675037384,
"step": 1980,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.05789460870437324,
"clip_ratio/high_mean": 0.007717460609273985,
"clip_ratio/low_mean": 0.003460834617726505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011178295104764402,
"completion_length": 70.19000244140625,
"epoch": 0.3822879646527711,
"grad_norm": 8.066108703613281,
"kl": 1.1788517452776432,
"learning_rate": 7.130056840245824e-07,
"loss": -0.0005,
"reward": 1.5026792764663697,
"reward_std": 0.2312860034406185,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5022771418094635,
"step": 1990,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.07602796133141965,
"clip_ratio/high_mean": 0.012856367122731171,
"clip_ratio/low_mean": 0.0035519548939191735,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016408322553616017,
"completion_length": 66.4625015258789,
"epoch": 0.38420900970127747,
"grad_norm": 3.559206962585449,
"kl": 1.225260878354311,
"learning_rate": 7.104706254565358e-07,
"loss": -0.003,
"reward": 1.742388916015625,
"reward_std": 0.12480423972010612,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.623069453239441,
"step": 2000,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.11271043051965535,
"clip_ratio/high_mean": 0.017727556044701488,
"clip_ratio/low_mean": 0.005613272835034877,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02334082857705653,
"completion_length": 77.1650001525879,
"epoch": 0.3861300547497839,
"grad_norm": 3.4077274799346924,
"kl": 0.8489379599690438,
"learning_rate": 7.07929718726469e-07,
"loss": 0.0403,
"reward": 1.5602745056152343,
"reward_std": 0.2609230324625969,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.5338872492313385,
"step": 2010,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.2993799396790564,
"clip_ratio/high_mean": 0.041865267558023334,
"clip_ratio/low_mean": 0.006948894041124731,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04881416228599846,
"completion_length": 74.0150016784668,
"epoch": 0.3880510997982903,
"grad_norm": 3.2043685913085938,
"kl": 6.086115422844887,
"learning_rate": 7.053830564347206e-07,
"loss": 2.2989,
"reward": 1.5310536623001099,
"reward_std": 0.19302123934030532,
"rewards/code_format_reward": 0.9837500095367432,
"rewards/code_reward": 0.5195893287658692,
"step": 2020,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.06591402762569487,
"clip_ratio/high_mean": 0.009311116795288399,
"clip_ratio/low_mean": 0.0017412514251191169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01105236830189824,
"completion_length": 73.44250106811523,
"epoch": 0.38997214484679665,
"grad_norm": 2.137256622314453,
"kl": 3.9139866441488267,
"learning_rate": 7.028307313913838e-07,
"loss": 0.0061,
"reward": 1.8796703815460205,
"reward_std": 0.12868851274251938,
"rewards/code_format_reward": 0.9974999904632569,
"rewards/code_reward": 0.6904601573944091,
"step": 2030,
"zero_std_ratio": 0.775
},
{
"clip_ratio/high_max": 0.24738994101062417,
"clip_ratio/high_mean": 0.03705689987400547,
"clip_ratio/low_mean": 0.007423648721305654,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04448054819367826,
"completion_length": 67.09500198364258,
"epoch": 0.39189318989530303,
"grad_norm": 5.504507541656494,
"kl": 1.4878595262765884,
"learning_rate": 7.002728366129242e-07,
"loss": 0.0166,
"reward": 1.8640715599060058,
"reward_std": 0.22610510736703873,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.6870357990264893,
"step": 2040,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.09283696161583066,
"clip_ratio/high_mean": 0.014592013147193938,
"clip_ratio/low_mean": 0.0040809189551509915,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01867293259128928,
"completion_length": 72.88500137329102,
"epoch": 0.3938142349438094,
"grad_norm": 1.877032995223999,
"kl": 2.3534633785486223,
"learning_rate": 6.977094653187891e-07,
"loss": 0.3364,
"reward": 1.5182712078094482,
"reward_std": 0.19934598058462144,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5163230776786805,
"step": 2050,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.047755516692996026,
"clip_ratio/high_mean": 0.007312651420943439,
"clip_ratio/low_mean": 0.0007527987050707452,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008065450168214739,
"completion_length": 67.76500091552734,
"epoch": 0.39573527999231584,
"grad_norm": 1.7954281568527222,
"kl": 2.4329017847776413,
"learning_rate": 6.95140710928012e-07,
"loss": 206.5648,
"reward": 1.3761554956436157,
"reward_std": 0.21033956706523896,
"rewards/code_format_reward": 0.9762499928474426,
"rewards/code_reward": 0.44401525855064394,
"step": 2060,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.07075442476198077,
"clip_ratio/high_mean": 0.009443573304452002,
"clip_ratio/low_mean": 0.003901358728762716,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013344932324253022,
"completion_length": 68.6150016784668,
"epoch": 0.3976563250408222,
"grad_norm": 1.3921815156936646,
"kl": 0.6283935949206352,
"learning_rate": 6.925666670558062e-07,
"loss": 1.5274,
"reward": 1.4756604433059692,
"reward_std": 0.2542987480759621,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.49189271330833434,
"step": 2070,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.09334485791623592,
"clip_ratio/high_mean": 0.015712386509403587,
"clip_ratio/low_mean": 0.005205962993204594,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02091834945604205,
"completion_length": 75.06750183105468,
"epoch": 0.3995773700893286,
"grad_norm": 1.3997697830200195,
"kl": 0.5330163806676864,
"learning_rate": 6.899874275101538e-07,
"loss": -0.0031,
"reward": 1.7522424459457397,
"reward_std": 0.1803124487400055,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.6286212205886841,
"step": 2080,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.08916364926844836,
"clip_ratio/high_mean": 0.014017748599871992,
"clip_ratio/low_mean": 0.003948131998186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01796588017605245,
"completion_length": 78.19750213623047,
"epoch": 0.40149841513783496,
"grad_norm": 2296.336669921875,
"kl": 1.0256180852651595,
"learning_rate": 6.874030862883879e-07,
"loss": 0.0318,
"reward": 1.2450440883636475,
"reward_std": 0.22890471369028093,
"rewards/code_format_reward": 0.9775000095367432,
"rewards/code_reward": 0.3781470343470573,
"step": 2090,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.22327043637633323,
"clip_ratio/high_mean": 0.04789549903944135,
"clip_ratio/low_mean": 0.00559167112223804,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.053487171232700345,
"completion_length": 70.61250152587891,
"epoch": 0.4034194601863414,
"grad_norm": 3.2615253925323486,
"kl": 8.218332803249359,
"learning_rate": 6.848137375737652e-07,
"loss": 0.0058,
"reward": 1.6430699110031128,
"reward_std": 0.21420457661151887,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.5793474376201629,
"step": 2100,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.1533806946128607,
"clip_ratio/high_mean": 0.02256658235564828,
"clip_ratio/low_mean": 0.002787484592408873,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025354066491127016,
"completion_length": 74.33999938964844,
"epoch": 0.40534050523484777,
"grad_norm": 4.315516471862793,
"kl": 1.0426696628332137,
"learning_rate": 6.822194757320354e-07,
"loss": 0.0019,
"reward": 1.6090970516204834,
"reward_std": 0.1758709292858839,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.5561110019683838,
"step": 2110,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.1336930485442281,
"clip_ratio/high_mean": 0.021989132883027195,
"clip_ratio/low_mean": 0.0070218192064203325,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0290109527297318,
"completion_length": 73.0250015258789,
"epoch": 0.40726155028335415,
"grad_norm": 18.143117904663086,
"kl": 0.4288759011775255,
"learning_rate": 6.796203953080007e-07,
"loss": 0.0005,
"reward": 1.72017080783844,
"reward_std": 0.22243313789367675,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6144603788852692,
"step": 2120,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.08061643750406802,
"clip_ratio/high_mean": 0.011467291257577016,
"clip_ratio/low_mean": 0.011395246715983376,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022862538695335388,
"completion_length": 68.66250152587891,
"epoch": 0.4091825953318605,
"grad_norm": 1.0005404949188232,
"kl": 0.47304695919156076,
"learning_rate": 6.770165910220709e-07,
"loss": 0.0006,
"reward": 1.4831626653671264,
"reward_std": 0.1916220799088478,
"rewards/code_format_reward": 0.9837499856948853,
"rewards/code_reward": 0.4956438183784485,
"step": 2130,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.06393485199660062,
"clip_ratio/high_mean": 0.011905963439494372,
"clip_ratio/low_mean": 0.0023792986408807336,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01428526220843196,
"completion_length": 74.32250137329102,
"epoch": 0.4111036403803669,
"grad_norm": 2.491830825805664,
"kl": 2.213325946778059,
"learning_rate": 6.744081577668115e-07,
"loss": 0.1532,
"reward": 1.7680244207382203,
"reward_std": 0.18317916095256806,
"rewards/code_format_reward": 0.9687499880790711,
"rewards/code_reward": 0.6418246865272522,
"step": 2140,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.03965856842696667,
"clip_ratio/high_mean": 0.00730013819411397,
"clip_ratio/low_mean": 0.0031650666729547083,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010465204739011824,
"completion_length": 73.1050018310547,
"epoch": 0.41302468542887333,
"grad_norm": 0.353427916765213,
"kl": 0.2898652456700802,
"learning_rate": 6.717951906034856e-07,
"loss": -0.0015,
"reward": 1.6113624095916748,
"reward_std": 0.09930019937455654,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5591186702251434,
"step": 2150,
"zero_std_ratio": 0.725
},
{
"clip_ratio/high_max": 0.03382167350500822,
"clip_ratio/high_mean": 0.005409902473911643,
"clip_ratio/low_mean": 0.0024156818573828785,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007825584337115287,
"completion_length": 68.12750091552735,
"epoch": 0.4149457304773797,
"grad_norm": 3.9950575828552246,
"kl": 0.789361334592104,
"learning_rate": 6.691777847585883e-07,
"loss": 0.048,
"reward": 1.5698497295379639,
"reward_std": 0.1552659712731838,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.5417998552322387,
"step": 2160,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.026565033989027143,
"clip_ratio/high_mean": 0.004212364956038073,
"clip_ratio/low_mean": 0.0013839059392921627,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005596270889509469,
"completion_length": 70.80999984741212,
"epoch": 0.4168667755258861,
"grad_norm": 1.3910998106002808,
"kl": 1.4257395297288895,
"learning_rate": 6.665560356203784e-07,
"loss": 0.8731,
"reward": 1.4512264728546143,
"reward_std": 0.14117379933595658,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.47748821377754214,
"step": 2170,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.09546168451197445,
"clip_ratio/high_mean": 0.01459201174438931,
"clip_ratio/low_mean": 0.006060798710677773,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020652810629690065,
"completion_length": 67.89000091552734,
"epoch": 0.41878782057439246,
"grad_norm": 0.6732813715934753,
"kl": 1.1321026906371117,
"learning_rate": 6.639300387353999e-07,
"loss": -0.0002,
"reward": 1.3501636981964111,
"reward_std": 0.21670444533228875,
"rewards/code_format_reward": 0.9924999833106994,
"rewards/code_reward": 0.42695685029029845,
"step": 2180,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.11407874876167626,
"clip_ratio/high_mean": 0.01725804756570142,
"clip_ratio/low_mean": 0.0015681478500482627,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018826195126166567,
"completion_length": 68.7525016784668,
"epoch": 0.42070886562289883,
"grad_norm": 1.5759879350662231,
"kl": 0.4211964398622513,
"learning_rate": 6.612998898050014e-07,
"loss": -0.0021,
"reward": 1.7485667228698731,
"reward_std": 0.16526954025030136,
"rewards/code_format_reward": 0.9612500071525574,
"rewards/code_reward": 0.6339708626270294,
"step": 2190,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.10751554854214192,
"clip_ratio/high_mean": 0.013745604571886361,
"clip_ratio/low_mean": 0.010064921525190585,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023810526612214743,
"completion_length": 62.88500137329102,
"epoch": 0.42262991067140526,
"grad_norm": 2.4066975116729736,
"kl": 0.7549011036753654,
"learning_rate": 6.586656846818477e-07,
"loss": 0.2999,
"reward": 1.6932018756866456,
"reward_std": 0.1608109436929226,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5991009473800659,
"step": 2200,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.019488381547853352,
"clip_ratio/high_mean": 0.003436583065195009,
"clip_ratio/low_mean": 0.002801175639615394,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062377589056268334,
"completion_length": 72.55250244140625,
"epoch": 0.42455095571991164,
"grad_norm": 2.0696611404418945,
"kl": 5.306586292386055,
"learning_rate": 6.56027519366427e-07,
"loss": 0.011,
"reward": 1.611876368522644,
"reward_std": 0.1603232156485319,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.5596881568431854,
"step": 2210,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.05246827639639377,
"clip_ratio/high_mean": 0.00732308179140091,
"clip_ratio/low_mean": 0.0034836977836675944,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010806779406266287,
"completion_length": 64.31750183105468,
"epoch": 0.426472000768418,
"grad_norm": 0.12577353417873383,
"kl": 0.5850224502384662,
"learning_rate": 6.533854900035516e-07,
"loss": -0.0015,
"reward": 1.7735862731933594,
"reward_std": 0.13040905613452197,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6399181246757507,
"step": 2220,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.24315445288084447,
"clip_ratio/high_mean": 0.031706276966724546,
"clip_ratio/low_mean": 0.011593326600268484,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04329960328759626,
"completion_length": 72.80000152587891,
"epoch": 0.4283930458169244,
"grad_norm": 4.765016078948975,
"kl": 1.5887107208371163,
"learning_rate": 6.507396928788548e-07,
"loss": 0.0023,
"reward": 1.6477301597595215,
"reward_std": 0.12887158915400504,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5801151037216187,
"step": 2230,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.044854282308369874,
"clip_ratio/high_mean": 0.007485381804872304,
"clip_ratio/low_mean": 0.0028356918206554837,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01032107327482663,
"completion_length": 66.63000183105468,
"epoch": 0.4303140908654308,
"grad_norm": 1.5923104286193848,
"kl": 0.9431760296225548,
"learning_rate": 6.480902244152813e-07,
"loss": -0.0021,
"reward": 1.4723083972930908,
"reward_std": 0.13776133116334677,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.48865418434143065,
"step": 2240,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.08558401605114341,
"clip_ratio/high_mean": 0.01418596402509138,
"clip_ratio/low_mean": 0.005716345021210145,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019902308681048454,
"completion_length": 67.75250015258788,
"epoch": 0.4322351359139372,
"grad_norm": 4.213563442230225,
"kl": 0.7182839468121529,
"learning_rate": 6.454371811695732e-07,
"loss": -0.0032,
"reward": 1.5263491868972778,
"reward_std": 0.215225650370121,
"rewards/code_format_reward": 0.975000011920929,
"rewards/code_reward": 0.51942458152771,
"step": 2250,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.17924602022394537,
"clip_ratio/high_mean": 0.02314122476382181,
"clip_ratio/low_mean": 0.006780697987414897,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029921922995708884,
"completion_length": 67.31500091552735,
"epoch": 0.43415618096244357,
"grad_norm": 2.018653392791748,
"kl": 0.644180704653263,
"learning_rate": 6.427806598287522e-07,
"loss": -0.0031,
"reward": 1.8284268617630004,
"reward_std": 0.1590463936328888,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.6657759308815002,
"step": 2260,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.26562999933958054,
"clip_ratio/high_mean": 0.04080731603316963,
"clip_ratio/low_mean": 0.002605196795775555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.043412512401118875,
"completion_length": 64.91250076293946,
"epoch": 0.43607722601094995,
"grad_norm": 2.8014633655548096,
"kl": 1.4193657219409943,
"learning_rate": 6.401207572065942e-07,
"loss": 0.0075,
"reward": 1.6795406818389893,
"reward_std": 0.1340640414506197,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.5913328170776367,
"step": 2270,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.15035496577620505,
"clip_ratio/high_mean": 0.021555275144055485,
"clip_ratio/low_mean": 0.007308500797080342,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028863775311037898,
"completion_length": 83.20750122070312,
"epoch": 0.4379982710594563,
"grad_norm": 5.3116655349731445,
"kl": 1.7165004715323449,
"learning_rate": 6.374575702401019e-07,
"loss": -0.0031,
"reward": 1.694450354576111,
"reward_std": 0.2935485541820526,
"rewards/code_format_reward": 0.9650000095367431,
"rewards/code_reward": 0.6059751749038697,
"step": 2280,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.05178634703624994,
"clip_ratio/high_mean": 0.007199086344917305,
"clip_ratio/low_mean": 0.004959188599605114,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012158275028923526,
"completion_length": 68.35500106811523,
"epoch": 0.43991931610796275,
"grad_norm": 11.67419719696045,
"kl": 0.8460408747196198,
"learning_rate": 6.347911959859725e-07,
"loss": -0.0013,
"reward": 1.6080287456512452,
"reward_std": 0.2270718976855278,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.5615143775939941,
"step": 2290,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.07622572851832957,
"clip_ratio/high_mean": 0.011604995708330535,
"clip_ratio/low_mean": 0.0013341609621420503,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012939156647189521,
"completion_length": 68.30750274658203,
"epoch": 0.44184036115646913,
"grad_norm": 332.7762451171875,
"kl": 0.7540152728557586,
"learning_rate": 6.321217316170599e-07,
"loss": 0.1015,
"reward": 1.4850183725357056,
"reward_std": 0.1393202841281891,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.49469670057296755,
"step": 2300,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.16746083926409483,
"clip_ratio/high_mean": 0.02103413282893598,
"clip_ratio/low_mean": 0.0068577720652683635,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027891904639545828,
"completion_length": 64.55000152587891,
"epoch": 0.4437614062049755,
"grad_norm": 0.36056017875671387,
"kl": 0.4329931303858757,
"learning_rate": 6.294492744188335e-07,
"loss": 0.0002,
"reward": 1.4963040232658387,
"reward_std": 0.07247132882475853,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.502214539051056,
"step": 2310,
"zero_std_ratio": 0.725
},
{
"clip_ratio/high_max": 0.05429213300812989,
"clip_ratio/high_mean": 0.007803994990536012,
"clip_ratio/low_mean": 0.008226435555843636,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01603043078503106,
"completion_length": 69.78750228881836,
"epoch": 0.4456824512534819,
"grad_norm": 0.1676941215991974,
"kl": 0.276796979829669,
"learning_rate": 6.267739217858329e-07,
"loss": -0.0028,
"reward": 1.7269956827163697,
"reward_std": 0.1742506742477417,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6156853199005127,
"step": 2320,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.03961263382807374,
"clip_ratio/high_mean": 0.00831791803939268,
"clip_ratio/low_mean": 0.008615480939624831,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016933399019762874,
"completion_length": 70.70000228881835,
"epoch": 0.44760349630198826,
"grad_norm": 6.724217891693115,
"kl": 0.544577070325613,
"learning_rate": 6.240957712181186e-07,
"loss": -0.0041,
"reward": 1.3949034690856934,
"reward_std": 0.21950918734073638,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.45307670831680297,
"step": 2330,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.2168802363506984,
"clip_ratio/high_mean": 0.03705684195374488,
"clip_ratio/low_mean": 0.0028890643618069587,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03994590537622571,
"completion_length": 74.0625,
"epoch": 0.4495245413504947,
"grad_norm": 3.073554277420044,
"kl": 0.6307030320167542,
"learning_rate": 6.214149203177182e-07,
"loss": -0.0002,
"reward": 1.679004979133606,
"reward_std": 0.1860196329653263,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.5916899800300598,
"step": 2340,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.0932310588657856,
"clip_ratio/high_mean": 0.014429462677799165,
"clip_ratio/low_mean": 0.0065534046734683216,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020982867432758213,
"completion_length": 67.18000183105468,
"epoch": 0.45144558639900106,
"grad_norm": 3595.65576171875,
"kl": 1.140541896224022,
"learning_rate": 6.187314667850697e-07,
"loss": 0.1447,
"reward": 1.4676954984664916,
"reward_std": 0.20568167939782142,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.4875977456569672,
"step": 2350,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.036702672578394414,
"clip_ratio/high_mean": 0.006752843782305717,
"clip_ratio/low_mean": 0.008269340678816661,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015022184286499396,
"completion_length": 80.57000122070312,
"epoch": 0.45336663144750744,
"grad_norm": 2.759171724319458,
"kl": 10.568821829557418,
"learning_rate": 6.160455084154613e-07,
"loss": 1.8532,
"reward": 1.4545687198638917,
"reward_std": 0.23069845288991928,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.4813468337059021,
"step": 2360,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.08144733654335141,
"clip_ratio/high_mean": 0.014263840962667019,
"clip_ratio/low_mean": 0.0019872021744959056,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01625104369595647,
"completion_length": 71.55750045776367,
"epoch": 0.4552876764960138,
"grad_norm": 1.9088038206100464,
"kl": 1.3571255028247833,
"learning_rate": 6.133571430954667e-07,
"loss": 0.0026,
"reward": 1.5344175338745116,
"reward_std": 0.16607576459646226,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.5237712502479553,
"step": 2370,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.10026950519531966,
"clip_ratio/high_mean": 0.01315019663888961,
"clip_ratio/low_mean": 0.00221524270309601,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015365439187735318,
"completion_length": 72.40750122070312,
"epoch": 0.4572087215445202,
"grad_norm": 4.301158428192139,
"kl": 0.6290791854262352,
"learning_rate": 6.106664687993782e-07,
"loss": -0.0032,
"reward": 1.5749263525009156,
"reward_std": 0.16429235637187958,
"rewards/code_format_reward": 0.9724999785423278,
"rewards/code_reward": 0.5443381488323211,
"step": 2380,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.10308512919582427,
"clip_ratio/high_mean": 0.016338009486207738,
"clip_ratio/low_mean": 0.0017032683303114028,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018041277857264504,
"completion_length": 76.43750228881837,
"epoch": 0.4591297665930266,
"grad_norm": 6.198258876800537,
"kl": 408884378.2116049,
"learning_rate": 6.079735835856362e-07,
"loss": 1157747.0,
"reward": 1.5280384778976441,
"reward_std": 0.19424125757068395,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.517456728219986,
"step": 2390,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.24319633510895072,
"clip_ratio/high_mean": 0.037530579004669565,
"clip_ratio/low_mean": 0.004886501970031531,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04241708111949265,
"completion_length": 74.5425018310547,
"epoch": 0.461050811641533,
"grad_norm": 5.885474681854248,
"kl": 1.4351533338427545,
"learning_rate": 6.052785855932548e-07,
"loss": 0.123,
"reward": 1.4949720859527589,
"reward_std": 0.20392217636108398,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5002985119819641,
"step": 2400,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.23103972100652753,
"clip_ratio/high_mean": 0.0305588347138837,
"clip_ratio/low_mean": 0.002339675696566701,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03289851036388427,
"completion_length": 70.01750106811524,
"epoch": 0.4629718566900394,
"grad_norm": 0.8806352615356445,
"kl": 1.6503019407391548,
"learning_rate": 6.025815730382463e-07,
"loss": 0.8832,
"reward": 1.6588483333587647,
"reward_std": 0.19124363958835602,
"rewards/code_format_reward": 0.9725000143051148,
"rewards/code_reward": 0.5862991452217102,
"step": 2410,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.02757756725186482,
"clip_ratio/high_mean": 0.005332520017691422,
"clip_ratio/low_mean": 0.019763218611478804,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025095738274103496,
"completion_length": 71.59250183105469,
"epoch": 0.46489290173854575,
"grad_norm": 1.2440141439437866,
"kl": 2.751401698589325,
"learning_rate": 5.998826442100412e-07,
"loss": 362174.725,
"reward": 1.5159764885902405,
"reward_std": 0.1902527991682291,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5129882216453552,
"step": 2420,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.2530499072512612,
"clip_ratio/high_mean": 0.03376921496528666,
"clip_ratio/low_mean": 0.0062858725665137175,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04005508716509212,
"completion_length": 76.0425018310547,
"epoch": 0.4668139467870522,
"grad_norm": 66.4449234008789,
"kl": 2164149.3861157326,
"learning_rate": 5.971818974679065e-07,
"loss": 2449736.0,
"reward": 1.6650853157043457,
"reward_std": 0.24712301939725875,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.585355132818222,
"step": 2430,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.2950001623481512,
"clip_ratio/high_mean": 0.042542998865246776,
"clip_ratio/low_mean": 0.0068845050991512835,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.049427504313644025,
"completion_length": 75.27000198364257,
"epoch": 0.46873499183555856,
"grad_norm": 2.206911563873291,
"kl": 11.237105096876622,
"learning_rate": 5.944794312373607e-07,
"loss": 0.0298,
"reward": 1.7914002895355225,
"reward_std": 0.22826257348060608,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.649450159072876,
"step": 2440,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.07424517879262567,
"clip_ratio/high_mean": 0.010772422759328038,
"clip_ratio/low_mean": 0.010833968574297614,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02160639046342112,
"completion_length": 71.40500183105469,
"epoch": 0.47065603688406493,
"grad_norm": 76503500980224.0,
"kl": 393.06428125053645,
"learning_rate": 5.917753440065869e-07,
"loss": 909725593.6,
"reward": 1.4975883960723877,
"reward_std": 0.28928079828619957,
"rewards/code_format_reward": 0.9612499833106994,
"rewards/code_reward": 0.5084816813468933,
"step": 2450,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.14294563261792065,
"clip_ratio/high_mean": 0.019953654275741427,
"clip_ratio/low_mean": 0.004493102640844881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024446757195983083,
"completion_length": 79.72250061035156,
"epoch": 0.4725770819325713,
"grad_norm": 0.778223991394043,
"kl": 2.2069298341870307,
"learning_rate": 5.89069734322844e-07,
"loss": -0.0085,
"reward": 1.5203648328781127,
"reward_std": 0.1896197520196438,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5173698782920837,
"step": 2460,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.0473501511849463,
"clip_ratio/high_mean": 0.006591684772865846,
"clip_ratio/low_mean": 0.0004718510695965961,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007063536025816575,
"completion_length": 76.70500183105469,
"epoch": 0.4744981269810777,
"grad_norm": 0.5978448390960693,
"kl": 0.6427325546741486,
"learning_rate": 5.863627007888745e-07,
"loss": 0.0007,
"reward": 1.7259918212890626,
"reward_std": 0.1515914086252451,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.618620878458023,
"step": 2470,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.048674007039517166,
"clip_ratio/high_mean": 0.010258768184576184,
"clip_ratio/low_mean": 0.012727768435433972,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.022986536961980164,
"completion_length": 78.06500244140625,
"epoch": 0.4764191720295841,
"grad_norm": 4.168500900268555,
"kl": 0.5699560895562172,
"learning_rate": 5.836543420593119e-07,
"loss": -0.0011,
"reward": 1.6060274362564086,
"reward_std": 0.2864475339651108,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.557388699054718,
"step": 2480,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.014699839102104307,
"clip_ratio/high_mean": 0.0019118846452329309,
"clip_ratio/low_mean": 0.0005179177765967325,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024298023723531514,
"completion_length": 85.17000122070313,
"epoch": 0.4783402170780905,
"grad_norm": 4.149423599243164,
"kl": 1.3347756370902062,
"learning_rate": 5.809447568370843e-07,
"loss": 0.0102,
"reward": 1.621114158630371,
"reward_std": 0.21484595835208892,
"rewards/code_format_reward": 0.9774999856948853,
"rewards/code_reward": 0.5661820948123932,
"step": 2490,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.029943293944234027,
"clip_ratio/high_mean": 0.006927184848609613,
"clip_ratio/low_mean": 0.0035072380007477475,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010434422721300508,
"completion_length": 83.86250228881836,
"epoch": 0.48026126212659687,
"grad_norm": 5.97049617767334,
"kl": 4.178053397685289,
"learning_rate": 5.782340438698185e-07,
"loss": -0.0063,
"reward": 1.6789068222045898,
"reward_std": 0.25779220163822175,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5903908908367157,
"step": 2500,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.08102847400587052,
"clip_ratio/high_mean": 0.01392527524731122,
"clip_ratio/low_mean": 0.0045509199095249645,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018476195022230968,
"completion_length": 83.11250152587891,
"epoch": 0.48218230717510324,
"grad_norm": 5.283038139343262,
"kl": 1.111867392808199,
"learning_rate": 5.755223019462401e-07,
"loss": 17.941,
"reward": 1.577300524711609,
"reward_std": 0.22725088596343995,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5442752420902253,
"step": 2510,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.06463829884305597,
"clip_ratio/high_mean": 0.008858744835015387,
"clip_ratio/low_mean": 0.0054666692391037944,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01432541401591152,
"completion_length": 85.7625015258789,
"epoch": 0.4841033522236096,
"grad_norm": 8.200135231018066,
"kl": 0.4475974731147289,
"learning_rate": 5.728096298925745e-07,
"loss": -0.0057,
"reward": 1.5549763917922974,
"reward_std": 0.23400793820619584,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5331131994724274,
"step": 2520,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.031378739466890695,
"clip_ratio/high_mean": 0.00507326218066737,
"clip_ratio/low_mean": 0.010504274675622583,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015577536821365357,
"completion_length": 79.96750030517578,
"epoch": 0.48602439727211605,
"grad_norm": 2.6766583919525146,
"kl": 0.4622874528169632,
"learning_rate": 5.700961265689434e-07,
"loss": -0.0011,
"reward": 1.8167934179306031,
"reward_std": 0.30146218538284303,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.6621467113494873,
"step": 2530,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.07196793179027736,
"clip_ratio/high_mean": 0.013576928357360884,
"clip_ratio/low_mean": 0.0018521397636504845,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015429067742661572,
"completion_length": 87.03000030517578,
"epoch": 0.4879454423206224,
"grad_norm": 1.347899317741394,
"kl": 0.7047492057085037,
"learning_rate": 5.673818908657644e-07,
"loss": -0.0079,
"reward": 1.6893932342529296,
"reward_std": 0.24144218415021895,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.5981341004371643,
"step": 2540,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.03861275149974972,
"clip_ratio/high_mean": 0.005072360605117865,
"clip_ratio/low_mean": 0.0013027720240643248,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0063751326058991255,
"completion_length": 78.80750122070313,
"epoch": 0.4898664873691288,
"grad_norm": 1.7946380376815796,
"kl": 0.7765734851360321,
"learning_rate": 5.646670217001451e-07,
"loss": 0.004,
"reward": 1.8638887882232666,
"reward_std": 0.1732952728867531,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6831943988800049,
"step": 2550,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.044772564456798135,
"clip_ratio/high_mean": 0.008199371959199198,
"clip_ratio/low_mean": 0.007188984929234721,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015388356836047024,
"completion_length": 92.03750305175781,
"epoch": 0.4917875324176352,
"grad_norm": 8241.9619140625,
"kl": 3.7090125039219854,
"learning_rate": 5.619516180122789e-07,
"loss": 0.2194,
"reward": 1.346347188949585,
"reward_std": 0.3114967554807663,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.4303610801696777,
"step": 2560,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.136108908476308,
"clip_ratio/high_mean": 0.01777363264700398,
"clip_ratio/low_mean": 0.0005986301795928739,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018372263037599625,
"completion_length": 77.66500091552734,
"epoch": 0.4937085774661416,
"grad_norm": 2.8724048137664795,
"kl": 0.30402788892388344,
"learning_rate": 5.592357787618398e-07,
"loss": -0.0095,
"reward": 1.235116672515869,
"reward_std": 0.16121466904878617,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.3706833332777023,
"step": 2570,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.21411730786785482,
"clip_ratio/high_mean": 0.02751181152416393,
"clip_ratio/low_mean": 0.005141469169757329,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03265328073175624,
"completion_length": 77.39250030517579,
"epoch": 0.495629622514648,
"grad_norm": 3.1119463443756104,
"kl": 0.516096468269825,
"learning_rate": 5.565196029243746e-07,
"loss": -0.0097,
"reward": 1.7056148529052735,
"reward_std": 0.26717675626277926,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.6065573751926422,
"step": 2580,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.06655758274719119,
"clip_ratio/high_mean": 0.00869752592407167,
"clip_ratio/low_mean": 0.0007154849590733647,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009413010929711163,
"completion_length": 78.71000061035156,
"epoch": 0.49755066756315436,
"grad_norm": 9.566883087158203,
"kl": 6.985853771865368,
"learning_rate": 5.538031894876971e-07,
"loss": 0.0154,
"reward": 1.8047074317932128,
"reward_std": 0.2406391829252243,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.655791187286377,
"step": 2590,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.04153572088107467,
"clip_ratio/high_mean": 0.0076960999285802245,
"clip_ratio/low_mean": 0.00053562533139484,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008231725194491446,
"completion_length": 87.35249938964844,
"epoch": 0.49947171261166073,
"grad_norm": 4.163487911224365,
"kl": 3.02228729724884,
"learning_rate": 5.510866374482799e-07,
"loss": 0.0014,
"reward": 1.7271404266357422,
"reward_std": 0.20059744864702225,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6170076906681061,
"step": 2600,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.09242721796035766,
"clip_ratio/high_mean": 0.01333312913775444,
"clip_ratio/low_mean": 0.0022988114287727512,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01563194077461958,
"completion_length": 86.66250152587891,
"epoch": 0.5013927576601671,
"grad_norm": 1.7816847562789917,
"kl": 2.135231140255928,
"learning_rate": 5.48370045807647e-07,
"loss": -0.0043,
"reward": 1.5687429666519166,
"reward_std": 0.22490316033363342,
"rewards/code_format_reward": 0.9524999976158142,
"rewards/code_reward": 0.5462464988231659,
"step": 2610,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.10432031177915632,
"clip_ratio/high_mean": 0.01704162026871927,
"clip_ratio/low_mean": 0.0019667694039526397,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01900838967994787,
"completion_length": 98.58000183105469,
"epoch": 0.5033138027086735,
"grad_norm": 2.1069369316101074,
"kl": 2.131927290558815,
"learning_rate": 5.456535135687656e-07,
"loss": -0.0069,
"reward": 1.6628828048706055,
"reward_std": 0.23133169412612914,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.5858163475990296,
"step": 2620,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.04196612173691392,
"clip_ratio/high_mean": 0.0064837948535569016,
"clip_ratio/low_mean": 0.0025595034239813685,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009043298207689076,
"completion_length": 86.6,
"epoch": 0.5052348477571799,
"grad_norm": 15.670801162719727,
"kl": 2.1330361180007458,
"learning_rate": 5.429371397324378e-07,
"loss": -0.0054,
"reward": 1.4884859561920165,
"reward_std": 0.3388957381248474,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.497055447101593,
"step": 2630,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.036724881688132885,
"clip_ratio/high_mean": 0.005309284973191097,
"clip_ratio/low_mean": 0.003665669827023521,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00897495478275232,
"completion_length": 84.73250122070313,
"epoch": 0.5071558928056863,
"grad_norm": 6.480928421020508,
"kl": 0.9241176024079323,
"learning_rate": 5.402210232936934e-07,
"loss": -0.0009,
"reward": 1.792254877090454,
"reward_std": 0.29597480297088624,
"rewards/code_format_reward": 0.9974999904632569,
"rewards/code_reward": 0.646752405166626,
"step": 2640,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.109601711621508,
"clip_ratio/high_mean": 0.015215938963228837,
"clip_ratio/low_mean": 0.0034228902019094675,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018638829072006046,
"completion_length": 88.82750244140625,
"epoch": 0.5090769378541927,
"grad_norm": 5.080334186553955,
"kl": 0.6404796183109284,
"learning_rate": 5.37505263238181e-07,
"loss": -0.0032,
"reward": 1.7266260623931884,
"reward_std": 0.27733459770679475,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.6148755311965942,
"step": 2650,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.0589832967845723,
"clip_ratio/high_mean": 0.009531341239926406,
"clip_ratio/low_mean": 0.00046608211705461143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00999742336862255,
"completion_length": 88.69250183105468,
"epoch": 0.510997982902699,
"grad_norm": 7.949027061462402,
"kl": 0.6428510576486588,
"learning_rate": 5.347899585385619e-07,
"loss": -0.0028,
"reward": 1.8208046436309815,
"reward_std": 0.32592435777187345,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6647772669792176,
"step": 2660,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.15465617645531893,
"clip_ratio/high_mean": 0.022943795099854468,
"clip_ratio/low_mean": 0.0016213681577937678,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024565163621446118,
"completion_length": 87.13250274658203,
"epoch": 0.5129190279512055,
"grad_norm": 34.059959411621094,
"kl": 0.5654895901679993,
"learning_rate": 5.320752081509019e-07,
"loss": -0.0048,
"reward": 1.7013320207595826,
"reward_std": 0.27322621941566466,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6037909984588623,
"step": 2670,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.04982744911685586,
"clip_ratio/high_mean": 0.007483145385049283,
"clip_ratio/low_mean": 0.0010201202865573577,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008503265725448728,
"completion_length": 91.09750213623047,
"epoch": 0.5148400729997118,
"grad_norm": 3.5055086612701416,
"kl": 0.5736653476953506,
"learning_rate": 5.293611110110661e-07,
"loss": -0.0032,
"reward": 1.672940969467163,
"reward_std": 0.24722242057323457,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.5902204990386963,
"step": 2680,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.06388061246834695,
"clip_ratio/high_mean": 0.008427212300011888,
"clip_ratio/low_mean": 0.000504276818537619,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008931488974485546,
"completion_length": 84.06750183105468,
"epoch": 0.5167611180482182,
"grad_norm": 1.1749032735824585,
"kl": 0.6257337100803853,
"learning_rate": 5.266477660311123e-07,
"loss": -0.0049,
"reward": 1.883350706100464,
"reward_std": 0.1923319399356842,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6929253697395324,
"step": 2690,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.032716090651229025,
"clip_ratio/high_mean": 0.004722256149398163,
"clip_ratio/low_mean": 0.00025295682498835956,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0049752129940316085,
"completion_length": 101.05250244140625,
"epoch": 0.5186821630967247,
"grad_norm": 2.250870704650879,
"kl": 0.3336128618568182,
"learning_rate": 5.239352720956869e-07,
"loss": -0.0014,
"reward": 1.803996729850769,
"reward_std": 0.3182943195104599,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.6548108696937561,
"step": 2700,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.057230279734358194,
"clip_ratio/high_mean": 0.01016470161266625,
"clip_ratio/low_mean": 0.001601585964090191,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01176628761459142,
"completion_length": 92.2125015258789,
"epoch": 0.520603208145231,
"grad_norm": 1.7528822422027588,
"kl": 0.30482072457671167,
"learning_rate": 5.212237280584214e-07,
"loss": -0.0012,
"reward": 1.6862072706222535,
"reward_std": 0.2419889122247696,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5971661269664764,
"step": 2710,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.060608417179901154,
"clip_ratio/high_mean": 0.00894762706157053,
"clip_ratio/low_mean": 0.0007068538383464329,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009654480942117516,
"completion_length": 92.53000183105469,
"epoch": 0.5225242531937374,
"grad_norm": 274.7859802246094,
"kl": 1.1552282243967056,
"learning_rate": 5.185132327383284e-07,
"loss": 0.1157,
"reward": 1.7673757076263428,
"reward_std": 0.3102965742349625,
"rewards/code_format_reward": 0.9887499809265137,
"rewards/code_reward": 0.6365003228187561,
"step": 2720,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.15797494337894022,
"clip_ratio/high_mean": 0.02083307456341572,
"clip_ratio/low_mean": 0.009061275536078028,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02989435092313215,
"completion_length": 88.4500015258789,
"epoch": 0.5244452982422437,
"grad_norm": 4.456059455871582,
"kl": 1.3563814774155616,
"learning_rate": 5.158038849162024e-07,
"loss": 0.0014,
"reward": 1.5090751886367797,
"reward_std": 0.23531495928764343,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.5098500728607178,
"step": 2730,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.054899439518339935,
"clip_ratio/high_mean": 0.008722224002121947,
"clip_ratio/low_mean": 0.0002903619286371395,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009012586006429046,
"completion_length": 85.53750305175781,
"epoch": 0.5263663432907502,
"grad_norm": 1.951745867729187,
"kl": 0.5144835211336612,
"learning_rate": 5.130957833310177e-07,
"loss": -0.0017,
"reward": 1.7648489713668822,
"reward_std": 0.1646851196885109,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6355494737625123,
"step": 2740,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.10675688227638602,
"clip_ratio/high_mean": 0.016114802553784103,
"clip_ratio/low_mean": 0.0011672286826069466,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0172820313135162,
"completion_length": 97.54000244140624,
"epoch": 0.5282873883392566,
"grad_norm": 2.7782888412475586,
"kl": 0.484642443805933,
"learning_rate": 5.103890266763317e-07,
"loss": -0.0017,
"reward": 1.7005881071090698,
"reward_std": 0.17179570347070694,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6046690165996551,
"step": 2750,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.04718098001321778,
"clip_ratio/high_mean": 0.0069286267200368455,
"clip_ratio/low_mean": 0.0022616338639636522,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009190260457398836,
"completion_length": 91.71500091552734,
"epoch": 0.5302084333877629,
"grad_norm": 1.6982417106628418,
"kl": 0.40430613309144975,
"learning_rate": 5.076837135966868e-07,
"loss": -0.0001,
"reward": 1.7166170120239257,
"reward_std": 0.12425057031214237,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.6111209750175476,
"step": 2760,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.01563614197075367,
"clip_ratio/high_mean": 0.0026440696616191416,
"clip_ratio/low_mean": 0.0005518664722330869,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031959360814653335,
"completion_length": 93.39000091552734,
"epoch": 0.5321294784362693,
"grad_norm": 0.12160471081733704,
"kl": 0.3728202864527702,
"learning_rate": 5.049799426840166e-07,
"loss": -0.0008,
"reward": 1.8690509557724,
"reward_std": 0.20764816105365752,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6889004349708557,
"step": 2770,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.08729656506329775,
"clip_ratio/high_mean": 0.013787648268043995,
"clip_ratio/low_mean": 0.0016946192088653333,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015482267551124095,
"completion_length": 83.49000091552735,
"epoch": 0.5340505234847757,
"grad_norm": 2.061514377593994,
"kl": 0.2805942878127098,
"learning_rate": 5.02277812474052e-07,
"loss": -0.0005,
"reward": 1.5558062076568604,
"reward_std": 0.18851915150880813,
"rewards/code_format_reward": 0.9924999833106994,
"rewards/code_reward": 0.529778128862381,
"step": 2780,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.047503374773077665,
"clip_ratio/high_mean": 0.007121381178149022,
"clip_ratio/low_mean": 0.003943280148087069,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011064661786076613,
"completion_length": 90.33500213623047,
"epoch": 0.5359715685332821,
"grad_norm": 2.8578014373779297,
"kl": 0.9348004341125489,
"learning_rate": 4.995774214427299e-07,
"loss": -0.0083,
"reward": 1.5787676095962524,
"reward_std": 0.24208036959171295,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.5412587821483612,
"step": 2790,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.0682177669601515,
"clip_ratio/high_mean": 0.010770523789688013,
"clip_ratio/low_mean": 0.0030192029429599644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01378972665697802,
"completion_length": 97.35500030517578,
"epoch": 0.5378926135817885,
"grad_norm": 3.7523272037506104,
"kl": 0.49133365601301193,
"learning_rate": 4.968788680026062e-07,
"loss": 0.0019,
"reward": 1.8675085306167603,
"reward_std": 0.3084888607263565,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.6887542605400085,
"step": 2800,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.051895615691319105,
"clip_ratio/high_mean": 0.007092137623112648,
"clip_ratio/low_mean": 0.0009049189888173714,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007997056643944234,
"completion_length": 84.84000244140626,
"epoch": 0.5398136586302948,
"grad_norm": 6879.29296875,
"kl": 41.48030465692282,
"learning_rate": 4.941822504992665e-07,
"loss": 0.3058,
"reward": 1.8456867456436157,
"reward_std": 0.17148398756980895,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.6750308394432067,
"step": 2810,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.055018832255154845,
"clip_ratio/high_mean": 0.009382021031342447,
"clip_ratio/low_mean": 0.0012206103128846735,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0106026312103495,
"completion_length": 92.16250152587891,
"epoch": 0.5417347036788013,
"grad_norm": 1.7369046211242676,
"kl": 39.203208688646555,
"learning_rate": 4.914876672077444e-07,
"loss": 0.0739,
"reward": 1.7605399131774901,
"reward_std": 0.22667703181505203,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6337074398994446,
"step": 2820,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.07783090919256211,
"clip_ratio/high_mean": 0.013414820143952965,
"clip_ratio/low_mean": 0.004346576618263498,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017761396686546506,
"completion_length": 86.75750122070312,
"epoch": 0.5436557487273077,
"grad_norm": 1.3669426441192627,
"kl": 0.6254852950572968,
"learning_rate": 4.887952163289387e-07,
"loss": -0.0037,
"reward": 1.7524815320968627,
"reward_std": 0.18003067299723624,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.6271782517433167,
"step": 2830,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.032230034773238006,
"clip_ratio/high_mean": 0.005299290179391391,
"clip_ratio/low_mean": 0.002357826306251809,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007657116468180902,
"completion_length": 92.63250122070312,
"epoch": 0.545576793775814,
"grad_norm": 6.607495307922363,
"kl": 0.6308505192399025,
"learning_rate": 4.861049959860352e-07,
"loss": -0.0026,
"reward": 1.879476284980774,
"reward_std": 0.21936110258102418,
"rewards/code_format_reward": 0.9787499785423279,
"rewards/code_reward": 0.6950506567955017,
"step": 2840,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.07762792855501174,
"clip_ratio/high_mean": 0.012513539101928473,
"clip_ratio/low_mean": 0.0019065381304244511,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014420077262911946,
"completion_length": 79.06750183105468,
"epoch": 0.5474978388243205,
"grad_norm": 2.1427297592163086,
"kl": 0.7649303644895553,
"learning_rate": 4.834171042209299e-07,
"loss": -0.0016,
"reward": 1.7679643869400024,
"reward_std": 0.2242477983236313,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.640857207775116,
"step": 2850,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.10583647433668375,
"clip_ratio/high_mean": 0.015110303135588764,
"clip_ratio/low_mean": 0.0026529163093073293,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017763219110202046,
"completion_length": 89.09499969482422,
"epoch": 0.5494188838728268,
"grad_norm": 5.39391565322876,
"kl": 1.1404950305819512,
"learning_rate": 4.807316389906573e-07,
"loss": 0.0011,
"reward": 1.6588359355926514,
"reward_std": 0.23765334486961365,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5822304427623749,
"step": 2860,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.13843556838110088,
"clip_ratio/high_mean": 0.022050847904756664,
"clip_ratio/low_mean": 0.006549120438285172,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028599968180060387,
"completion_length": 85.02750091552734,
"epoch": 0.5513399289213332,
"grad_norm": 6.328859329223633,
"kl": 1.3457766875624657,
"learning_rate": 4.780486981638194e-07,
"loss": 0.004,
"reward": 1.4554174661636352,
"reward_std": 0.291735103726387,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.4839587390422821,
"step": 2870,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.03891797037795186,
"clip_ratio/high_mean": 0.005045192840043455,
"clip_ratio/low_mean": 0.0029750549525488167,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008020247751846909,
"completion_length": 85.00500183105468,
"epoch": 0.5532609739698396,
"grad_norm": 3.746497392654419,
"kl": 1.5130328834056854,
"learning_rate": 4.75368379517019e-07,
"loss": -0.0033,
"reward": 1.8564167737960815,
"reward_std": 0.14603331089019775,
"rewards/code_format_reward": 0.9987499952316284,
"rewards/code_reward": 0.6785208344459533,
"step": 2880,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.23034826815128326,
"clip_ratio/high_mean": 0.037423617928288876,
"clip_ratio/low_mean": 0.0014674156729597599,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03889103210531175,
"completion_length": 81.40750122070312,
"epoch": 0.555182019018346,
"grad_norm": 7.282290458679199,
"kl": 0.5326755799353122,
"learning_rate": 4.7269078073129696e-07,
"loss": 0.0032,
"reward": 1.700506567955017,
"reward_std": 0.3424434006214142,
"rewards/code_format_reward": 0.9700000047683716,
"rewards/code_reward": 0.6077533006668091,
"step": 2890,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.06711816978640854,
"clip_ratio/high_mean": 0.008929741784231737,
"clip_ratio/low_mean": 0.0021304662863258273,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011060208058916032,
"completion_length": 75.07750244140625,
"epoch": 0.5571030640668524,
"grad_norm": 3.718710422515869,
"kl": 0.3807241953909397,
"learning_rate": 4.7001599938857204e-07,
"loss": -0.0016,
"reward": 1.6593467235565185,
"reward_std": 0.2742844566702843,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.5821733415126801,
"step": 2900,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.10134089784696698,
"clip_ratio/high_mean": 0.014033923938404769,
"clip_ratio/low_mean": 0.0036910680413711817,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01772499195067212,
"completion_length": 74.57250061035157,
"epoch": 0.5590241091153587,
"grad_norm": 18.590866088867188,
"kl": 0.9125221639871597,
"learning_rate": 4.673441329680844e-07,
"loss": 0.0044,
"reward": 1.6198436498641968,
"reward_std": 0.1470041409134865,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.5621092915534973,
"step": 2910,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.042256729071959855,
"clip_ratio/high_mean": 0.007948629459133372,
"clip_ratio/low_mean": 0.001496748169302009,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009445377223892137,
"completion_length": 77.4625015258789,
"epoch": 0.5609451541638651,
"grad_norm": 0.18645010888576508,
"kl": 0.4780749522149563,
"learning_rate": 4.6467527884284365e-07,
"loss": 0.0006,
"reward": 1.8204985857009888,
"reward_std": 0.19856311585754155,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.6649368166923523,
"step": 2920,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.1271044396329671,
"clip_ratio/high_mean": 0.016186495171859862,
"clip_ratio/low_mean": 0.0011034044640837238,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01728989938274026,
"completion_length": 82.72500305175781,
"epoch": 0.5628661992123716,
"grad_norm": 6.4396162033081055,
"kl": 0.30610462203621863,
"learning_rate": 4.6200953427607927e-07,
"loss": -0.0021,
"reward": 1.7915108680725098,
"reward_std": 0.22729050666093825,
"rewards/code_format_reward": 0.9700000047683716,
"rewards/code_reward": 0.6532554149627685,
"step": 2930,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.23429102210793645,
"clip_ratio/high_mean": 0.03006269016477745,
"clip_ratio/low_mean": 0.001874277341994457,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0319369669421576,
"completion_length": 88.97500152587891,
"epoch": 0.5647872442608779,
"grad_norm": 43.83531951904297,
"kl": 0.5952823750674725,
"learning_rate": 4.5934699641769747e-07,
"loss": -0.0032,
"reward": 1.837431001663208,
"reward_std": 0.3392215400934219,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.6730904817581177,
"step": 2940,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.06733583421446383,
"clip_ratio/high_mean": 0.00863017894444056,
"clip_ratio/low_mean": 0.005618994176620618,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014249173022108153,
"completion_length": 77.96000061035156,
"epoch": 0.5667082893093843,
"grad_norm": 2.642043352127075,
"kl": 0.56968834400177,
"learning_rate": 4.566877623007389e-07,
"loss": 0.0049,
"reward": 1.7328413248062133,
"reward_std": 0.21620932817459107,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.6229831516742707,
"step": 2950,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.04462944087572396,
"clip_ratio/high_mean": 0.007475414098007604,
"clip_ratio/low_mean": 0.002004683316772571,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009480097430059686,
"completion_length": 85.0875015258789,
"epoch": 0.5686293343578906,
"grad_norm": 3.8512065410614014,
"kl": 0.33709155321121215,
"learning_rate": 4.540319288378439e-07,
"loss": -0.0057,
"reward": 1.6900140762329101,
"reward_std": 0.21961634010076522,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.5978195071220398,
"step": 2960,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.06556220971979201,
"clip_ratio/high_mean": 0.01001431758631952,
"clip_ratio/low_mean": 0.003507485325098969,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01352180291141849,
"completion_length": 92.67500152587891,
"epoch": 0.5705503794063971,
"grad_norm": 2.966658592224121,
"kl": 0.5968067184090614,
"learning_rate": 4.513795928177193e-07,
"loss": 0.0007,
"reward": 1.4343469619750977,
"reward_std": 0.16000542044639587,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.4681109845638275,
"step": 2970,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.006220952537842095,
"clip_ratio/high_mean": 0.0009992636245442555,
"clip_ratio/low_mean": 0.0029718225210672244,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0039710860582999885,
"completion_length": 92.65750274658203,
"epoch": 0.5724714244549035,
"grad_norm": 9.493338584899902,
"kl": 0.5875692501664161,
"learning_rate": 4.4873085090161266e-07,
"loss": -0.0009,
"reward": 1.4061829090118407,
"reward_std": 0.20027331858873368,
"rewards/code_format_reward": 0.9762499928474426,
"rewards/code_reward": 0.45902894139289857,
"step": 2980,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.0330589919583872,
"clip_ratio/high_mean": 0.004416179939289578,
"clip_ratio/low_mean": 0.002115111546299886,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006531291425926611,
"completion_length": 79.80750274658203,
"epoch": 0.5743924695034098,
"grad_norm": 1.592423915863037,
"kl": 0.6846940219402313,
"learning_rate": 4.460857996197879e-07,
"loss": -0.0088,
"reward": 1.8656628370285033,
"reward_std": 0.24907293021678925,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6850189208984375,
"step": 2990,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.10685318629257382,
"clip_ratio/high_mean": 0.014238242123974487,
"clip_ratio/low_mean": 0.0005060926268924959,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014744334877468646,
"completion_length": 75.28500213623047,
"epoch": 0.5763135145519163,
"grad_norm": 11.023285865783691,
"kl": 1.773244822025299,
"learning_rate": 4.434445353680084e-07,
"loss": -0.0004,
"reward": 1.6719849348068236,
"reward_std": 0.23447352051734924,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5888049364089966,
"step": 3000,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.09909297195263207,
"clip_ratio/high_mean": 0.014624686987372116,
"clip_ratio/low_mean": 0.0008105992455966771,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015435286390129477,
"completion_length": 80.88750228881835,
"epoch": 0.5782345596004226,
"grad_norm": 3.5932817459106445,
"kl": 1.2866470351815225,
"learning_rate": 4.4080715440402417e-07,
"loss": 0.0028,
"reward": 1.7477641582489014,
"reward_std": 0.27256832718849183,
"rewards/code_format_reward": 0.9800000071525574,
"rewards/code_reward": 0.628882086277008,
"step": 3010,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.061281491769477725,
"clip_ratio/high_mean": 0.008347922342363746,
"clip_ratio/low_mean": 0.00354889674927108,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011896818911191076,
"completion_length": 75.85250091552734,
"epoch": 0.580155604648929,
"grad_norm": 4.849332809448242,
"kl": 0.476963010430336,
"learning_rate": 4.381737528440624e-07,
"loss": -0.0002,
"reward": 1.5080678462982178,
"reward_std": 0.1984383262693882,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.5099714159965515,
"step": 3020,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.02014698493294418,
"clip_ratio/high_mean": 0.0029024946445133535,
"clip_ratio/low_mean": 0.001273224765463965,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0041757194034289565,
"completion_length": 86.35750122070313,
"epoch": 0.5820766496974354,
"grad_norm": 5.408311367034912,
"kl": 1.1033611692488194,
"learning_rate": 4.3554442665932664e-07,
"loss": -0.0044,
"reward": 1.7480007410049438,
"reward_std": 0.20548871904611588,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.6321253478527069,
"step": 3030,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.07207567039877176,
"clip_ratio/high_mean": 0.010315603285562247,
"clip_ratio/low_mean": 0.0024311804067110644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012746783741749822,
"completion_length": 87.45250091552734,
"epoch": 0.5839976947459418,
"grad_norm": 5.45907735824585,
"kl": 0.7388446770608426,
"learning_rate": 4.329192716724974e-07,
"loss": -0.0134,
"reward": 1.617799663543701,
"reward_std": 0.28184359073638915,
"rewards/code_format_reward": 0.9900000095367432,
"rewards/code_reward": 0.5613998055458069,
"step": 3040,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.03178482772782445,
"clip_ratio/high_mean": 0.00484326797304675,
"clip_ratio/low_mean": 0.0010359384352341295,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005879206501413136,
"completion_length": 83.70250091552734,
"epoch": 0.5859187397944482,
"grad_norm": 6.244964122772217,
"kl": 0.8223805136978626,
"learning_rate": 4.3029838355424165e-07,
"loss": -0.0028,
"reward": 1.5551699638366698,
"reward_std": 0.23868169337511064,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5300849676132202,
"step": 3050,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.031123768421821296,
"clip_ratio/high_mean": 0.0042093763331649825,
"clip_ratio/low_mean": 0.00023920949752209708,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0044485858496045695,
"completion_length": 90.6500015258789,
"epoch": 0.5878397848429545,
"grad_norm": 1.844166874885559,
"kl": 0.9453303083777428,
"learning_rate": 4.2768185781972433e-07,
"loss": 0.0038,
"reward": 1.7277095794677735,
"reward_std": 0.22161270976066588,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.6176047682762146,
"step": 3060,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.06112067271023989,
"clip_ratio/high_mean": 0.008206171146593989,
"clip_ratio/low_mean": 0.0006491162814199925,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008855287660844624,
"completion_length": 81.27750091552734,
"epoch": 0.589760829891461,
"grad_norm": 3.0321500301361084,
"kl": 0.4705409877002239,
"learning_rate": 4.2506978982512964e-07,
"loss": -0.0002,
"reward": 1.9011548519134522,
"reward_std": 0.2363950289785862,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.7037024021148681,
"step": 3070,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.028480308945290744,
"clip_ratio/high_mean": 0.00514335140469484,
"clip_ratio/low_mean": 0.0035089968900138047,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008652348211035133,
"completion_length": 88.05750274658203,
"epoch": 0.5916818749399674,
"grad_norm": 4.498425483703613,
"kl": 0.9383749194443226,
"learning_rate": 4.224622747641835e-07,
"loss": -0.0068,
"reward": 1.2419449806213378,
"reward_std": 0.1959183931350708,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.37597247362136843,
"step": 3080,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.02165755571331829,
"clip_ratio/high_mean": 0.003493850605445914,
"clip_ratio/low_mean": 0.0001163623295724392,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003610212981584482,
"completion_length": 83.10500030517578,
"epoch": 0.5936029199884737,
"grad_norm": 1.0221151113510132,
"kl": 1.614695566892624,
"learning_rate": 4.1985940766468663e-07,
"loss": 0.1048,
"reward": 1.8437815666198731,
"reward_std": 0.12033854126930237,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6731407642364502,
"step": 3090,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.05757335813250393,
"clip_ratio/high_mean": 0.0107182093168376,
"clip_ratio/low_mean": 0.004042259410198312,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014760468708118424,
"completion_length": 86.6625,
"epoch": 0.5955239650369801,
"grad_norm": 3.0221967697143555,
"kl": 0.4662696644663811,
"learning_rate": 4.1726128338504997e-07,
"loss": 0.0059,
"reward": 1.6797678232192994,
"reward_std": 0.23598156571388246,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.5930089056491852,
"step": 3100,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.1632944119395688,
"clip_ratio/high_mean": 0.02386658971372526,
"clip_ratio/low_mean": 0.000367270597780589,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02423386004229542,
"completion_length": 87.04000244140624,
"epoch": 0.5974450100854864,
"grad_norm": 3124.911865234375,
"kl": 1.4825018651783466,
"learning_rate": 4.146679966108374e-07,
"loss": 0.109,
"reward": 1.7368038177490235,
"reward_std": 0.2290027320384979,
"rewards/code_format_reward": 0.9912499785423279,
"rewards/code_reward": 0.620589405298233,
"step": 3110,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.011806602030992508,
"clip_ratio/high_mean": 0.00222149578621611,
"clip_ratio/low_mean": 0.001782867594738491,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004004363450803794,
"completion_length": 76.50500183105468,
"epoch": 0.5993660551339929,
"grad_norm": 5.609122276306152,
"kl": 1.2381610602140427,
"learning_rate": 4.120796418513165e-07,
"loss": 0.0687,
"reward": 1.6538613319396973,
"reward_std": 0.2478315144777298,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.5813056170940399,
"step": 3120,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.04111457797698677,
"clip_ratio/high_mean": 0.006102612579707056,
"clip_ratio/low_mean": 0.0006678692123387008,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006770481838611886,
"completion_length": 90.63500213623047,
"epoch": 0.6012871001824993,
"grad_norm": 1.7537983655929565,
"kl": 0.8379382207989693,
"learning_rate": 4.094963134360129e-07,
"loss": 3.0713,
"reward": 1.8111864566802978,
"reward_std": 0.23444892466068268,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6599682211875916,
"step": 3130,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.07487462717108428,
"clip_ratio/high_mean": 0.009757341054501012,
"clip_ratio/low_mean": 0.002470593445468694,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012227934325346723,
"completion_length": 84.99250183105468,
"epoch": 0.6032081452310056,
"grad_norm": 7.498387336730957,
"kl": 0.5894037500023842,
"learning_rate": 4.0691810551127327e-07,
"loss": 0.0462,
"reward": 1.6221882104873657,
"reward_std": 0.25462802946567537,
"rewards/code_format_reward": 0.9975000023841858,
"rewards/code_reward": 0.5617190957069397,
"step": 3140,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.015196678857319058,
"clip_ratio/high_mean": 0.0022096226894063875,
"clip_ratio/low_mean": 0.002686911600176245,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004896534324507229,
"completion_length": 88.44750213623047,
"epoch": 0.6051291902795121,
"grad_norm": 0.7371006011962891,
"kl": 1.5165767412632705,
"learning_rate": 4.0434511203683386e-07,
"loss": 0.0113,
"reward": 1.958918571472168,
"reward_std": 0.17050198167562486,
"rewards/code_format_reward": 0.9962499856948852,
"rewards/code_reward": 0.7303967714309693,
"step": 3150,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.047509356867522,
"clip_ratio/high_mean": 0.0060635729460045695,
"clip_ratio/low_mean": 0.0037405278504593297,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009804100578185171,
"completion_length": 93.71500091552734,
"epoch": 0.6070502353280185,
"grad_norm": 4.062532424926758,
"kl": 164.7577206812799,
"learning_rate": 4.017774267823967e-07,
"loss": 0.3479,
"reward": 1.8433427095413208,
"reward_std": 0.20897280275821686,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6760463416576385,
"step": 3160,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.007103513111360371,
"clip_ratio/high_mean": 0.0009442454349482432,
"clip_ratio/low_mean": 0.0005656339257257059,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015098793461220338,
"completion_length": 97.03500061035156,
"epoch": 0.6089712803765248,
"grad_norm": 0.3194718658924103,
"kl": 19.38877977654338,
"learning_rate": 3.9921514332421193e-07,
"loss": 0.1279,
"reward": 1.3801440358161927,
"reward_std": 0.26880781557410954,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.44757200181484225,
"step": 3170,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.027826181857381015,
"clip_ratio/high_mean": 0.004423137854610104,
"clip_ratio/low_mean": 0.000519216748944018,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004942354625381995,
"completion_length": 99.6375015258789,
"epoch": 0.6108923254250312,
"grad_norm": 133.27151489257812,
"kl": 91.96420569866896,
"learning_rate": 3.966583550416676e-07,
"loss": 284.3821,
"reward": 1.6065278768539428,
"reward_std": 0.2671674907207489,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.5598264217376709,
"step": 3180,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.03810381339862943,
"clip_ratio/high_mean": 0.005511091940570622,
"clip_ratio/low_mean": 0.00701818183879368,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01252927360474132,
"completion_length": 90.77000122070312,
"epoch": 0.6128133704735376,
"grad_norm": 2.931155204772949,
"kl": 4.587994083762169,
"learning_rate": 3.9410715511388647e-07,
"loss": 28143.1688,
"reward": 1.7186223268508911,
"reward_std": 0.2031429558992386,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.6139986515045166,
"step": 3190,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.18900979291647674,
"clip_ratio/high_mean": 0.025313075329177082,
"clip_ratio/low_mean": 0.00013794690457871183,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02545102240983397,
"completion_length": 88.60750122070313,
"epoch": 0.614734415522044,
"grad_norm": 3.9914708137512207,
"kl": 0.678571529686451,
"learning_rate": 3.915616365163304e-07,
"loss": 0.0002,
"reward": 1.818918228149414,
"reward_std": 0.24608779847621917,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.6653966069221496,
"step": 3200,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.05510081194806844,
"clip_ratio/high_mean": 0.008429678474203683,
"clip_ratio/low_mean": 0.0015389235399197788,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0099686019733781,
"completion_length": 85.05250091552735,
"epoch": 0.6166554605705504,
"grad_norm": 2.0297534465789795,
"kl": 0.5190044179558754,
"learning_rate": 3.890218920174122e-07,
"loss": -0.0056,
"reward": 1.938026785850525,
"reward_std": 0.2829041987657547,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.7218258857727051,
"step": 3210,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.04617450258228928,
"clip_ratio/high_mean": 0.007303895291988738,
"clip_ratio/low_mean": 0.002542783234093804,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009846678806934506,
"completion_length": 92.52000122070312,
"epoch": 0.6185765056190567,
"grad_norm": 3.2283730506896973,
"kl": 0.5362374372780323,
"learning_rate": 3.86488014175114e-07,
"loss": 0.0003,
"reward": 1.7741312742233277,
"reward_std": 0.20447308868169783,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6395656108856201,
"step": 3220,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.059750709845684466,
"clip_ratio/high_mean": 0.00790787541482132,
"clip_ratio/low_mean": 0.0012954409321537241,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009203316466300748,
"completion_length": 90.4375,
"epoch": 0.6204975506675632,
"grad_norm": 2.409045934677124,
"kl": 0.553566773980856,
"learning_rate": 3.8396009533361486e-07,
"loss": -0.0,
"reward": 1.6513851642608643,
"reward_std": 0.24081393480300903,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.580692571401596,
"step": 3230,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.03564520282670856,
"clip_ratio/high_mean": 0.004964679945260286,
"clip_ratio/low_mean": 0.004444090686592972,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009408770385198295,
"completion_length": 79.08000183105469,
"epoch": 0.6224185957160695,
"grad_norm": 7.759763717651367,
"kl": 1.2998816877603532,
"learning_rate": 3.814382276199251e-07,
"loss": -0.0006,
"reward": 1.6336610555648803,
"reward_std": 0.1691926121711731,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.5680804908275604,
"step": 3240,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.011579358880408109,
"clip_ratio/high_mean": 0.002202258622855879,
"clip_ratio/low_mean": 0.0003946456956327893,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0025969043519580735,
"completion_length": 88.7375,
"epoch": 0.6243396407645759,
"grad_norm": 9.489768981933594,
"kl": 4.286054483056068,
"learning_rate": 3.7892250294052853e-07,
"loss": 31.2761,
"reward": 1.8622464895248414,
"reward_std": 0.2547990471124649,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.6858106970787048,
"step": 3250,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.07659143296186813,
"clip_ratio/high_mean": 0.010122461079299682,
"clip_ratio/low_mean": 0.0019954566974774933,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012117917880095775,
"completion_length": 99.80750274658203,
"epoch": 0.6262606858130824,
"grad_norm": 2.884183168411255,
"kl": 1.2840011775493623,
"learning_rate": 3.764130129780341e-07,
"loss": 0.0383,
"reward": 1.6670962572097778,
"reward_std": 0.34920003414154055,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5907356142997742,
"step": 3260,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.029844516195589678,
"clip_ratio/high_mean": 0.004244843772175955,
"clip_ratio/low_mean": 0.0002169124811189249,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004461756230011815,
"completion_length": 100.70250091552734,
"epoch": 0.6281817308615887,
"grad_norm": 4.036985397338867,
"kl": 2.1118960954248904,
"learning_rate": 3.7390984918783286e-07,
"loss": 0.9419,
"reward": 1.6084105730056764,
"reward_std": 0.17128639966249465,
"rewards/code_format_reward": 0.9712500095367431,
"rewards/code_reward": 0.5613927602767944,
"step": 3270,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.07067356873303651,
"clip_ratio/high_mean": 0.00971948360092938,
"clip_ratio/low_mean": 0.0006290240438829642,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010348507657181472,
"completion_length": 88.9000015258789,
"epoch": 0.6301027759100951,
"grad_norm": 1.543152928352356,
"kl": 0.5742107287049294,
"learning_rate": 3.714131027947669e-07,
"loss": 0.0006,
"reward": 1.808586883544922,
"reward_std": 0.20984979271888732,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6564809083938599,
"step": 3280,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.060896387742832306,
"clip_ratio/high_mean": 0.00765781793743372,
"clip_ratio/low_mean": 0.01029690281720832,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017954721208661796,
"completion_length": 80.08500213623047,
"epoch": 0.6320238209586014,
"grad_norm": 2.127617359161377,
"kl": 0.6725200928747654,
"learning_rate": 3.689228647898034e-07,
"loss": 0.1143,
"reward": 1.678031039237976,
"reward_std": 0.19750893712043763,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.5921404898166657,
"step": 3290,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.05414980174973607,
"clip_ratio/high_mean": 0.007520435960032046,
"clip_ratio/low_mean": 0.00011696565634338185,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007637401651300025,
"completion_length": 92.725,
"epoch": 0.6339448660071079,
"grad_norm": 8.315914154052734,
"kl": 0.30459046363830566,
"learning_rate": 3.6643922592671904e-07,
"loss": -0.0066,
"reward": 1.5898099780082702,
"reward_std": 0.1832955375313759,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.5464674949645996,
"step": 3300,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.02745349882170558,
"clip_ratio/high_mean": 0.004275670822244138,
"clip_ratio/low_mean": 0.001036624335392844,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005312295141629874,
"completion_length": 86.80250091552735,
"epoch": 0.6358659110556143,
"grad_norm": 4.2797441482543945,
"kl": 2.398578557372093,
"learning_rate": 3.6396227671879267e-07,
"loss": 0.028,
"reward": 1.7730424404144287,
"reward_std": 0.3175764262676239,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.6387087047100067,
"step": 3310,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.03558065614197403,
"clip_ratio/high_mean": 0.004970578508800827,
"clip_ratio/low_mean": 0.0008951228694058955,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0058657014247728515,
"completion_length": 91.0875015258789,
"epoch": 0.6377869561041206,
"grad_norm": 5.376333713531494,
"kl": 1.4305558323860168,
"learning_rate": 3.614921074355067e-07,
"loss": 0.0034,
"reward": 1.7029305696487427,
"reward_std": 0.34837333858013153,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.6052152514457703,
"step": 3320,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.04346057323855348,
"clip_ratio/high_mean": 0.005737839776702458,
"clip_ratio/low_mean": 0.001675139949657023,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00741297956337803,
"completion_length": 88.75250396728515,
"epoch": 0.639708001152627,
"grad_norm": 2.969228744506836,
"kl": 0.7607076019048691,
"learning_rate": 3.5902880809925704e-07,
"loss": -0.0001,
"reward": 1.6762405157089233,
"reward_std": 0.2515918217599392,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.5909327387809753,
"step": 3330,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.05080293011851609,
"clip_ratio/high_mean": 0.006427765643456951,
"clip_ratio/low_mean": 0.00040708604792598634,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006834851735038683,
"completion_length": 88.0750015258789,
"epoch": 0.6416290462011334,
"grad_norm": 12.137472152709961,
"kl": 0.31881698705255984,
"learning_rate": 3.565724684820727e-07,
"loss": 3.6118,
"reward": 1.8916306495666504,
"reward_std": 0.1850387692451477,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.6973778128623962,
"step": 3340,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.14287711144424975,
"clip_ratio/high_mean": 0.019231261435197666,
"clip_ratio/low_mean": 0.0020263168029487134,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021257578127551822,
"completion_length": 94.19000091552735,
"epoch": 0.6435500912496398,
"grad_norm": 6.10810661315918,
"kl": 0.8296034529805183,
"learning_rate": 3.541231781023436e-07,
"loss": -0.0004,
"reward": 1.6248144626617431,
"reward_std": 0.2219874605536461,
"rewards/code_format_reward": 0.9887499809265137,
"rewards/code_reward": 0.5652197122573852,
"step": 3350,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.07329725201707334,
"clip_ratio/high_mean": 0.009671362905646675,
"clip_ratio/low_mean": 0.005342914546781685,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015014277724549174,
"completion_length": 97.74500274658203,
"epoch": 0.6454711362981462,
"grad_norm": 2.801866054534912,
"kl": 0.5770246163010597,
"learning_rate": 3.5168102622155894e-07,
"loss": 0.0,
"reward": 1.6838999271392823,
"reward_std": 0.2707583636045456,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.594137442111969,
"step": 3360,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.005975091701839119,
"clip_ratio/high_mean": 0.0011488659016322344,
"clip_ratio/low_mean": 0.001098146109143272,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022470120195066555,
"completion_length": 89.36750030517578,
"epoch": 0.6473921813466526,
"grad_norm": 34.13050079345703,
"kl": 2.2693535044789312,
"learning_rate": 3.492461018410535e-07,
"loss": 0.0028,
"reward": 1.8977232933044434,
"reward_std": 0.2937870219349861,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.7022991299629211,
"step": 3370,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.04565345844021067,
"clip_ratio/high_mean": 0.009023689541209023,
"clip_ratio/low_mean": 0.00031946374219842254,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009343153254303616,
"completion_length": 84.38249969482422,
"epoch": 0.649313226395159,
"grad_norm": 0.9170461893081665,
"kl": 108.80695619434118,
"learning_rate": 3.468184936987645e-07,
"loss": 920.5057,
"reward": 1.7496967315673828,
"reward_std": 0.2916144669055939,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.626410859823227,
"step": 3380,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.04991705315187574,
"clip_ratio/high_mean": 0.007881995162460954,
"clip_ratio/low_mean": 0.00031314246589317916,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008195137570146472,
"completion_length": 88.86000061035156,
"epoch": 0.6512342714436653,
"grad_norm": 3.084516763687134,
"kl": 1331.8980419039726,
"learning_rate": 3.4439829026599765e-07,
"loss": 2.6994,
"reward": 1.7110779523849486,
"reward_std": 0.22298349142074586,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.6071014523506164,
"step": 3390,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.03301922780228779,
"clip_ratio/high_mean": 0.005802097530977335,
"clip_ratio/low_mean": 0.002479181956732646,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00828127931599738,
"completion_length": 77.6500015258789,
"epoch": 0.6531553164921717,
"grad_norm": 3643.742919921875,
"kl": 629.580971956253,
"learning_rate": 3.4198557974420236e-07,
"loss": 1.3601,
"reward": 1.9027020692825318,
"reward_std": 0.23692196756601333,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.7038509964942932,
"step": 3400,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.026931732892990112,
"clip_ratio/high_mean": 0.004060871619731188,
"clip_ratio/low_mean": 0.0013453641964588313,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00540623576962389,
"completion_length": 81.77250213623047,
"epoch": 0.6550763615406782,
"grad_norm": 3.2221176624298096,
"kl": 17.398655989021062,
"learning_rate": 3.3958045006175804e-07,
"loss": 0.0552,
"reward": 1.7479909420013429,
"reward_std": 0.22741918563842772,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.6308704853057862,
"step": 3410,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.06737710665911437,
"clip_ratio/high_mean": 0.008767830353463069,
"clip_ratio/low_mean": 0.0005067014892119915,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009274532069684937,
"completion_length": 92.88500061035157,
"epoch": 0.6569974065891845,
"grad_norm": 4.063995838165283,
"kl": 2.0011128395795823,
"learning_rate": 3.3718298887077003e-07,
"loss": 0.0159,
"reward": 1.7235053777694702,
"reward_std": 0.2168472334742546,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.616440212726593,
"step": 3420,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.019622295489534737,
"clip_ratio/high_mean": 0.003191170998616144,
"clip_ratio/low_mean": 0.0015002752974396572,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0046914463368011635,
"completion_length": 80.16000213623047,
"epoch": 0.6589184516376909,
"grad_norm": 1.253300428390503,
"kl": 0.48067781031131745,
"learning_rate": 3.3479328354387286e-07,
"loss": 0.0008,
"reward": 1.7450715541839599,
"reward_std": 0.1590050458908081,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6244107484817505,
"step": 3430,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.03181373123079538,
"clip_ratio/high_mean": 0.0046242739539593455,
"clip_ratio/low_mean": 0.012107005770667456,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016731279762461783,
"completion_length": 84.32750244140625,
"epoch": 0.6608394966861972,
"grad_norm": 1.5854672193527222,
"kl": 0.42518851198256014,
"learning_rate": 3.324114211710498e-07,
"loss": 0.0,
"reward": 1.6541699171066284,
"reward_std": 0.1113172210752964,
"rewards/code_format_reward": 0.9962499856948852,
"rewards/code_reward": 0.5780224561691284,
"step": 3440,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.13535575959831475,
"clip_ratio/high_mean": 0.018421862670220435,
"clip_ratio/low_mean": 0.0012572539155371488,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019679116318002343,
"completion_length": 91.63250122070312,
"epoch": 0.6627605417347037,
"grad_norm": 4.593750476837158,
"kl": 0.7388513803482055,
"learning_rate": 3.300374885564553e-07,
"loss": -0.0,
"reward": 1.5408308625221252,
"reward_std": 0.29571940898895266,
"rewards/code_format_reward": 0.9749999880790711,
"rewards/code_reward": 0.5266654074192048,
"step": 3450,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.04671933995559811,
"clip_ratio/high_mean": 0.0062255718978121875,
"clip_ratio/low_mean": 0.003391482085862663,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009617053843976464,
"completion_length": 78.45250091552734,
"epoch": 0.6646815867832101,
"grad_norm": 2.5849409103393555,
"kl": 10.90581871420145,
"learning_rate": 3.2767157221525437e-07,
"loss": 0.0178,
"reward": 1.5087457418441772,
"reward_std": 0.19353876560926436,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.5074978828430176,
"step": 3460,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.0318772604689002,
"clip_ratio/high_mean": 0.004644899175036699,
"clip_ratio/low_mean": 0.0032211030862526967,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007866002165246754,
"completion_length": 75.36500091552735,
"epoch": 0.6666026318317164,
"grad_norm": 1.8117616176605225,
"kl": 187030.33910432606,
"learning_rate": 3.253137583704673e-07,
"loss": 374.1458,
"reward": 1.6825225114822389,
"reward_std": 0.2058879092335701,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5921987533569336,
"step": 3470,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.09497236199676991,
"clip_ratio/high_mean": 0.015650217607617378,
"clip_ratio/low_mean": 0.0006928690614586231,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016343086725100875,
"completion_length": 89.13250198364258,
"epoch": 0.6685236768802229,
"grad_norm": 5.850868225097656,
"kl": 0.5080707125365734,
"learning_rate": 3.229641329498296e-07,
"loss": 0.0463,
"reward": 1.6678599119186401,
"reward_std": 0.2830047011375427,
"rewards/code_format_reward": 0.9724999904632569,
"rewards/code_reward": 0.5908049464225769,
"step": 3480,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.050425410037860274,
"clip_ratio/high_mean": 0.006465365196345374,
"clip_ratio/low_mean": 0.0006157927738968283,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0070811579586006704,
"completion_length": 81.40749969482422,
"epoch": 0.6704447219287293,
"grad_norm": 10.526844024658203,
"kl": 1.5019532606005668,
"learning_rate": 3.2062278158265866e-07,
"loss": -0.0021,
"reward": 1.7323597908020019,
"reward_std": 0.15349715426564217,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6202423751354218,
"step": 3490,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.07051093662157655,
"clip_ratio/high_mean": 0.009594869159627706,
"clip_ratio/low_mean": 0.001997971232049167,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011592840391676873,
"completion_length": 96.4625,
"epoch": 0.6723657669772356,
"grad_norm": 12.833992958068848,
"kl": 0.37389371246099473,
"learning_rate": 3.182897895967338e-07,
"loss": 0.0008,
"reward": 1.6037346363067626,
"reward_std": 0.329493448138237,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.556554788351059,
"step": 3500,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.017038265755400062,
"clip_ratio/high_mean": 0.0027443476661574095,
"clip_ratio/low_mean": 0.000347714369854657,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003092062100768089,
"completion_length": 84.05,
"epoch": 0.674286812025742,
"grad_norm": 6.119350910186768,
"kl": 0.4559432238340378,
"learning_rate": 3.15965242015187e-07,
"loss": 0.0298,
"reward": 1.6935490131378175,
"reward_std": 0.26772548258304596,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6002120196819305,
"step": 3510,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.03435098186600953,
"clip_ratio/high_mean": 0.005973302901838906,
"clip_ratio/low_mean": 0.0006568559459992684,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006630158764892258,
"completion_length": 95.0,
"epoch": 0.6762078570742484,
"grad_norm": 4.796656608581543,
"kl": 0.3851431407034397,
"learning_rate": 3.1364922355340346e-07,
"loss": 0.0214,
"reward": 1.8059131860733033,
"reward_std": 0.18592590391635894,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6554565787315368,
"step": 3520,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.024073917022906243,
"clip_ratio/high_mean": 0.0035194387339288367,
"clip_ratio/low_mean": 0.0002464063392835669,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037658450222807006,
"completion_length": 86.9800018310547,
"epoch": 0.6781289021227548,
"grad_norm": 7.799978256225586,
"kl": 0.2617302156984806,
"learning_rate": 3.113418186159349e-07,
"loss": -0.0088,
"reward": 1.515157699584961,
"reward_std": 0.2593328535556793,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5138288617134095,
"step": 3530,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.014256173744797707,
"clip_ratio/high_mean": 0.002001363394083455,
"clip_ratio/low_mean": 0.001286455297667999,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0032878186670131982,
"completion_length": 93.04500274658203,
"epoch": 0.6800499471712612,
"grad_norm": 1.323721170425415,
"kl": 0.32287237197160723,
"learning_rate": 3.090431112934235e-07,
"loss": -0.0056,
"reward": 1.8219903230667114,
"reward_std": 0.28862411081790923,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6641201436519623,
"step": 3540,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.0389601900940761,
"clip_ratio/high_mean": 0.005975415915600024,
"clip_ratio/low_mean": 0.0006638197373831645,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006639235676266253,
"completion_length": 95.84250183105469,
"epoch": 0.6819709922197675,
"grad_norm": 4.850042819976807,
"kl": 1.8627108559012413,
"learning_rate": 3.067531853595369e-07,
"loss": 1.6968,
"reward": 1.8796481132507323,
"reward_std": 0.13976119682192803,
"rewards/code_format_reward": 0.9837500095367432,
"rewards/code_reward": 0.6938865780830383,
"step": 3550,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.01971529610455036,
"clip_ratio/high_mean": 0.002580236754147336,
"clip_ratio/low_mean": 0.0005060694311396219,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030863061954732986,
"completion_length": 85.99000091552735,
"epoch": 0.683892037268274,
"grad_norm": 499.5800476074219,
"kl": 3.8079170405864717,
"learning_rate": 3.0447212426791546e-07,
"loss": 0.0153,
"reward": 1.73906729221344,
"reward_std": 0.21255102157592773,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.622658634185791,
"step": 3560,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.03546578506939113,
"clip_ratio/high_mean": 0.005373837990919128,
"clip_ratio/low_mean": 0.0011442803021054714,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0065181183628737925,
"completion_length": 93.75249938964843,
"epoch": 0.6858130823167803,
"grad_norm": 3.144973039627075,
"kl": 0.7828342400491237,
"learning_rate": 3.022000111491309e-07,
"loss": 0.0001,
"reward": 1.8471190690994264,
"reward_std": 0.27725095450878146,
"rewards/code_format_reward": 0.9487499952316284,
"rewards/code_reward": 0.686372023820877,
"step": 3570,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.01367393396794796,
"clip_ratio/high_mean": 0.001831050164764747,
"clip_ratio/low_mean": 0.0008013980732357595,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002632448251824826,
"completion_length": 96.29500122070313,
"epoch": 0.6877341273652867,
"grad_norm": 3.9027657508850098,
"kl": 0.8669951900839805,
"learning_rate": 2.99936928807657e-07,
"loss": -0.0007,
"reward": 1.6410433769226074,
"reward_std": 0.25681858956813813,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.5736466705799103,
"step": 3580,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.028569919406436384,
"clip_ratio/high_mean": 0.0037314103537937626,
"clip_ratio/low_mean": 0.0012738955876557157,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005005306130624376,
"completion_length": 84.3875015258789,
"epoch": 0.6896551724137931,
"grad_norm": 1.8412340879440308,
"kl": 0.6606554225087166,
"learning_rate": 2.976829597188506e-07,
"loss": -0.0007,
"reward": 1.6131571292877198,
"reward_std": 0.15807003602385522,
"rewards/code_format_reward": 0.9950000047683716,
"rewards/code_reward": 0.5578285574913024,
"step": 3590,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.045477401558309795,
"clip_ratio/high_mean": 0.007051247591152787,
"clip_ratio/low_mean": 0.00021358822996262461,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007264835678506642,
"completion_length": 92.69250030517578,
"epoch": 0.6915762174622995,
"grad_norm": 4.787570953369141,
"kl": 0.2786871612071991,
"learning_rate": 2.9543818602594826e-07,
"loss": 0.0001,
"reward": 1.6197675943374634,
"reward_std": 0.2863120764493942,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.5651962697505951,
"step": 3600,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.047238136362284425,
"clip_ratio/high_mean": 0.006483453582040966,
"clip_ratio/low_mean": 0.0015064548759255558,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007989908382296563,
"completion_length": 83.11750183105468,
"epoch": 0.6934972625108059,
"grad_norm": 1.5795401334762573,
"kl": 0.512858135998249,
"learning_rate": 2.932026895370697e-07,
"loss": 0.0021,
"reward": 1.6763751983642579,
"reward_std": 0.12559455148875714,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.5903751432895661,
"step": 3610,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.031613614642992616,
"clip_ratio/high_mean": 0.00453935784753412,
"clip_ratio/low_mean": 0.0026672417909139766,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007206599647179246,
"completion_length": 89.80250244140625,
"epoch": 0.6954183075593122,
"grad_norm": 0.9828081130981445,
"kl": 2.053369848430157,
"learning_rate": 2.909765517222392e-07,
"loss": -0.0015,
"reward": 1.6560463190078736,
"reward_std": 0.2526627391576767,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5820856630802155,
"step": 3620,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.00962083850754425,
"clip_ratio/high_mean": 0.0013845860186847859,
"clip_ratio/low_mean": 0.00100141861839802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0023860046290792524,
"completion_length": 93.0125015258789,
"epoch": 0.6973393526078187,
"grad_norm": 1.4326051473617554,
"kl": 0.7425350762903691,
"learning_rate": 2.887598537104141e-07,
"loss": 0.017,
"reward": 1.608488416671753,
"reward_std": 0.18181688338518143,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.557056725025177,
"step": 3630,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.10694800971541554,
"clip_ratio/high_mean": 0.016866487907827833,
"clip_ratio/low_mean": 0.000146488708560355,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017012976511614396,
"completion_length": 86.96750183105469,
"epoch": 0.6992603976563251,
"grad_norm": 5.3714776039123535,
"kl": 0.5909165881574154,
"learning_rate": 2.8655267628653044e-07,
"loss": 0.0005,
"reward": 1.6461472749710082,
"reward_std": 0.22788509875535964,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.5765111327171326,
"step": 3640,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.015923467138782142,
"clip_ratio/high_mean": 0.0022047571546863765,
"clip_ratio/low_mean": 0.0014544774603564292,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0036592346790712328,
"completion_length": 91.53250122070312,
"epoch": 0.7011814427048314,
"grad_norm": 7.581000328063965,
"kl": 3.2652564592659474,
"learning_rate": 2.8435509988855683e-07,
"loss": -0.0019,
"reward": 1.6843700885772706,
"reward_std": 0.20299706608057022,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.5937475442886353,
"step": 3650,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.04569347179494798,
"clip_ratio/high_mean": 0.00580012007849291,
"clip_ratio/low_mean": 0.003195645064988639,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008995765156578272,
"completion_length": 82.49500122070313,
"epoch": 0.7031024877533378,
"grad_norm": 10.031012535095215,
"kl": 0.3446802504360676,
"learning_rate": 2.821672046045642e-07,
"loss": -0.003,
"reward": 1.9148546934127808,
"reward_std": 0.15906044691801072,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.7089898109436035,
"step": 3660,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.01652005296200514,
"clip_ratio/high_mean": 0.0032194001134485005,
"clip_ratio/low_mean": 0.0004348491333075799,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003654249230748974,
"completion_length": 87.48000030517578,
"epoch": 0.7050235328018442,
"grad_norm": 4.5817551612854,
"kl": 0.5381794683635235,
"learning_rate": 2.799890701698068e-07,
"loss": -0.0018,
"reward": 1.4432553768157959,
"reward_std": 0.19258553311228752,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.4744401514530182,
"step": 3670,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.03230769606307149,
"clip_ratio/high_mean": 0.004254726751241833,
"clip_ratio/low_mean": 0.0003341716161230579,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0045888983644545075,
"completion_length": 91.96000366210937,
"epoch": 0.7069445778503506,
"grad_norm": 3.1825077533721924,
"kl": 0.5493438571691514,
"learning_rate": 2.7782077596381596e-07,
"loss": 0.0032,
"reward": 1.8943065643310546,
"reward_std": 0.22485891729593277,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6990282416343689,
"step": 3680,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.017006529681384563,
"clip_ratio/high_mean": 0.0026059710187837483,
"clip_ratio/low_mean": 0.00022266755404416473,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028286385582759976,
"completion_length": 92.6875015258789,
"epoch": 0.708865622898857,
"grad_norm": 3.126534938812256,
"kl": 2.302929486706853,
"learning_rate": 2.7566240100750794e-07,
"loss": 0.0024,
"reward": 1.6279277324676513,
"reward_std": 0.3058730036020279,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.568026351928711,
"step": 3690,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.020053896540775894,
"clip_ratio/high_mean": 0.0029980215302202852,
"clip_ratio/low_mean": 0.0004860887274844572,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003484110155841336,
"completion_length": 97.92250061035156,
"epoch": 0.7107866679473633,
"grad_norm": 4.224461555480957,
"kl": 4.42233342602849,
"learning_rate": 2.735140239603034e-07,
"loss": -0.0003,
"reward": 1.960454559326172,
"reward_std": 0.24239360094070433,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.7349147796630859,
"step": 3700,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.02752018291503191,
"clip_ratio/high_mean": 0.005125764373224229,
"clip_ratio/low_mean": 0.00023403638042509555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005359800613950938,
"completion_length": 101.37250061035157,
"epoch": 0.7127077129958698,
"grad_norm": 4.285885334014893,
"kl": 0.952894814312458,
"learning_rate": 2.713757231172611e-07,
"loss": -0.0013,
"reward": 1.6773537874221802,
"reward_std": 0.2778655707836151,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5927394092082977,
"step": 3710,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.021348989009857176,
"clip_ratio/high_mean": 0.0030060237273573875,
"clip_ratio/low_mean": 0.0013181588088627904,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004324182611890137,
"completion_length": 95.14250183105469,
"epoch": 0.7146287580443761,
"grad_norm": 2.7202091217041016,
"kl": 2.8931914918124675,
"learning_rate": 2.692475764062245e-07,
"loss": -0.0021,
"reward": 1.8867613315582275,
"reward_std": 0.18746355026960373,
"rewards/code_format_reward": 0.9987499952316284,
"rewards/code_reward": 0.6936931371688843,
"step": 3720,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.007143327506491914,
"clip_ratio/high_mean": 0.0009208801442582626,
"clip_ratio/low_mean": 0.00037053466949146243,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001291414822480874,
"completion_length": 94.1875015258789,
"epoch": 0.7165498030928825,
"grad_norm": 2.7853496074676514,
"kl": 0.6755535811185837,
"learning_rate": 2.6712966138498174e-07,
"loss": -0.003,
"reward": 1.723927640914917,
"reward_std": 0.2750594407320023,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.6163387894630432,
"step": 3730,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.019027433777227997,
"clip_ratio/high_mean": 0.002618219889700413,
"clip_ratio/low_mean": 0.0018478537182090803,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004466073628282175,
"completion_length": 102.04000091552734,
"epoch": 0.718470848141389,
"grad_norm": 5.998534202575684,
"kl": 0.9062080264091492,
"learning_rate": 2.650220552384391e-07,
"loss": 0.0289,
"reward": 1.8737354516983031,
"reward_std": 0.34540517926216124,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6912427186965943,
"step": 3740,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.056439303827937694,
"clip_ratio/high_mean": 0.007310985976073425,
"clip_ratio/low_mean": 0.0005420514833531342,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007853037484164816,
"completion_length": 92.48250122070313,
"epoch": 0.7203918931898953,
"grad_norm": 5.3343424797058105,
"kl": 0.3819971337914467,
"learning_rate": 2.6292483477580816e-07,
"loss": -0.011,
"reward": 1.672910475730896,
"reward_std": 0.2516419067978859,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5920802116394043,
"step": 3750,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.13834233868401496,
"clip_ratio/high_mean": 0.018591971611022017,
"clip_ratio/low_mean": 0.0006771487518562935,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019269120390526952,
"completion_length": 99.33000030517579,
"epoch": 0.7223129382384017,
"grad_norm": 1.4892189502716064,
"kl": 0.9441468060016632,
"learning_rate": 2.6083807642780644e-07,
"loss": -0.0084,
"reward": 1.5579908847808839,
"reward_std": 0.272139647603035,
"rewards/code_format_reward": 0.9787500023841857,
"rewards/code_reward": 0.5343079507350922,
"step": 3760,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.023900310718454422,
"clip_ratio/high_mean": 0.005545906673069112,
"clip_ratio/low_mean": 0.0007872088695876301,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006333115603774786,
"completion_length": 90.84000396728516,
"epoch": 0.724233983286908,
"grad_norm": 12.181316375732422,
"kl": 8.179486125707626,
"learning_rate": 2.5876185624387225e-07,
"loss": 0.0398,
"reward": 1.743166995048523,
"reward_std": 0.3216101437807083,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6259585380554199,
"step": 3770,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.00846199265215546,
"clip_ratio/high_mean": 0.0012625553936231881,
"clip_ratio/low_mean": 0.00030621195983258074,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001568767352728173,
"completion_length": 118.35750122070313,
"epoch": 0.7261550283354145,
"grad_norm": 1.6517783403396606,
"kl": 0.968211068212986,
"learning_rate": 2.5669624988939287e-07,
"loss": 0.1551,
"reward": 1.7871047019958497,
"reward_std": 0.21420088410377502,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.644802349805832,
"step": 3780,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.02817701958119869,
"clip_ratio/high_mean": 0.0037564294645562766,
"clip_ratio/low_mean": 0.011859719056519679,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015616148672415875,
"completion_length": 93.14750213623047,
"epoch": 0.7280760733839209,
"grad_norm": 11.322369575500488,
"kl": 0.45075275003910065,
"learning_rate": 2.5464133264294705e-07,
"loss": -0.0008,
"reward": 1.662767267227173,
"reward_std": 0.24967537969350814,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.5848211228847504,
"step": 3790,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.05006286900024861,
"clip_ratio/high_mean": 0.007249254969065077,
"clip_ratio/low_mean": 0.00040258544613607227,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007651840391918086,
"completion_length": 110.32750396728515,
"epoch": 0.7299971184324272,
"grad_norm": 16.862590789794922,
"kl": 0.3901309326291084,
"learning_rate": 2.5259717939356175e-07,
"loss": -0.0019,
"reward": 1.7777814149856568,
"reward_std": 0.25982470586895945,
"rewards/code_format_reward": 0.987500011920929,
"rewards/code_reward": 0.6420157194137573,
"step": 3800,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.007159786019474268,
"clip_ratio/high_mean": 0.0011859470629133283,
"clip_ratio/low_mean": 0.0021440873795654626,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033300343551672996,
"completion_length": 96.07000122070312,
"epoch": 0.7319181634809336,
"grad_norm": 2.4953460693359375,
"kl": 0.3146058402955532,
"learning_rate": 2.505638646379831e-07,
"loss": -0.0042,
"reward": 1.7296765804290772,
"reward_std": 0.3011175274848938,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6189007639884949,
"step": 3810,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.03548359724227339,
"clip_ratio/high_mean": 0.004679994014441036,
"clip_ratio/low_mean": 0.00017329893162241206,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004853292935877107,
"completion_length": 101.32000122070312,
"epoch": 0.7338392085294401,
"grad_norm": 3.954063892364502,
"kl": 0.34448319524526594,
"learning_rate": 2.485414624779603e-07,
"loss": -0.0051,
"reward": 1.690654444694519,
"reward_std": 0.24299487322568894,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.6000146985054016,
"step": 3820,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.00636189088691026,
"clip_ratio/high_mean": 0.0008543322241166606,
"clip_ratio/low_mean": 0.00028777473780792204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011421069371863267,
"completion_length": 94.74000244140625,
"epoch": 0.7357602535779464,
"grad_norm": 1.0420587062835693,
"kl": 0.28902386128902435,
"learning_rate": 2.4653004661754703e-07,
"loss": 0.0021,
"reward": 1.929768443107605,
"reward_std": 0.19695264101028442,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.7173841595649719,
"step": 3830,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.0385974693344906,
"clip_ratio/high_mean": 0.0054892279236810285,
"clip_ratio/low_mean": 0.0004371934803202748,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005926421421463601,
"completion_length": 100.23250122070313,
"epoch": 0.7376812986264528,
"grad_norm": 6.22709846496582,
"kl": 0.39053357392549515,
"learning_rate": 2.445296903604131e-07,
"loss": -0.0123,
"reward": 1.7683161497116089,
"reward_std": 0.4236398935317993,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.6413455486297608,
"step": 3840,
"zero_std_ratio": 0.3
},
{
"clip_ratio/high_max": 0.013776408764533699,
"clip_ratio/high_mean": 0.0019065461441641674,
"clip_ratio/low_mean": 0.0035487653221935034,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005455311315017752,
"completion_length": 91.36000213623046,
"epoch": 0.7396023436749591,
"grad_norm": 3.84639573097229,
"kl": 9.267435324192046,
"learning_rate": 2.4254046660717555e-07,
"loss": 0.0107,
"reward": 1.7194789409637452,
"reward_std": 0.23012096285820008,
"rewards/code_format_reward": 0.98125,
"rewards/code_reward": 0.6144269347190857,
"step": 3850,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.019276025268482044,
"clip_ratio/high_mean": 0.0034578723403683397,
"clip_ratio/low_mean": 0.0028569042566232382,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006314776389626786,
"completion_length": 96.3250015258789,
"epoch": 0.7415233887234656,
"grad_norm": 4.765519142150879,
"kl": 0.5375766545534134,
"learning_rate": 2.4056244785273895e-07,
"loss": -0.0038,
"reward": 1.713827419281006,
"reward_std": 0.28884910941123965,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6112887144088746,
"step": 3860,
"zero_std_ratio": 0.35
},
{
"clip_ratio/high_max": 0.06692883024225012,
"clip_ratio/high_mean": 0.008779459849756676,
"clip_ratio/low_mean": 0.0002708235711907037,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009050283460237552,
"completion_length": 103.41250152587891,
"epoch": 0.743444433771972,
"grad_norm": 2.68007493019104,
"kl": 0.34222877621650694,
"learning_rate": 2.3859570618365614e-07,
"loss": -0.0009,
"reward": 1.74418466091156,
"reward_std": 0.20953620076179505,
"rewards/code_format_reward": 0.9912499785423279,
"rewards/code_reward": 0.6242798089981079,
"step": 3870,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.01482260066550225,
"clip_ratio/high_mean": 0.0023997865355340764,
"clip_ratio/low_mean": 0.00038790585967944934,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002787692387937568,
"completion_length": 98.42250061035156,
"epoch": 0.7453654788204783,
"grad_norm": 4.816893100738525,
"kl": 0.4661983668804169,
"learning_rate": 2.366403132754995e-07,
"loss": -0.0019,
"reward": 1.6338875532150268,
"reward_std": 0.21452725008130075,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5697562634944916,
"step": 3880,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.02494101980701089,
"clip_ratio/high_mean": 0.003492716047912836,
"clip_ratio/low_mean": 0.00024301124794874341,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037357273045927285,
"completion_length": 97.44000091552735,
"epoch": 0.7472865238689848,
"grad_norm": 82.46282958984375,
"kl": 0.5981974095106125,
"learning_rate": 2.3469634039024927e-07,
"loss": 0.0024,
"reward": 1.8161945581436156,
"reward_std": 0.17759706005454062,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6621597528457641,
"step": 3890,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.0019452353473752737,
"clip_ratio/high_mean": 0.00039936143439263106,
"clip_ratio/low_mean": 0.0002315789126441814,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006309403397608548,
"completion_length": 94.07750091552734,
"epoch": 0.7492075689174911,
"grad_norm": 6.090396404266357,
"kl": 0.8421477146446705,
"learning_rate": 2.3276385837369632e-07,
"loss": 0.014,
"reward": 1.4471250534057618,
"reward_std": 0.25895166750997306,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.4773125171661377,
"step": 3900,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.02143551183398813,
"clip_ratio/high_mean": 0.002903820894425735,
"clip_ratio/low_mean": 0.00011704202042892576,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030208629119442775,
"completion_length": 89.32750091552734,
"epoch": 0.7511286139659975,
"grad_norm": 7.675207614898682,
"kl": 4.630686198174954,
"learning_rate": 2.3084293765286074e-07,
"loss": 0.0109,
"reward": 1.7639801740646361,
"reward_std": 0.32505679726600645,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6360525727272034,
"step": 3910,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.007216949050780385,
"clip_ratio/high_mean": 0.0012314463703660295,
"clip_ratio/low_mean": 0.000596191274235025,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018276376475114375,
"completion_length": 93.16250152587891,
"epoch": 0.7530496590145039,
"grad_norm": 3.4967644214630127,
"kl": 0.9979558669030666,
"learning_rate": 2.2893364823342454e-07,
"loss": 0.0016,
"reward": 1.5569410085678101,
"reward_std": 0.2807903170585632,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.5365955173969269,
"step": 3920,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.01850514723919332,
"clip_ratio/high_mean": 0.003044746146770194,
"clip_ratio/low_mean": 0.0006967324326978997,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037414785125292837,
"completion_length": 95.95500183105469,
"epoch": 0.7549707040630103,
"grad_norm": 2.8742544651031494,
"kl": 0.44021010398864746,
"learning_rate": 2.270360596971809e-07,
"loss": -0.0037,
"reward": 1.823073673248291,
"reward_std": 0.24968771934509276,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.663411819934845,
"step": 3930,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.015126590803265571,
"clip_ratio/high_mean": 0.0023361636558547616,
"clip_ratio/low_mean": 0.00015787699958309532,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002494040655437857,
"completion_length": 91.19500122070312,
"epoch": 0.7568917491115167,
"grad_norm": 3.40413236618042,
"kl": 0.386103405430913,
"learning_rate": 2.2515024119949826e-07,
"loss": -0.011,
"reward": 1.5718731164932251,
"reward_std": 0.2807211749255657,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.5415615499019623,
"step": 3940,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.015597179555334151,
"clip_ratio/high_mean": 0.0027747701620683073,
"clip_ratio/low_mean": 0.00042166481143794953,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003196435049176216,
"completion_length": 98.425,
"epoch": 0.758812794160023,
"grad_norm": 4.560734272003174,
"kl": 0.4831135801970959,
"learning_rate": 2.2327626146679974e-07,
"loss": -0.0022,
"reward": 1.7759766340255738,
"reward_std": 0.2547271862626076,
"rewards/code_format_reward": 0.9625,
"rewards/code_reward": 0.6473633050918579,
"step": 3950,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.008683394081890583,
"clip_ratio/high_mean": 0.0011145618045702577,
"clip_ratio/low_mean": 0.0009394719265401364,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020540336496196686,
"completion_length": 102.31250305175782,
"epoch": 0.7607338392085294,
"grad_norm": 0.1577247530221939,
"kl": 1.2770531885325909,
"learning_rate": 2.2141418879405855e-07,
"loss": 0.0032,
"reward": 1.7324957370758056,
"reward_std": 0.19914634823799132,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.6199978470802308,
"step": 3960,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.004086668835952878,
"clip_ratio/high_mean": 0.0005708287237212062,
"clip_ratio/low_mean": 2.821670495904982e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005990454228594899,
"completion_length": 95.21750335693359,
"epoch": 0.7626548842570359,
"grad_norm": 268.0164794921875,
"kl": 3.985953611135483,
"learning_rate": 2.1956409104230986e-07,
"loss": 0.0127,
"reward": 1.7277408480644225,
"reward_std": 0.19516595900058747,
"rewards/code_format_reward": 0.9737500071525573,
"rewards/code_reward": 0.6204329133033752,
"step": 3970,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.02115430913399905,
"clip_ratio/high_mean": 0.003100222998182289,
"clip_ratio/low_mean": 0.00045731081045232715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035575337868067438,
"completion_length": 99.47250213623047,
"epoch": 0.7645759293055422,
"grad_norm": 4.087578773498535,
"kl": 0.2619202695786953,
"learning_rate": 2.1772603563617603e-07,
"loss": -0.0024,
"reward": 1.6976868152618407,
"reward_std": 0.31094631999731065,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.6041558861732483,
"step": 3980,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.022111613873858006,
"clip_ratio/high_mean": 0.0033171431292430497,
"clip_ratio/low_mean": 0.00019350402581039817,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003510647150687873,
"completion_length": 93.09000091552734,
"epoch": 0.7664969743540486,
"grad_norm": 2.557553291320801,
"kl": 0.4590821463614702,
"learning_rate": 2.1590008956141137e-07,
"loss": -0.0014,
"reward": 1.7825278520584107,
"reward_std": 0.26515288949012755,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.6440764307975769,
"step": 3990,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.03076116186566651,
"clip_ratio/high_mean": 0.004437833256088197,
"clip_ratio/low_mean": 0.0004819675668841228,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004919800782226957,
"completion_length": 89.73500061035156,
"epoch": 0.7684180194025549,
"grad_norm": 2.5422067642211914,
"kl": 0.26607592329382895,
"learning_rate": 2.1408631936245908e-07,
"loss": 0.0026,
"reward": 1.8288384914398192,
"reward_std": 0.2508297085762024,
"rewards/code_format_reward": 0.9837499856948853,
"rewards/code_reward": 0.6684817314147949,
"step": 4000,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.020826040930114687,
"clip_ratio/high_mean": 0.0040985049330629405,
"clip_ratio/low_mean": 0.000369196553947404,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004467701492831111,
"completion_length": 97.69500122070312,
"epoch": 0.7703390644510614,
"grad_norm": 2.079371929168701,
"kl": 0.3304180882871151,
"learning_rate": 2.122847911400278e-07,
"loss": 0.0019,
"reward": 1.693557620048523,
"reward_std": 0.21333991810679437,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5977162718772888,
"step": 4010,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.009364375309087337,
"clip_ratio/high_mean": 0.0013745424774242565,
"clip_ratio/low_mean": 0.0020853754234849476,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034599179547512905,
"completion_length": 94.00750274658203,
"epoch": 0.7722601094995678,
"grad_norm": 3.2660512924194336,
"kl": 0.6432372182607651,
"learning_rate": 2.1049557054868082e-07,
"loss": 0.0073,
"reward": 1.8483120203018188,
"reward_std": 0.316910046339035,
"rewards/code_format_reward": 0.9649999856948852,
"rewards/code_reward": 0.6829060018062592,
"step": 4020,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.08980275879148394,
"clip_ratio/high_mean": 0.011746273408061825,
"clip_ratio/low_mean": 0.000331929670937825,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012078202966949902,
"completion_length": 92.7925018310547,
"epoch": 0.7741811545480741,
"grad_norm": 3.004549503326416,
"kl": 0.74478175714612,
"learning_rate": 2.0871872279444554e-07,
"loss": -0.0021,
"reward": 1.7010861873626708,
"reward_std": 0.25111902356147764,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.6071055889129638,
"step": 4030,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.0778543038177304,
"clip_ratio/high_mean": 0.00988141688721953,
"clip_ratio/low_mean": 0.0002543082577176392,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01013572499359725,
"completion_length": 105.63250122070312,
"epoch": 0.7761021995965806,
"grad_norm": 6.268821716308594,
"kl": 0.32837071269750595,
"learning_rate": 2.0695431263243512e-07,
"loss": -0.0003,
"reward": 1.716653084754944,
"reward_std": 0.2870768278837204,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6108265280723572,
"step": 4040,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.07360692555084825,
"clip_ratio/high_mean": 0.009302017895970493,
"clip_ratio/low_mean": 0.0003425976261496544,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009644615522120148,
"completion_length": 91.73750152587891,
"epoch": 0.7780232446450869,
"grad_norm": 4.801341533660889,
"kl": 13.291237189993263,
"learning_rate": 2.052024043644897e-07,
"loss": 0.0294,
"reward": 1.7232446193695068,
"reward_std": 0.24269133806228638,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6134972870349884,
"step": 4050,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.012731208954937756,
"clip_ratio/high_mean": 0.00180651948612649,
"clip_ratio/low_mean": 0.00015854310477152466,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019650626258226112,
"completion_length": 92.22000274658203,
"epoch": 0.7799442896935933,
"grad_norm": 0.6561126112937927,
"kl": 0.4966626279056072,
"learning_rate": 2.0346306183683254e-07,
"loss": 0.0001,
"reward": 1.8969059467315674,
"reward_std": 0.33292114436626435,
"rewards/code_format_reward": 0.9800000071525574,
"rewards/code_reward": 0.7034529447555542,
"step": 4060,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.015003547444939614,
"clip_ratio/high_mean": 0.002088976529194042,
"clip_ratio/low_mean": 0.0003269152017310262,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002415891730925068,
"completion_length": 88.61250152587891,
"epoch": 0.7818653347420997,
"grad_norm": 3.062511920928955,
"kl": 27.40203034952283,
"learning_rate": 2.0173634843774363e-07,
"loss": 0.0554,
"reward": 1.7011754512786865,
"reward_std": 0.3188599109649658,
"rewards/code_format_reward": 0.981250011920929,
"rewards/code_reward": 0.6052752196788788,
"step": 4070,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.006837105128215626,
"clip_ratio/high_mean": 0.0008925169277063105,
"clip_ratio/low_mean": 0.0005512935545993969,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014438104728469626,
"completion_length": 91.84750213623047,
"epoch": 0.7837863797906061,
"grad_norm": 3.0254440307617188,
"kl": 1.3981286019086838,
"learning_rate": 2.0002232709524897e-07,
"loss": 0.0033,
"reward": 1.6401101350784302,
"reward_std": 0.26853239685297015,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.5738050699234009,
"step": 4080,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.03983018643921241,
"clip_ratio/high_mean": 0.005185264609463047,
"clip_ratio/low_mean": 0.0019072047754889355,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007092469278723002,
"completion_length": 88.79250030517578,
"epoch": 0.7857074248391125,
"grad_norm": 2.8119072914123535,
"kl": 0.41205914914608,
"learning_rate": 1.983210602748279e-07,
"loss": -0.0029,
"reward": 1.9083050966262818,
"reward_std": 0.29446094632148745,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.7085274815559387,
"step": 4090,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.013765955006238072,
"clip_ratio/high_mean": 0.0018926289907540196,
"clip_ratio/low_mean": 0.0033484802523162218,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00524110905098496,
"completion_length": 85.72500305175781,
"epoch": 0.7876284698876188,
"grad_norm": 9.436022758483887,
"kl": 0.5864221028983593,
"learning_rate": 1.966326099771361e-07,
"loss": -0.0013,
"reward": 1.8478533029556274,
"reward_std": 0.2244624227285385,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6770516157150268,
"step": 4100,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.008409230364486575,
"clip_ratio/high_mean": 0.0011749810015317052,
"clip_ratio/low_mean": 0.00043775633239420133,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001612737326649949,
"completion_length": 91.16000213623047,
"epoch": 0.7895495149361252,
"grad_norm": 6.15724515914917,
"kl": 19.288723162561656,
"learning_rate": 1.9495703773574628e-07,
"loss": 0.0383,
"reward": 1.6099607944488525,
"reward_std": 0.30300846993923186,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5599803984165191,
"step": 4110,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.009142859559506177,
"clip_ratio/high_mean": 0.001581054090638645,
"clip_ratio/low_mean": 0.0003552554393536411,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019363095459993928,
"completion_length": 91.99500122070313,
"epoch": 0.7914705599846317,
"grad_norm": 6.634824752807617,
"kl": 6.53539779484272,
"learning_rate": 1.9329440461490576e-07,
"loss": 0.0342,
"reward": 1.647179627418518,
"reward_std": 0.2863168239593506,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5770273089408875,
"step": 4120,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.002549535338766873,
"clip_ratio/high_mean": 0.0003399143257411197,
"clip_ratio/low_mean": 0.00017536718805786223,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000515281516709365,
"completion_length": 90.89250183105469,
"epoch": 0.793391605033138,
"grad_norm": 2.817605972290039,
"kl": 2.417458937317133,
"learning_rate": 1.9164477120731066e-07,
"loss": 0.0066,
"reward": 1.7660948038101196,
"reward_std": 0.2769928514957428,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.640859854221344,
"step": 4130,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.037738511635689066,
"clip_ratio/high_mean": 0.0050403060296957845,
"clip_ratio/low_mean": 0.0007518758837250061,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005792181929427898,
"completion_length": 96.35750122070313,
"epoch": 0.7953126500816444,
"grad_norm": 4.240172386169434,
"kl": 0.28425633125007155,
"learning_rate": 1.900081976318983e-07,
"loss": 0.002,
"reward": 1.6942025184631349,
"reward_std": 0.3146607309579849,
"rewards/code_format_reward": 0.9737499833106995,
"rewards/code_reward": 0.6036637306213379,
"step": 4140,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.005694918753579259,
"clip_ratio/high_mean": 0.0007544978521764279,
"clip_ratio/low_mean": 0.000571403895446565,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001325901737436652,
"completion_length": 91.79500122070313,
"epoch": 0.7972336951301509,
"grad_norm": 3.9649434089660645,
"kl": 0.5314306125044823,
"learning_rate": 1.8838474353165547e-07,
"loss": -0.0054,
"reward": 1.7638010501861572,
"reward_std": 0.2793388396501541,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.6362755179405213,
"step": 4150,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.07577090607956052,
"clip_ratio/high_mean": 0.009897856542374938,
"clip_ratio/low_mean": 0.00011142043076688424,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010009276978962589,
"completion_length": 94.04000244140624,
"epoch": 0.7991547401786572,
"grad_norm": 2.2340188026428223,
"kl": 0.524626237899065,
"learning_rate": 1.8677446807144554e-07,
"loss": -0.0045,
"reward": 1.7472325563430786,
"reward_std": 0.3027869775891304,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.6289287328720092,
"step": 4160,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.012572024948894978,
"clip_ratio/high_mean": 0.0020916348788887263,
"clip_ratio/low_mean": 0.00022255638323258607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0023141912854043765,
"completion_length": 94.53000183105469,
"epoch": 0.8010757852271636,
"grad_norm": 10.561907768249512,
"kl": 2.102495136484504,
"learning_rate": 1.8517742993585178e-07,
"loss": 0.0137,
"reward": 1.7456205368041993,
"reward_std": 0.2167625606060028,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.6262477397918701,
"step": 4170,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.06855184989399277,
"clip_ratio/high_mean": 0.008778795686521335,
"clip_ratio/low_mean": 5.122950533404946e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008830025191855384,
"completion_length": 101.0425018310547,
"epoch": 0.8029968302756699,
"grad_norm": 5.673184871673584,
"kl": 0.428597304970026,
"learning_rate": 1.835936873270389e-07,
"loss": -0.0078,
"reward": 1.818405318260193,
"reward_std": 0.23994216322898865,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6626401782035828,
"step": 4180,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.002845590282231569,
"clip_ratio/high_mean": 0.0004983038117643446,
"clip_ratio/low_mean": 0.00037282529519870876,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008711291156942025,
"completion_length": 92.31250305175782,
"epoch": 0.8049178753241764,
"grad_norm": 6.281589508056641,
"kl": 0.4346353754401207,
"learning_rate": 1.8202329796263172e-07,
"loss": -0.0009,
"reward": 1.8768694639205932,
"reward_std": 0.21767425537109375,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6918722629547119,
"step": 4190,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.003876271191984415,
"clip_ratio/high_mean": 0.0004845338989980519,
"clip_ratio/low_mean": 0.0001560977878398262,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006406316824723035,
"completion_length": 75.7275016784668,
"epoch": 0.8068389203726828,
"grad_norm": 1.0423272848129272,
"kl": 0.9193772681057453,
"learning_rate": 1.8046631907361226e-07,
"loss": 0.0041,
"reward": 1.8756553649902343,
"reward_std": 0.18836807161569596,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.6893901348114013,
"step": 4200,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.004609162057749927,
"clip_ratio/high_mean": 0.0007380300055956468,
"clip_ratio/low_mean": 0.00015018127305665985,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000888211271376349,
"completion_length": 86.45750122070312,
"epoch": 0.8087599654211891,
"grad_norm": 4.096966743469238,
"kl": 0.45643181502819063,
"learning_rate": 1.7892280740223303e-07,
"loss": -0.004,
"reward": 1.5836501359939574,
"reward_std": 0.2258547842502594,
"rewards/code_format_reward": 0.9799999952316284,
"rewards/code_reward": 0.5468250632286071,
"step": 4210,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.007562826108187437,
"clip_ratio/high_mean": 0.0010166528285481037,
"clip_ratio/low_mean": 0.000701455632224679,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017181085073389112,
"completion_length": 90.34250335693359,
"epoch": 0.8106810104696955,
"grad_norm": 0.29423439502716064,
"kl": 0.2636001568287611,
"learning_rate": 1.7739281919995045e-07,
"loss": 0.0161,
"reward": 1.5646157741546631,
"reward_std": 0.12648468129336835,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5363703727722168,
"step": 4220,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.02073557274416089,
"clip_ratio/high_mean": 0.002738419675733894,
"clip_ratio/low_mean": 0.001565844019933138,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004304263507947326,
"completion_length": 85.92750091552735,
"epoch": 0.8126020555182019,
"grad_norm": 3.8796801567077637,
"kl": 0.6587013073265553,
"learning_rate": 1.7587641022537335e-07,
"loss": -0.0031,
"reward": 1.598485040664673,
"reward_std": 0.23664331436157227,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5526800036430359,
"step": 4230,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.0027086240705102684,
"clip_ratio/high_mean": 0.0003605463745770976,
"clip_ratio/low_mean": 0.0004666288397856988,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008271752245491371,
"completion_length": 86.6875,
"epoch": 0.8145231005667083,
"grad_norm": 6.270168781280518,
"kl": 3.9651204235851765,
"learning_rate": 1.7437363574223244e-07,
"loss": 0.0141,
"reward": 1.8213656187057494,
"reward_std": 0.2221561223268509,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.66474529504776,
"step": 4240,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.009644670388661325,
"clip_ratio/high_mean": 0.0013658979878528044,
"clip_ratio/low_mean": 0.0006750999338692055,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002040997930453159,
"completion_length": 86.42250061035156,
"epoch": 0.8164441456152147,
"grad_norm": 4.402440071105957,
"kl": 0.27487861886620524,
"learning_rate": 1.7288455051736474e-07,
"loss": -0.0005,
"reward": 1.6581492662429809,
"reward_std": 0.14444592781364918,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5825121104717255,
"step": 4250,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.017426467640325426,
"clip_ratio/high_mean": 0.0023936200188472865,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0023936200188472865,
"completion_length": 88.66250152587891,
"epoch": 0.818365190663721,
"grad_norm": 15.625293731689453,
"kl": 0.5453658372163772,
"learning_rate": 1.7140920881871927e-07,
"loss": 0.0001,
"reward": 1.9025921821594238,
"reward_std": 0.1951783686876297,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.7037960886955261,
"step": 4260,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.024276501801796257,
"clip_ratio/high_mean": 0.0037041545001557097,
"clip_ratio/low_mean": 0.0004929742426611483,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004197128777741454,
"completion_length": 94.525,
"epoch": 0.8202862357122275,
"grad_norm": 19.728607177734375,
"kl": 3.983573118597269,
"learning_rate": 1.699476644133778e-07,
"loss": 0.0122,
"reward": 1.7488954544067383,
"reward_std": 0.2558127373456955,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6269477069377899,
"step": 4270,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.00913497168221511,
"clip_ratio/high_mean": 0.0011987716374278535,
"clip_ratio/low_mean": 0.0007998564062290825,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019986280538432767,
"completion_length": 87.73999938964843,
"epoch": 0.8222072807607338,
"grad_norm": 4.567457675933838,
"kl": 0.6975361555814743,
"learning_rate": 1.6849997056559662e-07,
"loss": -0.0116,
"reward": 1.7202219009399413,
"reward_std": 0.27057143300771713,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.6169859290122985,
"step": 4280,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.016106344643048942,
"clip_ratio/high_mean": 0.002376156343962066,
"clip_ratio/low_mean": 5.540161509998143e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024315579648828134,
"completion_length": 94.21750030517578,
"epoch": 0.8241283258092402,
"grad_norm": 17.505773544311523,
"kl": 1.1381098613142968,
"learning_rate": 1.670661800348644e-07,
"loss": -0.0006,
"reward": 1.7664429664611816,
"reward_std": 0.283676877617836,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.6369715094566345,
"step": 4290,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.05613061334006488,
"clip_ratio/high_mean": 0.0073514855874236675,
"clip_ratio/low_mean": 0.00012296391359996052,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0074744494573678825,
"completion_length": 94.24500122070313,
"epoch": 0.8260493708577467,
"grad_norm": 36.729576110839844,
"kl": 2.2444246262311935,
"learning_rate": 1.656463450739801e-07,
"loss": 0.0024,
"reward": 1.7431164741516114,
"reward_std": 0.29661422967910767,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.6268707036972045,
"step": 4300,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.003962649451568723,
"clip_ratio/high_mean": 0.0005655559070874006,
"clip_ratio/low_mean": 0.00023454214970115573,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008000980655197054,
"completion_length": 91.94250030517578,
"epoch": 0.827970415906253,
"grad_norm": 5.331088066101074,
"kl": 0.6558065637946129,
"learning_rate": 1.6424051742714851e-07,
"loss": 0.0002,
"reward": 1.76786208152771,
"reward_std": 0.17127570807933806,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.6361185550689697,
"step": 4310,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.007816581195220352,
"clip_ratio/high_mean": 0.001516599569004029,
"clip_ratio/low_mean": 4.7630388871766625e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015642299549654126,
"completion_length": 82.7000015258789,
"epoch": 0.8298914609547594,
"grad_norm": 9.622750282287598,
"kl": 0.9509772717952728,
"learning_rate": 1.6284874832809436e-07,
"loss": 0.0023,
"reward": 1.9346927881240845,
"reward_std": 0.3074748650193214,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.7189089298248291,
"step": 4320,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.02933923137607053,
"clip_ratio/high_mean": 0.004640504893905018,
"clip_ratio/low_mean": 0.00011013215407729149,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004750637047982309,
"completion_length": 88.08000183105469,
"epoch": 0.8318125060032657,
"grad_norm": 1.8961539268493652,
"kl": 1.2761327236890794,
"learning_rate": 1.614710884981951e-07,
"loss": 0.0002,
"reward": 1.5815791606903076,
"reward_std": 0.24661691784858703,
"rewards/code_format_reward": 0.9862499833106995,
"rewards/code_reward": 0.5442270636558533,
"step": 4330,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.035589413810521366,
"clip_ratio/high_mean": 0.005689902242738754,
"clip_ratio/low_mean": 0.00015636042662663384,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005846262606792152,
"completion_length": 89.16500244140624,
"epoch": 0.8337335510517722,
"grad_norm": 1.6006284952163696,
"kl": 0.6420656457543373,
"learning_rate": 1.6010758814463287e-07,
"loss": 0.0027,
"reward": 1.643228530883789,
"reward_std": 0.2129346549510956,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.5741142451763153,
"step": 4340,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.007347659638617188,
"clip_ratio/high_mean": 0.001005946182704065,
"clip_ratio/low_mean": 0.00028780620195902886,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012937523904838599,
"completion_length": 98.85000152587891,
"epoch": 0.8356545961002786,
"grad_norm": 5.479083061218262,
"kl": 0.3409851986914873,
"learning_rate": 1.5875829695856406e-07,
"loss": -0.0007,
"reward": 1.882705855369568,
"reward_std": 0.22037020921707154,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.6938528895378113,
"step": 4350,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.032100778096355496,
"clip_ratio/high_mean": 0.004409284892608412,
"clip_ratio/low_mean": 4.9924499762710184e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004459209358901717,
"completion_length": 90.73500213623046,
"epoch": 0.8375756411487849,
"grad_norm": 56.100852966308594,
"kl": 0.22566271349787712,
"learning_rate": 1.5742326411330942e-07,
"loss": 0.0011,
"reward": 1.8064903020858765,
"reward_std": 0.1691088706254959,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.65418261885643,
"step": 4360,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.005500979837961495,
"clip_ratio/high_mean": 0.0008934643206885085,
"clip_ratio/low_mean": 0.0005694760067854077,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001462940318742767,
"completion_length": 93.03750305175781,
"epoch": 0.8394966861972913,
"grad_norm": 7.828958034515381,
"kl": 0.6565275602042675,
"learning_rate": 1.5610253826256036e-07,
"loss": 0.003,
"reward": 1.7732144832611083,
"reward_std": 0.33924323320388794,
"rewards/code_format_reward": 0.9825000047683716,
"rewards/code_reward": 0.6409822225570678,
"step": 4370,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.0038046793546527625,
"clip_ratio/high_mean": 0.0004755849193315953,
"clip_ratio/low_mean": 0.0006387827248545364,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011143676441861317,
"completion_length": 85.95250244140625,
"epoch": 0.8414177312457977,
"grad_norm": 3.0064802169799805,
"kl": 9.46174124404788,
"learning_rate": 1.5479616753860792e-07,
"loss": 0.0195,
"reward": 1.8130270481109618,
"reward_std": 0.1679749459028244,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6583885312080383,
"step": 4380,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.020394568890333177,
"clip_ratio/high_mean": 0.002549321111291647,
"clip_ratio/low_mean": 0.0012285682838410138,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003777889395132661,
"completion_length": 94.72500305175781,
"epoch": 0.8433387762943041,
"grad_norm": 8.23426342010498,
"kl": 0.3538756832480431,
"learning_rate": 1.5350419955058645e-07,
"loss": -0.0046,
"reward": 1.6075192928314208,
"reward_std": 0.16927714347839357,
"rewards/code_format_reward": 0.9962499976158142,
"rewards/code_reward": 0.5546970963478088,
"step": 4390,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.06588131491444074,
"clip_ratio/high_mean": 0.009102540424646578,
"clip_ratio/low_mean": 0.0007730728961178101,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009875613666372374,
"completion_length": 90.04500274658203,
"epoch": 0.8452598213428105,
"grad_norm": 7.7215776443481445,
"kl": 0.2362464390695095,
"learning_rate": 1.522266813827407e-07,
"loss": 0.0036,
"reward": 1.8586368560791016,
"reward_std": 0.2194239765405655,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6805683970451355,
"step": 4400,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.004174966411665082,
"clip_ratio/high_mean": 0.0007423789938911796,
"clip_ratio/low_mean": 7.375134955509566e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008161303412634879,
"completion_length": 88.90750122070312,
"epoch": 0.8471808663913168,
"grad_norm": 2.829716920852661,
"kl": 1.5581397600471973,
"learning_rate": 1.509636595927078e-07,
"loss": 0.003,
"reward": 1.9052275657653808,
"reward_std": 0.256375952064991,
"rewards/code_format_reward": 0.9787499904632568,
"rewards/code_reward": 0.7079262495040893,
"step": 4410,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.07640773041639477,
"clip_ratio/high_mean": 0.009898414360941387,
"clip_ratio/low_mean": 9.467430354561656e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00999308866157662,
"completion_length": 95.14500122070312,
"epoch": 0.8491019114398233,
"grad_norm": 0.3017069101333618,
"kl": 0.8497596487402916,
"learning_rate": 1.4971518020982232e-07,
"loss": -0.0017,
"reward": 1.5574845552444458,
"reward_std": 0.1220448928885162,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5321797609329224,
"step": 4420,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.02955477687064558,
"clip_ratio/high_mean": 0.00414732932113111,
"clip_ratio/low_mean": 4.42216987721622e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00419155100826174,
"completion_length": 99.7,
"epoch": 0.8510229564883296,
"grad_norm": 5.942404747009277,
"kl": 0.5168043114244938,
"learning_rate": 1.4848128873343773e-07,
"loss": -0.0003,
"reward": 1.6633994817733764,
"reward_std": 0.2619109332561493,
"rewards/code_format_reward": 0.9762499928474426,
"rewards/code_reward": 0.5876372039318085,
"step": 4430,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.014663098810706288,
"clip_ratio/high_mean": 0.0024809099428239278,
"clip_ratio/low_mean": 3.1672295881435276e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002512582238705363,
"completion_length": 100.20750122070312,
"epoch": 0.852944001536836,
"grad_norm": 3.1078836917877197,
"kl": 0.39873379915952684,
"learning_rate": 1.4726203013126844e-07,
"loss": 0.006,
"reward": 1.7631917238235473,
"reward_std": 0.22433922737836837,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.6350333511829376,
"step": 4440,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.014706605696119368,
"clip_ratio/high_mean": 0.00257694432802964,
"clip_ratio/low_mean": 0.00032811136916279795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002905055697192438,
"completion_length": 99.6875,
"epoch": 0.8548650465853425,
"grad_norm": 8.708415985107422,
"kl": 0.4677444875240326,
"learning_rate": 1.4605744883775122e-07,
"loss": -0.0036,
"reward": 1.8840698957443238,
"reward_std": 0.2510286644101143,
"rewards/code_format_reward": 0.9850000143051147,
"rewards/code_reward": 0.6957849264144897,
"step": 4450,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.008664844953455032,
"clip_ratio/high_mean": 0.0019024941400857642,
"clip_ratio/low_mean": 0.002259151160251349,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004161645277054049,
"completion_length": 89.9000015258789,
"epoch": 0.8567860916338488,
"grad_norm": 7.514847755432129,
"kl": 0.3879747323691845,
"learning_rate": 1.4486758875242557e-07,
"loss": -0.0046,
"reward": 1.9147763013839723,
"reward_std": 0.2857444554567337,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.7102006316184998,
"step": 4460,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.01213214877061546,
"clip_ratio/high_mean": 0.0017360628451569937,
"clip_ratio/low_mean": 0.0008027118048630655,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002538774654385634,
"completion_length": 100.34250183105469,
"epoch": 0.8587071366823552,
"grad_norm": 4.4957380294799805,
"kl": 0.7120470233261585,
"learning_rate": 1.436924932383341e-07,
"loss": -0.0029,
"reward": 1.7210463523864745,
"reward_std": 0.348609185218811,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.6164606809616089,
"step": 4470,
"zero_std_ratio": 0.375
},
{
"clip_ratio/high_max": 0.04909939672797918,
"clip_ratio/high_mean": 0.006639666750561446,
"clip_ratio/low_mean": 0.0001961415633559227,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006835808313917368,
"completion_length": 89.65500030517578,
"epoch": 0.8606281817308616,
"grad_norm": 0.6291245818138123,
"kl": 0.914973171055317,
"learning_rate": 1.4253220512044194e-07,
"loss": 0.0052,
"reward": 1.5310453414916991,
"reward_std": 0.2040669571608305,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.519272655248642,
"step": 4480,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.023789329756982624,
"clip_ratio/high_mean": 0.0034216867323266344,
"clip_ratio/low_mean": 6.479026051238179e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003486476981197484,
"completion_length": 94.68500061035157,
"epoch": 0.862549226779368,
"grad_norm": 3.6435203552246094,
"kl": 0.24871882200241088,
"learning_rate": 1.4138676668407637e-07,
"loss": -0.004,
"reward": 1.7728254079818726,
"reward_std": 0.21846108362078667,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6386001646518707,
"step": 4490,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.02858473571250215,
"clip_ratio/high_mean": 0.004445229801058303,
"clip_ratio/low_mean": 0.005347848287783563,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009793078135407996,
"completion_length": 94.14000091552734,
"epoch": 0.8644702718278744,
"grad_norm": 7.250815391540527,
"kl": 1.268965845555067,
"learning_rate": 1.402562196733855e-07,
"loss": 0.1222,
"reward": 1.6482325553894044,
"reward_std": 0.321136474609375,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5813037693500519,
"step": 4500,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.0016861034324392675,
"clip_ratio/high_mean": 0.00025714511721162123,
"clip_ratio/low_mean": 8.047257215366699e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003376176857273094,
"completion_length": 89.82750244140625,
"epoch": 0.8663913168763807,
"grad_norm": 1.5697243213653564,
"kl": 0.3187939524650574,
"learning_rate": 1.3914060528981713e-07,
"loss": -0.0008,
"reward": 1.6549904108047486,
"reward_std": 0.15924324840307236,
"rewards/code_format_reward": 0.9912499785423279,
"rewards/code_reward": 0.5796826839447021,
"step": 4510,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.005441831634379923,
"clip_ratio/high_mean": 0.0007462791429134086,
"clip_ratio/low_mean": 0.0008039395906962454,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015502187496167607,
"completion_length": 97.60250091552734,
"epoch": 0.8683123619248871,
"grad_norm": 2.864607334136963,
"kl": 0.36184127181768416,
"learning_rate": 1.38039964190617e-07,
"loss": -0.0068,
"reward": 1.5000358819961548,
"reward_std": 0.22264644205570222,
"rewards/code_format_reward": 0.9850000023841858,
"rewards/code_reward": 0.5037679553031922,
"step": 4520,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.045888486225157975,
"clip_ratio/high_mean": 0.006488210440147668,
"clip_ratio/low_mean": 4.380840982776135e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006532018849975429,
"completion_length": 107.06000061035157,
"epoch": 0.8702334069733936,
"grad_norm": 3.5723934173583984,
"kl": 0.21280892938375473,
"learning_rate": 1.369543364873474e-07,
"loss": 0.0008,
"reward": 1.8976154088974,
"reward_std": 0.22375442534685136,
"rewards/code_format_reward": 0.9737499952316284,
"rewards/code_reward": 0.7053701996803283,
"step": 4530,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.021734172268770634,
"clip_ratio/high_mean": 0.00285033899708651,
"clip_ratio/low_mean": 0.00015430593703058548,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003004644898464903,
"completion_length": 90.5125,
"epoch": 0.8721544520218999,
"grad_norm": 26.33332633972168,
"kl": 16.64756402745843,
"learning_rate": 1.3588376174442495e-07,
"loss": 0.0407,
"reward": 1.8465018033981324,
"reward_std": 0.26863393038511274,
"rewards/code_format_reward": 0.9900000095367432,
"rewards/code_reward": 0.6757509171962738,
"step": 4540,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.01540006476570852,
"clip_ratio/high_mean": 0.00195431642132462,
"clip_ratio/low_mean": 0.0003294220077805221,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022837384400190785,
"completion_length": 91.96750183105469,
"epoch": 0.8740754970704063,
"grad_norm": 5.637061595916748,
"kl": 0.5615961387753486,
"learning_rate": 1.348282789776792e-07,
"loss": 0.0006,
"reward": 1.7335857629776001,
"reward_std": 0.16677757501602172,
"rewards/code_format_reward": 0.9712500095367431,
"rewards/code_reward": 0.6239803791046142,
"step": 4550,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.012607228197157382,
"clip_ratio/high_mean": 0.0017772652208805084,
"clip_ratio/low_mean": 0.00020470973395276816,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019819749519228934,
"completion_length": 90.42000122070313,
"epoch": 0.8759965421189126,
"grad_norm": 4.87404727935791,
"kl": 0.5051522366702557,
"learning_rate": 1.3378792665293032e-07,
"loss": -0.0007,
"reward": 1.8114176988601685,
"reward_std": 0.27143858969211576,
"rewards/code_format_reward": 0.9687499880790711,
"rewards/code_reward": 0.6635213375091553,
"step": 4560,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.003521555650513619,
"clip_ratio/high_mean": 0.0005311336179147474,
"clip_ratio/low_mean": 0.00039719248161418363,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009283260951633565,
"completion_length": 96.31000061035157,
"epoch": 0.8779175871674191,
"grad_norm": 3.5294971466064453,
"kl": 0.44891551434993743,
"learning_rate": 1.3276274268458749e-07,
"loss": -0.0011,
"reward": 1.8015916109085084,
"reward_std": 0.23535949736833572,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.65454580783844,
"step": 4570,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.016813984792679548,
"clip_ratio/high_mean": 0.0026305554260034115,
"clip_ratio/low_mean": 0.00013278115511639044,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0027633365796646105,
"completion_length": 92.12000122070313,
"epoch": 0.8798386322159255,
"grad_norm": 3.3064281940460205,
"kl": 147.6638460204005,
"learning_rate": 1.3175276443426704e-07,
"loss": 0.3018,
"reward": 1.8557111263275146,
"reward_std": 0.21927002370357512,
"rewards/code_format_reward": 0.9924999833106994,
"rewards/code_reward": 0.6797305464744567,
"step": 4580,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.004338507051579654,
"clip_ratio/high_mean": 0.0005835221760207787,
"clip_ratio/low_mean": 9.975638386094943e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006832785555161535,
"completion_length": 96.14250183105469,
"epoch": 0.8817596772644318,
"grad_norm": 5.933443546295166,
"kl": 0.7469953082501888,
"learning_rate": 1.3075802870943102e-07,
"loss": -0.0005,
"reward": 1.7140401601791382,
"reward_std": 0.32567469477653505,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.6145200908184052,
"step": 4590,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.008221420878544449,
"clip_ratio/high_mean": 0.0010466745734447613,
"clip_ratio/low_mean": 0.00021989296365063638,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012665675370953978,
"completion_length": 96.91000213623047,
"epoch": 0.8836807223129383,
"grad_norm": 3.6585068702697754,
"kl": 0.2884219281375408,
"learning_rate": 1.2977857176204554e-07,
"loss": -0.0014,
"reward": 1.745366358757019,
"reward_std": 0.28437634110450744,
"rewards/code_format_reward": 0.9612499952316285,
"rewards/code_reward": 0.6323706865310669,
"step": 4600,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.013106013461947442,
"clip_ratio/high_mean": 0.001983167743310332,
"clip_ratio/low_mean": 0.0010545071098022162,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0030376748647540806,
"completion_length": 95.46000366210937,
"epoch": 0.8856017673614446,
"grad_norm": 3.166572332382202,
"kl": 0.7999920375645161,
"learning_rate": 1.2881442928725997e-07,
"loss": 0.0024,
"reward": 1.7604058027267455,
"reward_std": 0.1588110476732254,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6342653870582581,
"step": 4610,
"zero_std_ratio": 0.725
},
{
"clip_ratio/high_max": 0.03971324802841991,
"clip_ratio/high_mean": 0.005240282195154577,
"clip_ratio/low_mean": 0.00012449334171833472,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005364775560155977,
"completion_length": 91.37250213623047,
"epoch": 0.887522812409951,
"grad_norm": 1.2688926458358765,
"kl": 52.058202140033245,
"learning_rate": 1.2786563642210536e-07,
"loss": 0.1059,
"reward": 1.6578764081001283,
"reward_std": 0.1922210179269314,
"rewards/code_format_reward": 0.9724999904632569,
"rewards/code_reward": 0.5858131945133209,
"step": 4620,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.014312215382233262,
"clip_ratio/high_mean": 0.002295189391588792,
"clip_ratio/low_mean": 0.0010927254450507462,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033879148249980062,
"completion_length": 92.44750061035157,
"epoch": 0.8894438574584574,
"grad_norm": 1.0473991632461548,
"kl": 0.48051133900880816,
"learning_rate": 1.269322277442151e-07,
"loss": 0.0015,
"reward": 1.8454564094543457,
"reward_std": 0.23949076235294342,
"rewards/code_format_reward": 0.9824999809265137,
"rewards/code_reward": 0.6771032094955445,
"step": 4630,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.040262592025101185,
"clip_ratio/high_mean": 0.005372756696306169,
"clip_ratio/low_mean": 0.0007898360927356407,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006162592757027597,
"completion_length": 84.6050018310547,
"epoch": 0.8913649025069638,
"grad_norm": 6.553028106689453,
"kl": 0.6895815744996071,
"learning_rate": 1.2601423727056346e-07,
"loss": -0.0001,
"reward": 1.6561978340148926,
"reward_std": 0.36703028678894045,
"rewards/code_format_reward": 0.975,
"rewards/code_reward": 0.5843489110469818,
"step": 4640,
"zero_std_ratio": 0.325
},
{
"clip_ratio/high_max": 0.06538669131696224,
"clip_ratio/high_mean": 0.009138646663632243,
"clip_ratio/low_mean": 0.0017342485502013006,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010872895480133593,
"completion_length": 88.39750061035156,
"epoch": 0.8932859475554702,
"grad_norm": 4.167427062988281,
"kl": 1.728559673577547,
"learning_rate": 1.2511169845622699e-07,
"loss": 0.0019,
"reward": 1.6277015209197998,
"reward_std": 0.21625073552131652,
"rewards/code_format_reward": 0.975000011920929,
"rewards/code_reward": 0.5701007604598999,
"step": 4650,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.042488472175318745,
"clip_ratio/high_mean": 0.005791870540997479,
"clip_ratio/low_mean": 2.0525451691355557e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005812396005785559,
"completion_length": 92.57250213623047,
"epoch": 0.8952069926039765,
"grad_norm": 6.026858806610107,
"kl": 0.7588046140968799,
"learning_rate": 1.2422464419316432e-07,
"loss": 0.0034,
"reward": 1.7008742094039917,
"reward_std": 0.27438378930091856,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.6079370617866516,
"step": 4660,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.013316971366293728,
"clip_ratio/high_mean": 0.0019765587523579596,
"clip_ratio/low_mean": 8.563735173083842e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002062196109909564,
"completion_length": 93.35000305175781,
"epoch": 0.897128037652483,
"grad_norm": 4.863064765930176,
"kl": 7.6627843722701074,
"learning_rate": 1.233531068090184e-07,
"loss": 0.011,
"reward": 1.8806322813034058,
"reward_std": 0.28162118047475815,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.6931286633014679,
"step": 4670,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.004586372757330537,
"clip_ratio/high_mean": 0.0006299379543634132,
"clip_ratio/low_mean": 1.7313018906861545e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006472509849118069,
"completion_length": 92.36500091552735,
"epoch": 0.8990490827009894,
"grad_norm": 2.1931703090667725,
"kl": 0.2519014351069927,
"learning_rate": 1.2249711806593762e-07,
"loss": 0.0034,
"reward": 1.8040930509567261,
"reward_std": 0.24223610311746596,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.6561090111732483,
"step": 4680,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.006883417209610343,
"clip_ratio/high_mean": 0.0009808192204218357,
"clip_ratio/low_mean": 0.00029269491278682835,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001273514133208664,
"completion_length": 91.60500030517578,
"epoch": 0.9009701277494957,
"grad_norm": 21.0294132232666,
"kl": 0.25964570268988607,
"learning_rate": 1.2165670915941866e-07,
"loss": -0.0043,
"reward": 1.9244711637496947,
"reward_std": 0.1629927098751068,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.7156730651855469,
"step": 4690,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.010230390657670795,
"clip_ratio/high_mean": 0.0014585633180104196,
"clip_ratio/low_mean": 3.0266345129348336e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001488829671870917,
"completion_length": 87.84500274658203,
"epoch": 0.9028911727980021,
"grad_norm": 1.7218002080917358,
"kl": 16.447690600901844,
"learning_rate": 1.2083191071716937e-07,
"loss": 0.0339,
"reward": 1.940086579322815,
"reward_std": 0.16455088555812836,
"rewards/code_format_reward": 0.993749988079071,
"rewards/code_reward": 0.7216057777404785,
"step": 4700,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.025706328079104425,
"clip_ratio/high_mean": 0.003573437442537397,
"clip_ratio/low_mean": 3.392130311112851e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0036073587427381424,
"completion_length": 81.84500274658203,
"epoch": 0.9048122178465084,
"grad_norm": 0.22543705999851227,
"kl": 0.31741214692592623,
"learning_rate": 1.2002275279799288e-07,
"loss": -0.0056,
"reward": 1.8292718410491944,
"reward_std": 0.12828939855098725,
"rewards/code_format_reward": 0.9987499952316284,
"rewards/code_reward": 0.6649484276771546,
"step": 4710,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.009241180948447437,
"clip_ratio/high_mean": 0.0013442957555525937,
"clip_ratio/low_mean": 4.643963038688526e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013907353932154365,
"completion_length": 95.63500213623047,
"epoch": 0.9067332628950149,
"grad_norm": 5.23514986038208,
"kl": 0.804936108738184,
"learning_rate": 1.192292648906918e-07,
"loss": 0.0031,
"reward": 1.925449275970459,
"reward_std": 0.2213977299630642,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.7152246475219727,
"step": 4720,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.021669640118489042,
"clip_ratio/high_mean": 0.003889294656983111,
"clip_ratio/low_mean": 0.00044275675172684715,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004332051414530724,
"completion_length": 92.25250244140625,
"epoch": 0.9086543079435213,
"grad_norm": 66.00515747070312,
"kl": 2.1086502872407435,
"learning_rate": 1.1845147591299378e-07,
"loss": 0.0162,
"reward": 1.5327723979949952,
"reward_std": 0.2872114762663841,
"rewards/code_format_reward": 0.9725000023841858,
"rewards/code_reward": 0.5232611894607544,
"step": 4730,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.006079713994404301,
"clip_ratio/high_mean": 0.0010439059922646265,
"clip_ratio/low_mean": 0.0013928397551353556,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002436745767045068,
"completion_length": 97.43000030517578,
"epoch": 0.9105753529920276,
"grad_norm": 2.8770546913146973,
"kl": 3.1866038836538793,
"learning_rate": 1.1768941421049768e-07,
"loss": 0.0069,
"reward": 1.7776832818984984,
"reward_std": 0.29561240673065187,
"rewards/code_format_reward": 0.9949999928474427,
"rewards/code_reward": 0.6400915861129761,
"step": 4740,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.005499497149139642,
"clip_ratio/high_mean": 0.0006874371436424553,
"clip_ratio/low_mean": 0.000482194940559566,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011696320783812554,
"completion_length": 88.9500015258789,
"epoch": 0.9124963980405341,
"grad_norm": 8.540057182312012,
"kl": 0.9072364956140518,
"learning_rate": 1.1694310755564014e-07,
"loss": -0.0021,
"reward": 1.6791202545166015,
"reward_std": 0.326928648352623,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.595185148715973,
"step": 4750,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.008684736292343587,
"clip_ratio/high_mean": 0.001148225087672472,
"clip_ratio/low_mean": 0.0005504319502506405,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016986570524750277,
"completion_length": 95.68250122070313,
"epoch": 0.9144174430890404,
"grad_norm": 4.539205551147461,
"kl": 0.860013198107481,
"learning_rate": 1.1621258314668402e-07,
"loss": 0.0,
"reward": 1.7214089155197143,
"reward_std": 0.1847836285829544,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.6188294351100921,
"step": 4760,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.012012088089250028,
"clip_ratio/high_mean": 0.0020424059097422288,
"clip_ratio/low_mean": 7.006222731433808e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021124681399669496,
"completion_length": 93.21750183105469,
"epoch": 0.9163384881375468,
"grad_norm": 6.60590124130249,
"kl": 0.45315413996577264,
"learning_rate": 1.1549786760672676e-07,
"loss": -0.0013,
"reward": 1.7664082288742065,
"reward_std": 0.24015129953622819,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.6369540929794312,
"step": 4770,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.028257530624978246,
"clip_ratio/high_mean": 0.00573968501703348,
"clip_ratio/low_mean": 0.00028595919138751924,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006025644272449426,
"completion_length": 93.59000244140626,
"epoch": 0.9182595331860532,
"grad_norm": 3.693448781967163,
"kl": 0.5863466400653123,
"learning_rate": 1.1479898698273037e-07,
"loss": 0.0001,
"reward": 1.7522862911224366,
"reward_std": 0.24038469642400742,
"rewards/code_format_reward": 0.9762500047683715,
"rewards/code_reward": 0.6320806205272674,
"step": 4780,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.006044295988976956,
"clip_ratio/high_mean": 0.0008545084856450558,
"clip_ratio/low_mean": 0.0004956311546266079,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013501396053470672,
"completion_length": 100.22000122070312,
"epoch": 0.9201805782345596,
"grad_norm": 17.894886016845703,
"kl": 0.33935268595814705,
"learning_rate": 1.1411596674457193e-07,
"loss": -0.0019,
"reward": 1.697510004043579,
"reward_std": 0.16087576895952224,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6018799901008606,
"step": 4790,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.005959878279827535,
"clip_ratio/high_mean": 0.0009170519857434556,
"clip_ratio/low_mean": 7.972503372002392e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009967770281946286,
"completion_length": 98.54750213623046,
"epoch": 0.922101623283066,
"grad_norm": 3.242460250854492,
"kl": 0.46977903619408606,
"learning_rate": 1.1344883178411565e-07,
"loss": -0.0036,
"reward": 1.7927821159362793,
"reward_std": 0.24044746458530425,
"rewards/code_format_reward": 0.9699999809265136,
"rewards/code_reward": 0.6538910627365112,
"step": 4800,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.00756604690104723,
"clip_ratio/high_mean": 0.0010185762541368604,
"clip_ratio/low_mean": 0.00016034738000598737,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001178923639236018,
"completion_length": 99.49000091552735,
"epoch": 0.9240226683315724,
"grad_norm": 7.225472927093506,
"kl": 0.2285786397755146,
"learning_rate": 1.1279760641430568e-07,
"loss": 0.0001,
"reward": 1.7233760595321654,
"reward_std": 0.22306990921497344,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6148130118846893,
"step": 4810,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.011060118256136776,
"clip_ratio/high_mean": 0.0017047788191121072,
"clip_ratio/low_mean": 0.00028281604463700207,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019875948812114073,
"completion_length": 92.36500091552735,
"epoch": 0.9259437133800787,
"grad_norm": 4.390386581420898,
"kl": 0.8032988727092742,
"learning_rate": 1.1216231436827974e-07,
"loss": 0.0005,
"reward": 1.7829072952270508,
"reward_std": 0.21434771865606309,
"rewards/code_format_reward": 0.9862500071525574,
"rewards/code_reward": 0.6448911607265473,
"step": 4820,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.013947398256277665,
"clip_ratio/high_mean": 0.0018392194229818414,
"clip_ratio/low_mean": 0.0004060613617184572,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002245280790521065,
"completion_length": 103.5125015258789,
"epoch": 0.9278647584285852,
"grad_norm": 6.774899482727051,
"kl": 0.34710453301668165,
"learning_rate": 1.1154297879850462e-07,
"loss": 0.0003,
"reward": 1.7023445606231689,
"reward_std": 0.23593612909317016,
"rewards/code_format_reward": 0.96875,
"rewards/code_reward": 0.60898477435112,
"step": 4830,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.00909699429757893,
"clip_ratio/high_mean": 0.0014705892943311482,
"clip_ratio/low_mean": 0.0004355724740889855,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019061617698753253,
"completion_length": 91.47000274658203,
"epoch": 0.9297858034770915,
"grad_norm": 1.7924318313598633,
"kl": 0.5235365644097328,
"learning_rate": 1.1093962227593214e-07,
"loss": 0.0017,
"reward": 1.823938512802124,
"reward_std": 0.18318418860435487,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.6650941967964172,
"step": 4840,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.008868952537886799,
"clip_ratio/high_mean": 0.0013198618631577118,
"clip_ratio/low_mean": 6.896776030771435e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013888296205550432,
"completion_length": 97.04750061035156,
"epoch": 0.9317068485255979,
"grad_norm": 5.492427825927734,
"kl": 0.27957614585757257,
"learning_rate": 1.1035226678917662e-07,
"loss": 0.0001,
"reward": 1.7743586778640748,
"reward_std": 0.19067177027463914,
"rewards/code_format_reward": 0.9699999928474426,
"rewards/code_reward": 0.6446793019771576,
"step": 4850,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.00021865542512387038,
"clip_ratio/high_mean": 2.7331928140483797e-05,
"clip_ratio/low_mean": 0.00022580694640055298,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0002531388745410368,
"completion_length": 91.65750274658203,
"epoch": 0.9336278935741044,
"grad_norm": 8.045164108276367,
"kl": 0.20759812816977502,
"learning_rate": 1.0978093374371373e-07,
"loss": -0.0004,
"reward": 1.7663999795913696,
"reward_std": 0.281513449549675,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.6353874802589417,
"step": 4860,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.02832627217285335,
"clip_ratio/high_mean": 0.0035600741393864155,
"clip_ratio/low_mean": 0.00011176664993399754,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0036718408693559466,
"completion_length": 84.46500244140626,
"epoch": 0.9355489386226107,
"grad_norm": 4.819484233856201,
"kl": 0.5664212189614772,
"learning_rate": 1.0922564396109993e-07,
"loss": -0.0008,
"reward": 1.7755849838256836,
"reward_std": 0.20761601328849794,
"rewards/code_format_reward": 0.9899999856948852,
"rewards/code_reward": 0.640292489528656,
"step": 4870,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.006872700434178114,
"clip_ratio/high_mean": 0.0009693403088022023,
"clip_ratio/low_mean": 3.415665923967026e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010034969680418726,
"completion_length": 92.47500152587891,
"epoch": 0.9374699836711171,
"grad_norm": 2.605060338973999,
"kl": 0.6489929877221584,
"learning_rate": 1.0868641767821432e-07,
"loss": -0.0041,
"reward": 1.9151075601577758,
"reward_std": 0.2566168040037155,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.7113037467002868,
"step": 4880,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.018258474441245197,
"clip_ratio/high_mean": 0.003355332469800487,
"clip_ratio/low_mean": 0.000607103164657019,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003962435649009421,
"completion_length": 90.51000366210937,
"epoch": 0.9393910287196234,
"grad_norm": 4.408846378326416,
"kl": 0.35625301077961924,
"learning_rate": 1.0816327454652044e-07,
"loss": -0.0018,
"reward": 1.7154739379882813,
"reward_std": 0.2987362504005432,
"rewards/code_format_reward": 0.9612499952316285,
"rewards/code_reward": 0.6174244284629822,
"step": 4890,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.010325380798894912,
"clip_ratio/high_mean": 0.0015298718310077675,
"clip_ratio/low_mean": 0.000294900168228196,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018247719475766645,
"completion_length": 100.19250183105468,
"epoch": 0.9413120737681299,
"grad_norm": 9.08279037475586,
"kl": 0.23486268445849418,
"learning_rate": 1.0765623363135061e-07,
"loss": -0.0011,
"reward": 1.5800267338752747,
"reward_std": 0.26311944872140886,
"rewards/code_format_reward": 0.9862499952316284,
"rewards/code_reward": 0.5434508502483368,
"step": 4900,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.004708675656002015,
"clip_ratio/high_mean": 0.0008911975004593842,
"clip_ratio/low_mean": 0.0001348661200609058,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001026063623430673,
"completion_length": 85.6300033569336,
"epoch": 0.9432331188166363,
"grad_norm": 2.5798628330230713,
"kl": 0.5353534445166588,
"learning_rate": 1.071653134112109e-07,
"loss": -0.0018,
"reward": 1.7293733358383179,
"reward_std": 0.23426424115896224,
"rewards/code_format_reward": 0.9862499833106995,
"rewards/code_reward": 0.6181241631507873,
"step": 4910,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.031931064534001054,
"clip_ratio/high_mean": 0.004416047394624911,
"clip_ratio/low_mean": 0.00037934551510261373,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0047953929373761636,
"completion_length": 93.08500061035156,
"epoch": 0.9451541638651426,
"grad_norm": 3.0407347679138184,
"kl": 0.3617399115115404,
"learning_rate": 1.0669053177710766e-07,
"loss": -0.0023,
"reward": 1.602178120613098,
"reward_std": 0.23843889832496643,
"rewards/code_format_reward": 0.987499988079071,
"rewards/code_reward": 0.5542140543460846,
"step": 4920,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.008546069997828453,
"clip_ratio/high_mean": 0.0011838132908451372,
"clip_ratio/low_mean": 6.596306338906288e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012497763542341999,
"completion_length": 102.79500122070313,
"epoch": 0.947075208913649,
"grad_norm": 5.987438678741455,
"kl": 0.28761252388358116,
"learning_rate": 1.0623190603189566e-07,
"loss": 0.0011,
"reward": 1.5471005201339723,
"reward_std": 0.28855718672275543,
"rewards/code_format_reward": 0.9674999952316284,
"rewards/code_reward": 0.5316752552986145,
"step": 4930,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.03786106104962528,
"clip_ratio/high_mean": 0.005264365172479302,
"clip_ratio/low_mean": 0.001157468621386215,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00642183352902066,
"completion_length": 95.2625015258789,
"epoch": 0.9489962539621554,
"grad_norm": 4.088647842407227,
"kl": 9114.393886435031,
"learning_rate": 1.0578945288964734e-07,
"loss": 18.226,
"reward": 1.5625978589057923,
"reward_std": 0.22688832581043245,
"rewards/code_format_reward": 0.9762499928474426,
"rewards/code_reward": 0.5372364044189453,
"step": 4940,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.009068883489817381,
"clip_ratio/high_mean": 0.0015044378931634128,
"clip_ratio/low_mean": 8.003948896657675e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015844773850403726,
"completion_length": 88.91999969482421,
"epoch": 0.9509172990106618,
"grad_norm": 4.558302879333496,
"kl": 0.322134206071496,
"learning_rate": 1.0536318847504383e-07,
"loss": 0.0008,
"reward": 1.683999252319336,
"reward_std": 0.15837213546037673,
"rewards/code_format_reward": 0.9887500047683716,
"rewards/code_reward": 0.5948120951652527,
"step": 4950,
"zero_std_ratio": 0.65
},
{
"clip_ratio/high_max": 0.004135725944070146,
"clip_ratio/high_mean": 0.0006349837080051657,
"clip_ratio/low_mean": 0.0002028582151979208,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008378419290238526,
"completion_length": 88.58000030517579,
"epoch": 0.9528383440591682,
"grad_norm": 1.3143569231033325,
"kl": 0.32492467686533927,
"learning_rate": 1.0495312832278721e-07,
"loss": 0.001,
"reward": 1.757376217842102,
"reward_std": 0.18446292728185654,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.6318130671977997,
"step": 4960,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.004577422246802599,
"clip_ratio/high_mean": 0.00067823924619006,
"clip_ratio/low_mean": 0.00020590101485140622,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008841402595862746,
"completion_length": 91.77249908447266,
"epoch": 0.9547593891076745,
"grad_norm": 2.7616970539093018,
"kl": 0.6282597549259663,
"learning_rate": 1.0455928737703441e-07,
"loss": 0.0001,
"reward": 1.665701198577881,
"reward_std": 0.1566584974527359,
"rewards/code_format_reward": 0.99375,
"rewards/code_reward": 0.5844130754470825,
"step": 4970,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.012442531622946262,
"clip_ratio/high_mean": 0.0018589732819236815,
"clip_ratio/low_mean": 6.720430101267994e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019261775829363613,
"completion_length": 90.92250213623046,
"epoch": 0.956680434156181,
"grad_norm": 2.84462308883667,
"kl": 0.3018207371234894,
"learning_rate": 1.0418167999085259e-07,
"loss": 0.0041,
"reward": 1.7472755432128906,
"reward_std": 0.24319706559181214,
"rewards/code_format_reward": 0.9774999856948853,
"rewards/code_reward": 0.6292627692222595,
"step": 4980,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.01709002295974642,
"clip_ratio/high_mean": 0.002679444645764306,
"clip_ratio/low_mean": 0.0003698979213368148,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003049342567101121,
"completion_length": 92.97750091552734,
"epoch": 0.9586014792046873,
"grad_norm": 11.978320121765137,
"kl": 1.2294385731220245,
"learning_rate": 1.0382031992569592e-07,
"loss": 0.0036,
"reward": 1.739167046546936,
"reward_std": 0.29275294244289396,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.622708535194397,
"step": 4990,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.007942511793226003,
"clip_ratio/high_mean": 0.001185902243014425,
"clip_ratio/low_mean": 5.571418441832066e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012416164390742779,
"completion_length": 93.31250305175782,
"epoch": 0.9605225242531937,
"grad_norm": 3.364788055419922,
"kl": 0.35085868686437605,
"learning_rate": 1.0347522035090446e-07,
"loss": -0.0003,
"reward": 1.9564055442810058,
"reward_std": 0.2229623466730118,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.7303902268409729,
"step": 5000,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.015932422177866102,
"clip_ratio/high_mean": 0.0028564550855662675,
"clip_ratio/low_mean": 0.00020086783915758134,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003057322936365381,
"completion_length": 96.12750091552735,
"epoch": 0.9624435693017002,
"grad_norm": 5.283419609069824,
"kl": 0.3115640334784985,
"learning_rate": 1.0314639384322356e-07,
"loss": -0.0037,
"reward": 1.6293291807174684,
"reward_std": 0.2581008836627007,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.5693520545959473,
"step": 5010,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.003986756759695708,
"clip_ratio/high_mean": 0.0006319725507637486,
"clip_ratio/low_mean": 0.000603693921584636,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012356664577964694,
"completion_length": 86.04750366210938,
"epoch": 0.9643646143502065,
"grad_norm": 8.70874309539795,
"kl": 0.47548493221402166,
"learning_rate": 1.0283385238634632e-07,
"loss": 0.0041,
"reward": 1.622909712791443,
"reward_std": 0.2179076835513115,
"rewards/code_format_reward": 0.9712499976158142,
"rewards/code_reward": 0.5686423420906067,
"step": 5020,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.005893218703567982,
"clip_ratio/high_mean": 0.000819433806464076,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000819433806464076,
"completion_length": 88.9375015258789,
"epoch": 0.9662856593987129,
"grad_norm": 6.6337199211120605,
"kl": 0.5933880299329758,
"learning_rate": 1.0253760737047606e-07,
"loss": -0.0043,
"reward": 1.7307970523834229,
"reward_std": 0.1557233951985836,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6172735095024109,
"step": 5030,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.010898534208536148,
"clip_ratio/high_mean": 0.0015226851450279356,
"clip_ratio/low_mean": 0.009100449224933981,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010623134509660303,
"completion_length": 87.65500183105469,
"epoch": 0.9682067044472192,
"grad_norm": 12.837902069091797,
"kl": 0.1521947119385004,
"learning_rate": 1.0225766959191187e-07,
"loss": 0.0007,
"reward": 1.766017746925354,
"reward_std": 0.1697022169828415,
"rewards/code_format_reward": 0.9924999952316285,
"rewards/code_reward": 0.6348838567733764,
"step": 5040,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.039930257271043955,
"clip_ratio/high_mean": 0.005228024450480007,
"clip_ratio/low_mean": 0.0012023555900668725,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006430380133679137,
"completion_length": 99.63500213623047,
"epoch": 0.9701277494957257,
"grad_norm": 3.0135834217071533,
"kl": 0.5389343507587909,
"learning_rate": 1.0199404925265473e-07,
"loss": -0.0011,
"reward": 1.5655887126922607,
"reward_std": 0.1425598829984665,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.5365443468093872,
"step": 5050,
"zero_std_ratio": 0.575
},
{
"clip_ratio/high_max": 0.013292990019544959,
"clip_ratio/high_mean": 0.0019570814620237797,
"clip_ratio/low_mean": 0.0004139953598496504,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0023710768204182387,
"completion_length": 92.15750122070312,
"epoch": 0.9720487945442321,
"grad_norm": 8.622629165649414,
"kl": 0.3708019584417343,
"learning_rate": 1.0174675596003588e-07,
"loss": -0.0037,
"reward": 1.6285043001174926,
"reward_std": 0.21171441301703453,
"rewards/code_format_reward": 0.9675000071525574,
"rewards/code_reward": 0.5723771452903748,
"step": 5060,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.011086594103835523,
"clip_ratio/high_mean": 0.001482552892412059,
"clip_ratio/low_mean": 7.31003499822691e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001555653239483945,
"completion_length": 92.72000122070312,
"epoch": 0.9739698395927384,
"grad_norm": 10.519503593444824,
"kl": 0.42225370053201916,
"learning_rate": 1.0151579872636673e-07,
"loss": 0.0073,
"reward": 1.9428821086883545,
"reward_std": 0.2824172407388687,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.7261285543441772,
"step": 5070,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.02070889645256102,
"clip_ratio/high_mean": 0.0035216436022892593,
"clip_ratio/low_mean": 0.0003085655207542004,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038302090688375756,
"completion_length": 105.0050048828125,
"epoch": 0.9758908846412448,
"grad_norm": 4.139841079711914,
"kl": 0.3159520372748375,
"learning_rate": 1.0130118596861028e-07,
"loss": -0.0044,
"reward": 1.6708447217941285,
"reward_std": 0.30501508712768555,
"rewards/code_format_reward": 0.9837499976158142,
"rewards/code_reward": 0.5894848227500915,
"step": 5080,
"zero_std_ratio": 0.4
},
{
"clip_ratio/high_max": 0.008058706868905575,
"clip_ratio/high_mean": 0.0012073565638274885,
"clip_ratio/low_mean": 0.00031636476196581496,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015237213257933036,
"completion_length": 84.28750152587891,
"epoch": 0.9778119296897512,
"grad_norm": 4.015879154205322,
"kl": 0.2918614260852337,
"learning_rate": 1.0110292550807451e-07,
"loss": -0.0012,
"reward": 1.7721335172653199,
"reward_std": 0.286711610853672,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.6385667800903321,
"step": 5090,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.019471552316099407,
"clip_ratio/high_mean": 0.0026317643467336895,
"clip_ratio/low_mean": 0.0003316317946882918,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0029633961850777267,
"completion_length": 90.81000213623047,
"epoch": 0.9797329747382576,
"grad_norm": 1.132954716682434,
"kl": 0.2704964060336351,
"learning_rate": 1.0092102457012717e-07,
"loss": -0.0022,
"reward": 1.6570582151412965,
"reward_std": 0.21210518777370452,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5810291051864624,
"step": 5100,
"zero_std_ratio": 0.5
},
{
"clip_ratio/high_max": 0.011018617497757077,
"clip_ratio/high_mean": 0.0013860319217201323,
"clip_ratio/low_mean": 3.4722223062999547e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014207541418727488,
"completion_length": 93.61250305175781,
"epoch": 0.981654019786764,
"grad_norm": 16.08737564086914,
"kl": 0.26382347345352175,
"learning_rate": 1.0075548978393277e-07,
"loss": -0.0002,
"reward": 1.8070130348205566,
"reward_std": 0.1673865035176277,
"rewards/code_format_reward": 0.9912500023841858,
"rewards/code_reward": 0.6556940078735352,
"step": 5110,
"zero_std_ratio": 0.625
},
{
"clip_ratio/high_max": 0.010004310857038946,
"clip_ratio/high_mean": 0.0012777127660228871,
"clip_ratio/low_mean": 0.0003342236072057858,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016119363936013542,
"completion_length": 89.3,
"epoch": 0.9835750648352704,
"grad_norm": 0.4556010961532593,
"kl": 0.4934497371315956,
"learning_rate": 1.0060632718221066e-07,
"loss": 0.0026,
"reward": 1.3408710062503815,
"reward_std": 0.16168890111148357,
"rewards/code_format_reward": 0.9875,
"rewards/code_reward": 0.42356050610542295,
"step": 5120,
"zero_std_ratio": 0.7
},
{
"clip_ratio/high_max": 0.05311971204355359,
"clip_ratio/high_mean": 0.0075716287479735914,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0075716287479735914,
"completion_length": 102.92249908447266,
"epoch": 0.9854961098837768,
"grad_norm": 3.9305222034454346,
"kl": 0.27781638093292715,
"learning_rate": 1.0047354220101518e-07,
"loss": -0.0011,
"reward": 1.630450439453125,
"reward_std": 0.18297318816185,
"rewards/code_format_reward": 0.9887499928474426,
"rewards/code_reward": 0.5680376827716828,
"step": 5130,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.004692732833791524,
"clip_ratio/high_mean": 0.0006299943852354772,
"clip_ratio/low_mean": 0.00031122941145440565,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009412237734068186,
"completion_length": 88.23250122070313,
"epoch": 0.9874171549322832,
"grad_norm": 4.31157112121582,
"kl": 0.2751577727496624,
"learning_rate": 1.0035713967953797e-07,
"loss": -0.0038,
"reward": 1.635274839401245,
"reward_std": 0.29494107216596605,
"rewards/code_format_reward": 0.9849999904632568,
"rewards/code_reward": 0.5713874340057373,
"step": 5140,
"zero_std_ratio": 0.45
},
{
"clip_ratio/high_max": 0.012987824180163443,
"clip_ratio/high_mean": 0.0019960356265073644,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0019960356265073644,
"completion_length": 86.49750061035157,
"epoch": 0.9893381999807895,
"grad_norm": 7.45393705368042,
"kl": 0.3619408316910267,
"learning_rate": 1.0025712385993115e-07,
"loss": 0.0012,
"reward": 1.687432312965393,
"reward_std": 0.2386924833059311,
"rewards/code_format_reward": 0.9912499904632568,
"rewards/code_reward": 0.5959036707878113,
"step": 5150,
"zero_std_ratio": 0.475
},
{
"clip_ratio/high_max": 0.014351918507600203,
"clip_ratio/high_mean": 0.002000216278975131,
"clip_ratio/low_mean": 7.898250914877281e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020791988128621595,
"completion_length": 89.34250030517578,
"epoch": 0.991259245029296,
"grad_norm": 35.51432418823242,
"kl": 0.2617587223649025,
"learning_rate": 1.0017349838715278e-07,
"loss": -0.004,
"reward": 1.2408424496650696,
"reward_std": 0.21315770447254181,
"rewards/code_format_reward": 0.9774999976158142,
"rewards/code_reward": 0.3760462045669556,
"step": 5160,
"zero_std_ratio": 0.525
},
{
"clip_ratio/high_max": 0.003973034140653908,
"clip_ratio/high_mean": 0.0004966292675817385,
"clip_ratio/low_mean": 6.530825339723379e-06,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005031600929214619,
"completion_length": 99.66499938964844,
"epoch": 0.9931802900778023,
"grad_norm": 2.5418131351470947,
"kl": 0.1747375037521124,
"learning_rate": 1.0010626630883432e-07,
"loss": 0.003,
"reward": 1.421428418159485,
"reward_std": 0.09218620862811804,
"rewards/code_format_reward": 0.9612499952316285,
"rewards/code_reward": 0.4704016923904419,
"step": 5170,
"zero_std_ratio": 0.675
},
{
"clip_ratio/high_max": 0.02674068254418671,
"clip_ratio/high_mean": 0.003380646219011396,
"clip_ratio/low_mean": 9.682812378741801e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034774743369780483,
"completion_length": 92.72500152587891,
"epoch": 0.9951013351263087,
"grad_norm": 6.10137414932251,
"kl": 0.41192906014621256,
"learning_rate": 1.0005543007516928e-07,
"loss": -0.0051,
"reward": 1.5263760328292846,
"reward_std": 0.28926219046115875,
"rewards/code_format_reward": 0.9899999976158143,
"rewards/code_reward": 0.5156879663467407,
"step": 5180,
"zero_std_ratio": 0.425
},
{
"clip_ratio/high_max": 0.1273454572306946,
"clip_ratio/high_mean": 0.016399703072966076,
"clip_ratio/low_mean": 0.0004187120386632159,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016818415274610744,
"completion_length": 81.02250213623047,
"epoch": 0.9970223801748151,
"grad_norm": 7.77527379989624,
"kl": 0.7098278045654297,
"learning_rate": 1.0002099153882402e-07,
"loss": -0.0041,
"reward": 1.6053562879562377,
"reward_std": 0.16601394787430762,
"rewards/code_format_reward": 0.9824999928474426,
"rewards/code_reward": 0.557053166627884,
"step": 5190,
"zero_std_ratio": 0.6
},
{
"clip_ratio/high_max": 0.0028753917664289474,
"clip_ratio/high_mean": 0.00045008738234173504,
"clip_ratio/low_mean": 0.00016858125454746186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000618668642709963,
"completion_length": 95.49250183105468,
"epoch": 0.9989434252233215,
"grad_norm": 6.507387638092041,
"kl": 0.9340068377554417,
"learning_rate": 1.0000295195487024e-07,
"loss": -0.0018,
"reward": 1.4542541027069091,
"reward_std": 0.20283248797059059,
"rewards/code_format_reward": 0.981249988079071,
"rewards/code_reward": 0.4818145722150803,
"step": 5200,
"zero_std_ratio": 0.55
},
{
"clip_ratio/high_max": 0.010171899455599487,
"clip_ratio/high_mean": 0.0014317149762064219,
"clip_ratio/low_mean": 0.00016592920292168856,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015976441791281104,
"completion_length": 90.05000305175781,
"epoch": 0.999711843242724,
"kl": 0.5218422394245863,
"reward": 1.0329873859882355,
"reward_std": 0.19616412371397018,
"rewards/code_format_reward": 0.934374988079071,
"rewards/code_reward": 0.28289994597435,
"step": 5204,
"total_flos": 0.0,
"train_loss": 1756184.5472393532,
"train_runtime": 149594.4727,
"train_samples_per_second": 0.139,
"train_steps_per_second": 0.035,
"zero_std_ratio": 0.5625
}
],
"logging_steps": 10,
"max_steps": 5205,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}