| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.999711843242724, |
| "eval_steps": 500, |
| "global_step": 5204, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 220.77500915527344, |
| "epoch": 0.00019210450485063874, |
| "grad_norm": 2.5577025413513184, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.038, |
| "reward": 0.37062498927116394, |
| "reward_std": 0.34713491797447205, |
| "rewards/code_format_reward": 0.26875001192092896, |
| "rewards/code_reward": 0.11812499910593033, |
| "step": 1, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.01640424354829722, |
| "clip_ratio/high_mean": 0.003707133045989192, |
| "clip_ratio/low_mean": 0.0004983297904901621, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004205462749167863, |
| "completion_length": 164.34375381469727, |
| "epoch": 0.0019210450485063874, |
| "grad_norm": 2.2875964641571045, |
| "kl": 0.13929970601263145, |
| "learning_rate": 9.999947520846931e-07, |
| "loss": 0.0575, |
| "reward": 0.655464380979538, |
| "reward_std": 0.6216425597667694, |
| "rewards/code_format_reward": 0.5078125074505806, |
| "rewards/code_reward": 0.20077905245125294, |
| "step": 10, |
| "zero_std_ratio": 0.125 |
| }, |
| { |
| "clip_ratio/high_max": 0.04116484243422747, |
| "clip_ratio/high_mean": 0.007335515914019197, |
| "clip_ratio/low_mean": 0.00010183055419474841, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007437346538063138, |
| "completion_length": 100.84750213623047, |
| "epoch": 0.003842090097012775, |
| "grad_norm": 2.409867286682129, |
| "kl": 1.1695969879627228, |
| "learning_rate": 9.999734326385416e-07, |
| "loss": -0.0111, |
| "reward": 0.9829235672950745, |
| "reward_std": 0.5127422153949738, |
| "rewards/code_format_reward": 0.84375, |
| "rewards/code_reward": 0.2805242508649826, |
| "step": 20, |
| "zero_std_ratio": 0.075 |
| }, |
| { |
| "clip_ratio/high_max": 0.039504543878138065, |
| "clip_ratio/high_mean": 0.0051267803879454735, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0051267803879454735, |
| "completion_length": 97.57750091552734, |
| "epoch": 0.005763135145519163, |
| "grad_norm": 4.608696460723877, |
| "kl": 2.0701700329780577, |
| "learning_rate": 9.99935714443203e-07, |
| "loss": -0.019, |
| "reward": 1.1568554759025573, |
| "reward_std": 0.6407819569110871, |
| "rewards/code_format_reward": 0.8674999952316285, |
| "rewards/code_reward": 0.3615527212619781, |
| "step": 30, |
| "zero_std_ratio": 0.025 |
| }, |
| { |
| "clip_ratio/high_max": 0.005034898268058896, |
| "clip_ratio/high_mean": 0.0006811309984186664, |
| "clip_ratio/low_mean": 0.00013664715661434456, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008177781579433941, |
| "completion_length": 83.10500030517578, |
| "epoch": 0.00768418019402555, |
| "grad_norm": 4.833131313323975, |
| "kl": 2.2019619703292848, |
| "learning_rate": 9.99881598873272e-07, |
| "loss": -0.02, |
| "reward": 1.1795239448547363, |
| "reward_std": 0.7194581270217896, |
| "rewards/code_format_reward": 0.8987499952316285, |
| "rewards/code_reward": 0.36507447361946105, |
| "step": 40, |
| "zero_std_ratio": 0.05 |
| }, |
| { |
| "clip_ratio/high_max": 0.00872214906848967, |
| "clip_ratio/high_mean": 0.0010902686335612088, |
| "clip_ratio/low_mean": 0.00040422612219117584, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0014944947557523846, |
| "completion_length": 88.22500152587891, |
| "epoch": 0.009605225242531937, |
| "grad_norm": 2.778585433959961, |
| "kl": 2.4498987793922424, |
| "learning_rate": 9.998110879009265e-07, |
| "loss": -0.0035, |
| "reward": 1.2663686752319336, |
| "reward_std": 0.6244019389152526, |
| "rewards/code_format_reward": 0.918750011920929, |
| "rewards/code_reward": 0.40349680185317993, |
| "step": 50, |
| "zero_std_ratio": 0.075 |
| }, |
| { |
| "clip_ratio/high_max": 0.016367838624864815, |
| "clip_ratio/high_mean": 0.002741052128840238, |
| "clip_ratio/low_mean": 0.0008432979579083621, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003584350028540939, |
| "completion_length": 91.3375, |
| "epoch": 0.011526270291038325, |
| "grad_norm": 2.5201120376586914, |
| "kl": 2.7947509050369264, |
| "learning_rate": 9.997241840958557e-07, |
| "loss": 0.005, |
| "reward": 1.0697558522224426, |
| "reward_std": 0.49940577149391174, |
| "rewards/code_format_reward": 0.9200000047683716, |
| "rewards/code_reward": 0.30487790107727053, |
| "step": 60, |
| "zero_std_ratio": 0.025 |
| }, |
| { |
| "clip_ratio/high_max": 0.031629907339811324, |
| "clip_ratio/high_mean": 0.005140213097911328, |
| "clip_ratio/low_mean": 0.003656612744089216, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008796826144680381, |
| "completion_length": 84.79750213623046, |
| "epoch": 0.013447315339544713, |
| "grad_norm": 7.281564712524414, |
| "kl": 1.7218781247735024, |
| "learning_rate": 9.99620890625166e-07, |
| "loss": -0.0261, |
| "reward": 1.1421246886253358, |
| "reward_std": 0.5977877795696258, |
| "rewards/code_format_reward": 0.9275000095367432, |
| "rewards/code_reward": 0.33918734490871427, |
| "step": 70, |
| "zero_std_ratio": 0.05 |
| }, |
| { |
| "clip_ratio/high_max": 0.10261552361771464, |
| "clip_ratio/high_mean": 0.014289343578275293, |
| "clip_ratio/low_mean": 0.0031720689148642123, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017461412807460875, |
| "completion_length": 75.67250061035156, |
| "epoch": 0.0153683603880511, |
| "grad_norm": 3.359511137008667, |
| "kl": 0.3687619216740131, |
| "learning_rate": 9.995012112532654e-07, |
| "loss": -0.0037, |
| "reward": 1.2640612244606018, |
| "reward_std": 0.5189764618873596, |
| "rewards/code_format_reward": 0.9087499976158142, |
| "rewards/code_reward": 0.40484309792518614, |
| "step": 80, |
| "zero_std_ratio": 0.075 |
| }, |
| { |
| "clip_ratio/high_max": 0.053048994287382814, |
| "clip_ratio/high_mean": 0.0092369354548282, |
| "clip_ratio/low_mean": 0.00010360952001065016, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009340544970473274, |
| "completion_length": 84.56500091552735, |
| "epoch": 0.01728940543655749, |
| "grad_norm": 2.177191734313965, |
| "kl": 0.5693678379058837, |
| "learning_rate": 9.993651503417269e-07, |
| "loss": -0.008, |
| "reward": 1.1986377000808717, |
| "reward_std": 0.49277395009994507, |
| "rewards/code_format_reward": 0.9112500071525573, |
| "rewards/code_reward": 0.3715063512325287, |
| "step": 90, |
| "zero_std_ratio": 0.1 |
| }, |
| { |
| "clip_ratio/high_max": 0.04136249013245106, |
| "clip_ratio/high_mean": 0.00551328391302377, |
| "clip_ratio/low_mean": 0.001408070686738938, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006921354681253433, |
| "completion_length": 80.8550033569336, |
| "epoch": 0.019210450485063875, |
| "grad_norm": 2.0033416748046875, |
| "kl": 0.8493028253316879, |
| "learning_rate": 9.992127128491296e-07, |
| "loss": 0.0027, |
| "reward": 1.1780336141586303, |
| "reward_std": 0.4479735493659973, |
| "rewards/code_format_reward": 0.9275000095367432, |
| "rewards/code_reward": 0.3571417987346649, |
| "step": 100, |
| "zero_std_ratio": 0.125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0585523322224617, |
| "clip_ratio/high_mean": 0.008032404945697635, |
| "clip_ratio/low_mean": 0.006125411042012275, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014157815964426845, |
| "completion_length": 74.19000091552735, |
| "epoch": 0.02113149553357026, |
| "grad_norm": 2.267624855041504, |
| "kl": 1.1039492040872574, |
| "learning_rate": 9.990439043308776e-07, |
| "loss": -0.0238, |
| "reward": 1.2784739494323731, |
| "reward_std": 0.49057124853134154, |
| "rewards/code_format_reward": 0.9475000023841857, |
| "rewards/code_reward": 0.40236196517944334, |
| "step": 110, |
| "zero_std_ratio": 0.175 |
| }, |
| { |
| "clip_ratio/high_max": 0.07124514738097787, |
| "clip_ratio/high_mean": 0.01569047374650836, |
| "clip_ratio/low_mean": 0.0004420768018462695, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01613254987169057, |
| "completion_length": 68.35750045776368, |
| "epoch": 0.02305254058207665, |
| "grad_norm": 4.1564249992370605, |
| "kl": 1.4338344126939773, |
| "learning_rate": 9.988587309389975e-07, |
| "loss": -0.0026, |
| "reward": 1.1606964468955994, |
| "reward_std": 0.46601226925849915, |
| "rewards/code_format_reward": 0.9475000023841857, |
| "rewards/code_reward": 0.34347322285175325, |
| "step": 120, |
| "zero_std_ratio": 0.175 |
| }, |
| { |
| "clip_ratio/high_max": 0.07592196827754379, |
| "clip_ratio/high_mean": 0.014454811741597951, |
| "clip_ratio/low_mean": 0.0013599038298707455, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015814715722808615, |
| "completion_length": 72.15750122070312, |
| "epoch": 0.024973585630583037, |
| "grad_norm": 3.9662575721740723, |
| "kl": 1.5312897458672523, |
| "learning_rate": 9.98657199421914e-07, |
| "loss": -0.0024, |
| "reward": 1.1610160946846009, |
| "reward_std": 0.3773229032754898, |
| "rewards/code_format_reward": 0.9587499976158143, |
| "rewards/code_reward": 0.3408205330371857, |
| "step": 130, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.07943324451334774, |
| "clip_ratio/high_mean": 0.014111382194096222, |
| "clip_ratio/low_mean": 0.0036704083904623985, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017781790602020918, |
| "completion_length": 83.8675033569336, |
| "epoch": 0.026894630679089426, |
| "grad_norm": 9.37182331085205, |
| "kl": 0.5293755233287811, |
| "learning_rate": 9.984393171242054e-07, |
| "loss": -0.0045, |
| "reward": 1.3634901762008667, |
| "reward_std": 0.5678210258483887, |
| "rewards/code_format_reward": 0.9512500047683716, |
| "rewards/code_reward": 0.4439325869083405, |
| "step": 140, |
| "zero_std_ratio": 0.175 |
| }, |
| { |
| "clip_ratio/high_max": 0.13982175141572953, |
| "clip_ratio/high_mean": 0.018845621962100267, |
| "clip_ratio/low_mean": 0.0009358229042845778, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019781444873660802, |
| "completion_length": 79.41999969482421, |
| "epoch": 0.028815675727595812, |
| "grad_norm": 3.3437957763671875, |
| "kl": 1.0034890450537204, |
| "learning_rate": 9.982050919863332e-07, |
| "loss": -0.0003, |
| "reward": 1.332119607925415, |
| "reward_std": 0.4401752531528473, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.4241847813129425, |
| "step": 150, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.08667803611606359, |
| "clip_ratio/high_mean": 0.01204416286200285, |
| "clip_ratio/low_mean": 0.0012475995084969328, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013291762379230932, |
| "completion_length": 80.43250122070313, |
| "epoch": 0.0307367207761022, |
| "grad_norm": 3.763737678527832, |
| "kl": 0.9024959966540337, |
| "learning_rate": 9.979545325443564e-07, |
| "loss": -0.0043, |
| "reward": 1.3518987059593202, |
| "reward_std": 0.46767728328704833, |
| "rewards/code_format_reward": 0.9450000047683715, |
| "rewards/code_reward": 0.4396993488073349, |
| "step": 160, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.08449154160916805, |
| "clip_ratio/high_mean": 0.012011481402441859, |
| "clip_ratio/low_mean": 0.00172541297506541, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013736894307658076, |
| "completion_length": 78.61250152587891, |
| "epoch": 0.03265776582460859, |
| "grad_norm": 7.203779220581055, |
| "kl": 0.9198675453662872, |
| "learning_rate": 9.976876479296167e-07, |
| "loss": -0.0013, |
| "reward": 1.3803849458694457, |
| "reward_std": 0.4038102596998215, |
| "rewards/code_format_reward": 0.9587499976158143, |
| "rewards/code_reward": 0.4505049705505371, |
| "step": 170, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.07188423536717892, |
| "clip_ratio/high_mean": 0.013125935778953135, |
| "clip_ratio/low_mean": 0.0036661239922977985, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016792060085572304, |
| "completion_length": 77.1875, |
| "epoch": 0.03457881087311498, |
| "grad_norm": 4.487454414367676, |
| "kl": 1.7507148087024689, |
| "learning_rate": 9.974044478684084e-07, |
| "loss": 0.0129, |
| "reward": 1.3845421075820923, |
| "reward_std": 0.5211645245552063, |
| "rewards/code_format_reward": 0.9325000047683716, |
| "rewards/code_reward": 0.4591460168361664, |
| "step": 180, |
| "zero_std_ratio": 0.175 |
| }, |
| { |
| "clip_ratio/high_max": 0.03638382372446358, |
| "clip_ratio/high_mean": 0.005234482995001599, |
| "clip_ratio/low_mean": 0.0020637288223952057, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007298211794113741, |
| "completion_length": 72.73499984741211, |
| "epoch": 0.03649985592162136, |
| "grad_norm": 1.9644516706466675, |
| "kl": 1.5947209149599075, |
| "learning_rate": 9.97104942681622e-07, |
| "loss": -0.0015, |
| "reward": 1.5394827842712402, |
| "reward_std": 0.42243914008140565, |
| "rewards/code_format_reward": 0.9625, |
| "rewards/code_reward": 0.5291163563728333, |
| "step": 190, |
| "zero_std_ratio": 0.225 |
| }, |
| { |
| "clip_ratio/high_max": 0.2028519107028842, |
| "clip_ratio/high_mean": 0.031164265819825232, |
| "clip_ratio/low_mean": 0.00270410452503711, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03386837020516396, |
| "completion_length": 69.81000061035157, |
| "epoch": 0.03842090097012775, |
| "grad_norm": 3.170088052749634, |
| "kl": 1.0117133632302284, |
| "learning_rate": 9.9678914328437e-07, |
| "loss": 0.0113, |
| "reward": 1.4108091354370118, |
| "reward_std": 0.43394198417663576, |
| "rewards/code_format_reward": 0.9675000071525574, |
| "rewards/code_reward": 0.46352959871292115, |
| "step": 200, |
| "zero_std_ratio": 0.225 |
| }, |
| { |
| "clip_ratio/high_max": 0.053923821565695106, |
| "clip_ratio/high_mean": 0.009883182743215002, |
| "clip_ratio/low_mean": 0.0038779765891376883, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013761159335263073, |
| "completion_length": 69.04750213623046, |
| "epoch": 0.04034194601863414, |
| "grad_norm": 2.5729293823242188, |
| "kl": 1.1861489608883857, |
| "learning_rate": 9.964570611855874e-07, |
| "loss": -0.007, |
| "reward": 1.4398113250732423, |
| "reward_std": 0.39351261258125303, |
| "rewards/code_format_reward": 0.9650000095367431, |
| "rewards/code_reward": 0.47865564227104185, |
| "step": 210, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.1575187448877841, |
| "clip_ratio/high_mean": 0.020484526228392495, |
| "clip_ratio/low_mean": 0.011988240911159664, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03247276756446808, |
| "completion_length": 61.67000122070313, |
| "epoch": 0.04226299106714052, |
| "grad_norm": 9.919574737548828, |
| "kl": 3.983895111083984, |
| "learning_rate": 9.961087084876135e-07, |
| "loss": 0.0076, |
| "reward": 1.2202381372451783, |
| "reward_std": 0.26475468575954436, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.36793155074119566, |
| "step": 220, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.141914052516222, |
| "clip_ratio/high_mean": 0.023408634401857854, |
| "clip_ratio/low_mean": 0.004240041392040439, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.027648675863747484, |
| "completion_length": 67.37250213623047, |
| "epoch": 0.04418403611564691, |
| "grad_norm": 106.46134185791016, |
| "kl": 2.121386268734932, |
| "learning_rate": 9.957440978857498e-07, |
| "loss": -0.0021, |
| "reward": 1.3681801557540894, |
| "reward_std": 0.37111111879348757, |
| "rewards/code_format_reward": 0.9675000071525574, |
| "rewards/code_reward": 0.4422150731086731, |
| "step": 230, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.07315623210743069, |
| "clip_ratio/high_mean": 0.01155225959373638, |
| "clip_ratio/low_mean": 0.005734288269013632, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01728654802427627, |
| "completion_length": 72.65750198364258, |
| "epoch": 0.0461050811641533, |
| "grad_norm": 3.1017534732818604, |
| "kl": 0.882834991812706, |
| "learning_rate": 9.953632426677983e-07, |
| "loss": -0.0093, |
| "reward": 1.484795618057251, |
| "reward_std": 0.4526777356863022, |
| "rewards/code_format_reward": 0.9662500023841858, |
| "rewards/code_reward": 0.5008352994918823, |
| "step": 240, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.06419091664720326, |
| "clip_ratio/high_mean": 0.008793376400717534, |
| "clip_ratio/low_mean": 0.0021902987034991385, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010983675080933609, |
| "completion_length": 88.21500091552734, |
| "epoch": 0.048026126212659684, |
| "grad_norm": 5.3243279457092285, |
| "kl": 2.7226425796747207, |
| "learning_rate": 9.94966156713577e-07, |
| "loss": -0.0127, |
| "reward": 1.455380654335022, |
| "reward_std": 0.4675000965595245, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.4842528164386749, |
| "step": 250, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0741606397787109, |
| "clip_ratio/high_mean": 0.012071207936969586, |
| "clip_ratio/low_mean": 0.003062122967094183, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015133331064134836, |
| "completion_length": 88.98250122070313, |
| "epoch": 0.04994717126116607, |
| "grad_norm": 2.7804369926452637, |
| "kl": 0.6169008180499077, |
| "learning_rate": 9.94552854494413e-07, |
| "loss": 0.0033, |
| "reward": 1.427869987487793, |
| "reward_std": 0.4851543098688126, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.4717474699020386, |
| "step": 260, |
| "zero_std_ratio": 0.1 |
| }, |
| { |
| "clip_ratio/high_max": 0.03937563952058554, |
| "clip_ratio/high_mean": 0.0065028761862777175, |
| "clip_ratio/low_mean": 0.004409579199273139, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01091245551360771, |
| "completion_length": 87.42750091552735, |
| "epoch": 0.05186821630967246, |
| "grad_norm": 6.559643745422363, |
| "kl": 0.4433484449982643, |
| "learning_rate": 9.941233510726168e-07, |
| "loss": -0.0018, |
| "reward": 1.4182387351989747, |
| "reward_std": 0.4612067699432373, |
| "rewards/code_format_reward": 0.9412499904632569, |
| "rewards/code_reward": 0.4738068819046021, |
| "step": 270, |
| "zero_std_ratio": 0.175 |
| }, |
| { |
| "clip_ratio/high_max": 0.057588514033705, |
| "clip_ratio/high_mean": 0.008462971181143076, |
| "clip_ratio/low_mean": 0.007865038787713274, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016328010114375503, |
| "completion_length": 79.69500122070312, |
| "epoch": 0.05378926135817885, |
| "grad_norm": 6.077131271362305, |
| "kl": 0.6961165189743042, |
| "learning_rate": 9.936776621009322e-07, |
| "loss": 0.0038, |
| "reward": 1.5715951919555664, |
| "reward_std": 0.4179812580347061, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5420475661754608, |
| "step": 280, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.025062982086092235, |
| "clip_ratio/high_mean": 0.004761367203900591, |
| "clip_ratio/low_mean": 0.0028623046877328307, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007623671973124147, |
| "completion_length": 83.21750183105469, |
| "epoch": 0.055710306406685235, |
| "grad_norm": 6.066061019897461, |
| "kl": 0.7484225794672966, |
| "learning_rate": 9.932158038219662e-07, |
| "loss": -0.0052, |
| "reward": 1.1587857127189636, |
| "reward_std": 0.39943512678146365, |
| "rewards/code_format_reward": 0.9637500047683716, |
| "rewards/code_reward": 0.3384553253650665, |
| "step": 290, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.10117955654859542, |
| "clip_ratio/high_mean": 0.013649052195250987, |
| "clip_ratio/low_mean": 0.0008885912131518126, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014537643361836671, |
| "completion_length": 84.00750122070312, |
| "epoch": 0.057631351455191625, |
| "grad_norm": 3.23887038230896, |
| "kl": 0.8064253896474838, |
| "learning_rate": 9.92737793067597e-07, |
| "loss": -0.0034, |
| "reward": 1.3393104553222657, |
| "reward_std": 0.4101540923118591, |
| "rewards/code_format_reward": 0.9549999952316284, |
| "rewards/code_reward": 0.43090522289276123, |
| "step": 300, |
| "zero_std_ratio": 0.15 |
| }, |
| { |
| "clip_ratio/high_max": 0.04703736044466496, |
| "clip_ratio/high_mean": 0.007716302154585719, |
| "clip_ratio/low_mean": 0.0006432932626921683, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008359595513320528, |
| "completion_length": 77.70500030517579, |
| "epoch": 0.059552396503698014, |
| "grad_norm": 3.357680320739746, |
| "kl": 0.6727996915578842, |
| "learning_rate": 9.922436472583614e-07, |
| "loss": 0.0013, |
| "reward": 1.6670202493667603, |
| "reward_std": 0.4320096135139465, |
| "rewards/code_format_reward": 0.9712500095367431, |
| "rewards/code_reward": 0.5906976163387299, |
| "step": 310, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.16380154211074113, |
| "clip_ratio/high_mean": 0.03262772373855114, |
| "clip_ratio/low_mean": 0.0011754593724617735, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03380318162962794, |
| "completion_length": 72.78750152587891, |
| "epoch": 0.0614734415522044, |
| "grad_norm": 3.652451992034912, |
| "kl": 1.8953835844993592, |
| "learning_rate": 9.91733384402818e-07, |
| "loss": -0.005, |
| "reward": 1.4837595462799071, |
| "reward_std": 0.45500350296497344, |
| "rewards/code_format_reward": 0.9662500023841858, |
| "rewards/code_reward": 0.5003172576427459, |
| "step": 320, |
| "zero_std_ratio": 0.225 |
| }, |
| { |
| "clip_ratio/high_max": 0.034284231485798955, |
| "clip_ratio/high_mean": 0.005935872689587995, |
| "clip_ratio/low_mean": 0.000911827472737059, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0068477002554573115, |
| "completion_length": 74.17000274658203, |
| "epoch": 0.06339448660071079, |
| "grad_norm": 1.5065704584121704, |
| "kl": 0.40838020071387293, |
| "learning_rate": 9.912070230968928e-07, |
| "loss": -0.0054, |
| "reward": 1.3848075151443482, |
| "reward_std": 0.3038723856210709, |
| "rewards/code_format_reward": 0.9612499952316285, |
| "rewards/code_reward": 0.45209125280380247, |
| "step": 330, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.05724322898313403, |
| "clip_ratio/high_mean": 0.009350239217747002, |
| "clip_ratio/low_mean": 0.0077414238592609765, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0170916625414975, |
| "completion_length": 80.06500091552735, |
| "epoch": 0.06531553164921718, |
| "grad_norm": 3.77842116355896, |
| "kl": 0.8782595857977867, |
| "learning_rate": 9.906645825232008e-07, |
| "loss": -0.0023, |
| "reward": 1.294193172454834, |
| "reward_std": 0.3676457226276398, |
| "rewards/code_format_reward": 0.9549999952316284, |
| "rewards/code_reward": 0.4083465874195099, |
| "step": 340, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.10199148450046777, |
| "clip_ratio/high_mean": 0.018657304299995302, |
| "clip_ratio/low_mean": 0.004165191331412643, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.022822496155276893, |
| "completion_length": 86.5000015258789, |
| "epoch": 0.06723657669772357, |
| "grad_norm": 3.2845616340637207, |
| "kl": 0.9463568836450577, |
| "learning_rate": 9.901060824503463e-07, |
| "loss": -0.0115, |
| "reward": 1.485135293006897, |
| "reward_std": 0.48840407729148866, |
| "rewards/code_format_reward": 0.9487499833106995, |
| "rewards/code_reward": 0.5053801357746124, |
| "step": 350, |
| "zero_std_ratio": 0.225 |
| }, |
| { |
| "clip_ratio/high_max": 0.07233364712446928, |
| "clip_ratio/high_mean": 0.009769158461131156, |
| "clip_ratio/low_mean": 0.019356250233249737, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029125408595427872, |
| "completion_length": 80.54000091552734, |
| "epoch": 0.06915762174622996, |
| "grad_norm": 19.32016944885254, |
| "kl": 1.1565445899963378, |
| "learning_rate": 9.89531543232204e-07, |
| "loss": 0.0045, |
| "reward": 1.3412477493286132, |
| "reward_std": 0.49785757064819336, |
| "rewards/code_format_reward": 0.9599999904632568, |
| "rewards/code_reward": 0.43062385320663454, |
| "step": 360, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.11471173651516438, |
| "clip_ratio/high_mean": 0.02246011425741017, |
| "clip_ratio/low_mean": 0.00892345790634863, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.031383572798222306, |
| "completion_length": 74.02000274658204, |
| "epoch": 0.07107866679473633, |
| "grad_norm": 2.2520923614501953, |
| "kl": 1.074078917503357, |
| "learning_rate": 9.889409858071753e-07, |
| "loss": -0.0059, |
| "reward": 1.5273491621017456, |
| "reward_std": 0.414175683259964, |
| "rewards/code_format_reward": 0.9775000095367432, |
| "rewards/code_reward": 0.519299578666687, |
| "step": 370, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.06336253914050757, |
| "clip_ratio/high_mean": 0.01199121386744082, |
| "clip_ratio/low_mean": 0.009130357182584703, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.021121570840477943, |
| "completion_length": 86.4000015258789, |
| "epoch": 0.07299971184324272, |
| "grad_norm": 4.1052961349487305, |
| "kl": 1.3110491752624511, |
| "learning_rate": 9.883344316974266e-07, |
| "loss": -0.0079, |
| "reward": 1.5908024072647096, |
| "reward_std": 0.47413656711578367, |
| "rewards/code_format_reward": 0.9600000023841858, |
| "rewards/code_reward": 0.555401211977005, |
| "step": 380, |
| "zero_std_ratio": 0.2 |
| }, |
| { |
| "clip_ratio/high_max": 0.04433182019274682, |
| "clip_ratio/high_mean": 0.008989717412623577, |
| "clip_ratio/low_mean": 0.006074265367351473, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015063982826541178, |
| "completion_length": 86.175, |
| "epoch": 0.07492075689174911, |
| "grad_norm": 4.5202155113220215, |
| "kl": 0.830048742890358, |
| "learning_rate": 9.877119030081048e-07, |
| "loss": -0.0051, |
| "reward": 1.492829155921936, |
| "reward_std": 0.3874175697565079, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.5007895469665528, |
| "step": 390, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.1522485612425953, |
| "clip_ratio/high_mean": 0.0220908185117878, |
| "clip_ratio/low_mean": 0.012701757764443756, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03479257607832551, |
| "completion_length": 78.91000137329101, |
| "epoch": 0.0768418019402555, |
| "grad_norm": 2.6146676540374756, |
| "kl": 0.8627120085060597, |
| "learning_rate": 9.870734224265308e-07, |
| "loss": -0.0059, |
| "reward": 1.5748756647109985, |
| "reward_std": 0.3048340857028961, |
| "rewards/code_format_reward": 0.987500011920929, |
| "rewards/code_reward": 0.5405627965927124, |
| "step": 400, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.16312104668468236, |
| "clip_ratio/high_mean": 0.025311203207820654, |
| "clip_ratio/low_mean": 0.008227485651150345, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03353868862614036, |
| "completion_length": 77.27750091552734, |
| "epoch": 0.07876284698876189, |
| "grad_norm": 1.7234841585159302, |
| "kl": 0.8750749856233597, |
| "learning_rate": 9.864190132213742e-07, |
| "loss": -0.0062, |
| "reward": 1.6338460445404053, |
| "reward_std": 0.3537067860364914, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.570673018693924, |
| "step": 410, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.0936438184697181, |
| "clip_ratio/high_mean": 0.014423616812564433, |
| "clip_ratio/low_mean": 0.010347768076462672, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024771385360509157, |
| "completion_length": 75.69749908447265, |
| "epoch": 0.08068389203726828, |
| "grad_norm": 2.0902154445648193, |
| "kl": 1.264050543308258, |
| "learning_rate": 9.857486992418036e-07, |
| "loss": 0.0048, |
| "reward": 1.644848608970642, |
| "reward_std": 0.277804034948349, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5774242997169494, |
| "step": 420, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.05532362968660891, |
| "clip_ratio/high_mean": 0.00992250678827986, |
| "clip_ratio/low_mean": 0.004125738283619285, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014048245223239064, |
| "completion_length": 69.60749969482421, |
| "epoch": 0.08260493708577465, |
| "grad_norm": 3.702075481414795, |
| "kl": 1.7400359451770782, |
| "learning_rate": 9.850625049166189e-07, |
| "loss": -0.0008, |
| "reward": 1.5316168069839478, |
| "reward_std": 0.275749945640564, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.5223708748817444, |
| "step": 430, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.15841160174459218, |
| "clip_ratio/high_mean": 0.02351265251636505, |
| "clip_ratio/low_mean": 0.010281538363778963, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.033794190967455506, |
| "completion_length": 74.51000061035157, |
| "epoch": 0.08452598213428104, |
| "grad_norm": 3.4808361530303955, |
| "kl": 1.2856003642082214, |
| "learning_rate": 9.8436045525336e-07, |
| "loss": -0.0035, |
| "reward": 1.5067368984222411, |
| "reward_std": 0.28293364942073823, |
| "rewards/code_format_reward": 0.9737499833106995, |
| "rewards/code_reward": 0.5099309325218201, |
| "step": 440, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.06043836465105414, |
| "clip_ratio/high_mean": 0.009103650611359626, |
| "clip_ratio/low_mean": 0.002932069695089012, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012035720515996218, |
| "completion_length": 76.08250122070312, |
| "epoch": 0.08644702718278743, |
| "grad_norm": 3.665134906768799, |
| "kl": 1.0338351279497147, |
| "learning_rate": 9.836425758373958e-07, |
| "loss": 0.0011, |
| "reward": 1.4822889804840087, |
| "reward_std": 0.18996141627430915, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.49926944375038146, |
| "step": 450, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.21641009524464608, |
| "clip_ratio/high_mean": 0.03260216782800853, |
| "clip_ratio/low_mean": 0.007402116784942336, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.040004284400492904, |
| "completion_length": 73.13500213623047, |
| "epoch": 0.08836807223129382, |
| "grad_norm": 3.1982343196868896, |
| "kl": 0.6477661892771721, |
| "learning_rate": 9.829088928309923e-07, |
| "loss": -0.0043, |
| "reward": 1.7202057361602783, |
| "reward_std": 0.25773381292819975, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.6163528442382813, |
| "step": 460, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.09453173456713557, |
| "clip_ratio/high_mean": 0.015337946941144764, |
| "clip_ratio/low_mean": 0.005975433619460091, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02131338034523651, |
| "completion_length": 81.9000015258789, |
| "epoch": 0.09028911727980021, |
| "grad_norm": 1.441091775894165, |
| "kl": 0.6155861958861351, |
| "learning_rate": 9.82159432972358e-07, |
| "loss": -0.0063, |
| "reward": 1.4617766380310058, |
| "reward_std": 0.24772228300571442, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.48651331663131714, |
| "step": 470, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.16705528497695923, |
| "clip_ratio/high_mean": 0.026639112271368504, |
| "clip_ratio/low_mean": 0.0035399875399889425, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03017909936606884, |
| "completion_length": 77.79500274658203, |
| "epoch": 0.0922101623283066, |
| "grad_norm": 47.74139404296875, |
| "kl": 1.360982394218445, |
| "learning_rate": 9.813942235746705e-07, |
| "loss": 0.0034, |
| "reward": 1.5168325901031494, |
| "reward_std": 0.3997103154659271, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5149787843227387, |
| "step": 480, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.26956315375864504, |
| "clip_ratio/high_mean": 0.04211876043118536, |
| "clip_ratio/low_mean": 0.002336682367604226, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04445544336922467, |
| "completion_length": 86.21500091552734, |
| "epoch": 0.09413120737681299, |
| "grad_norm": 3.7244272232055664, |
| "kl": 2.59437358379364, |
| "learning_rate": 9.80613292525081e-07, |
| "loss": 0.0038, |
| "reward": 1.6131777048110962, |
| "reward_std": 0.32231712639331817, |
| "rewards/code_format_reward": 0.9799999833106995, |
| "rewards/code_reward": 0.5615888297557831, |
| "step": 490, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.22240130547434092, |
| "clip_ratio/high_mean": 0.044074146053753795, |
| "clip_ratio/low_mean": 0.012573283386882395, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0566474299877882, |
| "completion_length": 72.23500061035156, |
| "epoch": 0.09605225242531937, |
| "grad_norm": 2.852999687194824, |
| "kl": 1.615745335817337, |
| "learning_rate": 9.79816668283697e-07, |
| "loss": 0.0017, |
| "reward": 1.5203128576278686, |
| "reward_std": 0.3012717217206955, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.517031443119049, |
| "step": 500, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.15330625362694264, |
| "clip_ratio/high_mean": 0.02403738833963871, |
| "clip_ratio/low_mean": 0.004583830677438528, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028621218353509902, |
| "completion_length": 74.30000076293945, |
| "epoch": 0.09797329747382576, |
| "grad_norm": 2.484840154647827, |
| "kl": 2.1540999174118043, |
| "learning_rate": 9.790043798825458e-07, |
| "loss": 0.0073, |
| "reward": 1.5013367414474488, |
| "reward_std": 0.24206546545028687, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.508168363571167, |
| "step": 510, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.15383050357922912, |
| "clip_ratio/high_mean": 0.027125787048134953, |
| "clip_ratio/low_mean": 0.002593657124089077, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029719442850910126, |
| "completion_length": 65.0400016784668, |
| "epoch": 0.09989434252233215, |
| "grad_norm": 7.2150959968566895, |
| "kl": 1.1968895211815833, |
| "learning_rate": 9.781764569245178e-07, |
| "loss": -0.006, |
| "reward": 1.510750651359558, |
| "reward_std": 0.41533524394035337, |
| "rewards/code_format_reward": 0.9712500095367431, |
| "rewards/code_reward": 0.5125628054141999, |
| "step": 520, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.109321213606745, |
| "clip_ratio/high_mean": 0.018354640086181463, |
| "clip_ratio/low_mean": 0.011131488461978733, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029486127989366652, |
| "completion_length": 74.52250213623047, |
| "epoch": 0.10181538757083854, |
| "grad_norm": 1.8456060886383057, |
| "kl": 0.7155197218060494, |
| "learning_rate": 9.773329295822844e-07, |
| "loss": 0.0073, |
| "reward": 1.5899319171905517, |
| "reward_std": 0.3179755389690399, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5512159705162049, |
| "step": 530, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.04905872759409249, |
| "clip_ratio/high_mean": 0.008021075790748, |
| "clip_ratio/low_mean": 0.004390958754811436, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012412034533917904, |
| "completion_length": 67.07500076293945, |
| "epoch": 0.10373643261934493, |
| "grad_norm": 4.641266345977783, |
| "kl": 0.7290919035673141, |
| "learning_rate": 9.764738285972015e-07, |
| "loss": 0.0008, |
| "reward": 1.300760817527771, |
| "reward_std": 0.3361863404512405, |
| "rewards/code_format_reward": 0.9537500143051147, |
| "rewards/code_reward": 0.4119428813457489, |
| "step": 540, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.1825170351192355, |
| "clip_ratio/high_mean": 0.027253909036517143, |
| "clip_ratio/low_mean": 0.0015975978298229166, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028851506439968942, |
| "completion_length": 73.99250030517578, |
| "epoch": 0.10565747766785132, |
| "grad_norm": 1.1770566701889038, |
| "kl": 1.328820213675499, |
| "learning_rate": 9.755991852781876e-07, |
| "loss": -0.0023, |
| "reward": 1.5671115159988402, |
| "reward_std": 0.34309983551502227, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5401182293891906, |
| "step": 550, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.12550847120583059, |
| "clip_ratio/high_mean": 0.025771993771195413, |
| "clip_ratio/low_mean": 0.0035689805867150427, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029340974800288678, |
| "completion_length": 71.76750030517579, |
| "epoch": 0.1075785227163577, |
| "grad_norm": 0.3435879647731781, |
| "kl": 2.12383970618248, |
| "learning_rate": 9.747090315005836e-07, |
| "loss": 0.0024, |
| "reward": 1.5273173809051515, |
| "reward_std": 0.2889336168766022, |
| "rewards/code_format_reward": 0.9649999976158142, |
| "rewards/code_reward": 0.5224087119102478, |
| "step": 560, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.0834595168940723, |
| "clip_ratio/high_mean": 0.015262311231344939, |
| "clip_ratio/low_mean": 0.021645180485211312, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03690749178640544, |
| "completion_length": 79.53250122070312, |
| "epoch": 0.10949956776486408, |
| "grad_norm": 1.7026695013046265, |
| "kl": 1.6705755025148392, |
| "learning_rate": 9.738033997049902e-07, |
| "loss": 0.1708, |
| "reward": 1.5908133745193482, |
| "reward_std": 0.3691225051879883, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.5475941836833954, |
| "step": 570, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.18124623028561473, |
| "clip_ratio/high_mean": 0.02496154889231548, |
| "clip_ratio/low_mean": 0.020611650816863402, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04557319916784763, |
| "completion_length": 85.51750183105469, |
| "epoch": 0.11142061281337047, |
| "grad_norm": 18.138025283813477, |
| "kl": 4.237766814231873, |
| "learning_rate": 9.728823228960862e-07, |
| "loss": -0.0051, |
| "reward": 1.5469601631164551, |
| "reward_std": 0.37420718297362326, |
| "rewards/code_format_reward": 0.975000011920929, |
| "rewards/code_reward": 0.5297300696372986, |
| "step": 580, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.014268473512493074, |
| "clip_ratio/high_mean": 0.0028658110386459157, |
| "clip_ratio/low_mean": 0.0056301898322999476, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008496000757440924, |
| "completion_length": 80.10750274658203, |
| "epoch": 0.11334165786187686, |
| "grad_norm": 5.16138219833374, |
| "kl": 0.6609396353363991, |
| "learning_rate": 9.71945834641426e-07, |
| "loss": -0.004, |
| "reward": 1.4476024627685546, |
| "reward_std": 0.3472218900918961, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.4813012361526489, |
| "step": 590, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.17693078136071563, |
| "clip_ratio/high_mean": 0.02441923434380442, |
| "clip_ratio/low_mean": 0.012987980741309002, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.037407214660197495, |
| "completion_length": 83.96500091552734, |
| "epoch": 0.11526270291038325, |
| "grad_norm": 1.7465465068817139, |
| "kl": 1.0383819937705994, |
| "learning_rate": 9.709939690702158e-07, |
| "loss": -0.0078, |
| "reward": 1.4550770282745362, |
| "reward_std": 0.3056318134069443, |
| "rewards/code_format_reward": 0.9587500095367432, |
| "rewards/code_reward": 0.48785099387168884, |
| "step": 600, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.1888352295383811, |
| "clip_ratio/high_mean": 0.026437551854178308, |
| "clip_ratio/low_mean": 0.0054486555512994524, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.031886206939816475, |
| "completion_length": 79.63500213623047, |
| "epoch": 0.11718374795888964, |
| "grad_norm": 5.674210548400879, |
| "kl": 1.2073093384504319, |
| "learning_rate": 9.700267608720692e-07, |
| "loss": -0.0021, |
| "reward": 1.4424492359161376, |
| "reward_std": 0.3397494524717331, |
| "rewards/code_format_reward": 0.9725000143051148, |
| "rewards/code_reward": 0.4780996203422546, |
| "step": 610, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.09671425293199717, |
| "clip_ratio/high_mean": 0.020163473271531986, |
| "clip_ratio/low_mean": 0.006395513273309917, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02655898590455763, |
| "completion_length": 75.22750091552734, |
| "epoch": 0.11910479300739603, |
| "grad_norm": 5.531320571899414, |
| "kl": 2.2407817423343657, |
| "learning_rate": 9.690442452957448e-07, |
| "loss": -0.0021, |
| "reward": 1.5595922470092773, |
| "reward_std": 0.28165863305330274, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.5351086378097534, |
| "step": 620, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.11810005996376276, |
| "clip_ratio/high_mean": 0.02500568316318095, |
| "clip_ratio/low_mean": 0.00357620443101041, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028581888042390348, |
| "completion_length": 80.09000091552734, |
| "epoch": 0.1210258380559024, |
| "grad_norm": 2.165558338165283, |
| "kl": 1.546025463938713, |
| "learning_rate": 9.680464581478594e-07, |
| "loss": -0.0037, |
| "reward": 1.51439368724823, |
| "reward_std": 0.3320598304271698, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.5140718221664429, |
| "step": 630, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.10313799739815295, |
| "clip_ratio/high_mean": 0.017414161982014776, |
| "clip_ratio/low_mean": 0.009596780824358575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.027010941854678096, |
| "completion_length": 76.15749969482422, |
| "epoch": 0.1229468831044088, |
| "grad_norm": 5.05511999130249, |
| "kl": 1.6615911841392517, |
| "learning_rate": 9.670334357915852e-07, |
| "loss": 0.0033, |
| "reward": 1.5930729150772094, |
| "reward_std": 0.3864523351192474, |
| "rewards/code_format_reward": 0.9662500023841858, |
| "rewards/code_reward": 0.554973942041397, |
| "step": 640, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.1653188370168209, |
| "clip_ratio/high_mean": 0.027094300370663404, |
| "clip_ratio/low_mean": 0.0033949258620850744, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03048922661691904, |
| "completion_length": 74.23250274658203, |
| "epoch": 0.12486792815291518, |
| "grad_norm": 1.1590094566345215, |
| "kl": 0.39487394616007804, |
| "learning_rate": 9.660052151453228e-07, |
| "loss": -0.006, |
| "reward": 1.7198987245559691, |
| "reward_std": 0.3215783953666687, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.613699346780777, |
| "step": 650, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.2655821519903839, |
| "clip_ratio/high_mean": 0.03813204998150468, |
| "clip_ratio/low_mean": 0.017123100493336096, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.05525515023618936, |
| "completion_length": 79.31999969482422, |
| "epoch": 0.12678897320142157, |
| "grad_norm": 2.8189809322357178, |
| "kl": 0.9924295842647552, |
| "learning_rate": 9.649618336813565e-07, |
| "loss": -0.0022, |
| "reward": 1.710445189476013, |
| "reward_std": 0.2906018912792206, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.6111600875854493, |
| "step": 660, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.10447313897311687, |
| "clip_ratio/high_mean": 0.017084641277324408, |
| "clip_ratio/low_mean": 0.018559307692339645, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03564394909190014, |
| "completion_length": 73.31750183105468, |
| "epoch": 0.12871001824992795, |
| "grad_norm": 7.561813831329346, |
| "kl": 1.0190230280160903, |
| "learning_rate": 9.639033294244894e-07, |
| "loss": -0.0059, |
| "reward": 1.4508479833602905, |
| "reward_std": 0.2639226779341698, |
| "rewards/code_format_reward": 0.9724999904632569, |
| "rewards/code_reward": 0.4822989523410797, |
| "step": 670, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.17416613902896644, |
| "clip_ratio/high_mean": 0.02931727101095021, |
| "clip_ratio/low_mean": 0.013709834642941131, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04302710462361574, |
| "completion_length": 75.30500183105468, |
| "epoch": 0.13063106329843435, |
| "grad_norm": 4.0138373374938965, |
| "kl": 1.8731355726718903, |
| "learning_rate": 9.628297409506558e-07, |
| "loss": 0.0038, |
| "reward": 1.5990655183792115, |
| "reward_std": 0.38845544308423996, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.5554702281951904, |
| "step": 680, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.14133978222962468, |
| "clip_ratio/high_mean": 0.025468734742025843, |
| "clip_ratio/low_mean": 0.0034107466402929277, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028879481457988732, |
| "completion_length": 71.69250183105468, |
| "epoch": 0.13255210834694073, |
| "grad_norm": 2.7108314037323, |
| "kl": 1.0770379617810248, |
| "learning_rate": 9.61741107385517e-07, |
| "loss": 0.0015, |
| "reward": 1.357295000553131, |
| "reward_std": 0.16353759765625, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.43333501517772677, |
| "step": 690, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.2215075224637985, |
| "clip_ratio/high_mean": 0.03973329542204738, |
| "clip_ratio/low_mean": 0.021483630378497764, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.06121692657470703, |
| "completion_length": 77.00250244140625, |
| "epoch": 0.13447315339544713, |
| "grad_norm": 3.874828338623047, |
| "kl": 1.798163938522339, |
| "learning_rate": 9.606374684030354e-07, |
| "loss": -0.0002, |
| "reward": 1.4897700071334838, |
| "reward_std": 0.3036611869931221, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.5023849844932556, |
| "step": 700, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.26057110670953987, |
| "clip_ratio/high_mean": 0.04422192363999784, |
| "clip_ratio/low_mean": 0.012507367390207946, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.05672929054126143, |
| "completion_length": 68.01749954223632, |
| "epoch": 0.1363941984439535, |
| "grad_norm": 1.9008493423461914, |
| "kl": 1.1601522982120513, |
| "learning_rate": 9.595188642240268e-07, |
| "loss": -0.006, |
| "reward": 1.5408167839050293, |
| "reward_std": 0.23992418646812438, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5244708836078644, |
| "step": 710, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.11190514008048921, |
| "clip_ratio/high_mean": 0.022988432584679686, |
| "clip_ratio/low_mean": 0.003842631517909467, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02683106428885367, |
| "completion_length": 70.91749954223633, |
| "epoch": 0.1383152434924599, |
| "grad_norm": 2.230220317840576, |
| "kl": 0.6176944851875306, |
| "learning_rate": 9.58385335614697e-07, |
| "loss": -0.0038, |
| "reward": 1.474353313446045, |
| "reward_std": 0.22789922058582307, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.49092662930488584, |
| "step": 720, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.22790296860039233, |
| "clip_ratio/high_mean": 0.043722260277718306, |
| "clip_ratio/low_mean": 0.005503303511068225, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0492255637422204, |
| "completion_length": 70.33000183105469, |
| "epoch": 0.1402362885409663, |
| "grad_norm": 3.880234956741333, |
| "kl": 1.7978762328624724, |
| "learning_rate": 9.572369238851546e-07, |
| "loss": -0.01, |
| "reward": 1.7555195808410644, |
| "reward_std": 0.30654080510139464, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6311972856521606, |
| "step": 730, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.13005290240980685, |
| "clip_ratio/high_mean": 0.02253831790876575, |
| "clip_ratio/low_mean": 0.0076317260100040585, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.030170044326223434, |
| "completion_length": 67.4625015258789, |
| "epoch": 0.14215733358947266, |
| "grad_norm": 31014.41015625, |
| "kl": 2.5802926242351534, |
| "learning_rate": 9.560736708879055e-07, |
| "loss": 4.1316, |
| "reward": 1.391554856300354, |
| "reward_std": 0.3107602626085281, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.4501524269580841, |
| "step": 740, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.21672796942293643, |
| "clip_ratio/high_mean": 0.03920850001741201, |
| "clip_ratio/low_mean": 0.0084746521897614, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.047683153115212915, |
| "completion_length": 71.03750076293946, |
| "epoch": 0.14407837863797907, |
| "grad_norm": 1.3094109296798706, |
| "kl": 4.56303431391716, |
| "learning_rate": 9.54895619016329e-07, |
| "loss": 0.0111, |
| "reward": 1.5939582109451294, |
| "reward_std": 0.2379148319363594, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.5547916054725647, |
| "step": 750, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.08126737037673593, |
| "clip_ratio/high_mean": 0.01269659586250782, |
| "clip_ratio/low_mean": 0.006480468995869159, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019177064718678593, |
| "completion_length": 74.09750213623047, |
| "epoch": 0.14599942368648544, |
| "grad_norm": 3.0267083644866943, |
| "kl": 1.5844107165932655, |
| "learning_rate": 9.53702811203131e-07, |
| "loss": 0.0048, |
| "reward": 1.4744285106658936, |
| "reward_std": 0.2754403457045555, |
| "rewards/code_format_reward": 0.9900000095367432, |
| "rewards/code_reward": 0.489714241027832, |
| "step": 760, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.22151243952102959, |
| "clip_ratio/high_mean": 0.038386100489879026, |
| "clip_ratio/low_mean": 0.001766498590586707, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04015259912703186, |
| "completion_length": 73.72750244140624, |
| "epoch": 0.14792046873499184, |
| "grad_norm": 3596482.75, |
| "kl": 0.6901701986789703, |
| "learning_rate": 9.524952909187801e-07, |
| "loss": 83.9443, |
| "reward": 1.4019340753555298, |
| "reward_std": 0.24908357337117196, |
| "rewards/code_format_reward": 0.9749999880790711, |
| "rewards/code_reward": 0.45721703171730044, |
| "step": 770, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.07684345319867134, |
| "clip_ratio/high_mean": 0.014277776470407844, |
| "clip_ratio/low_mean": 0.016169815976172685, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0304475924000144, |
| "completion_length": 79.24250183105468, |
| "epoch": 0.14984151378349822, |
| "grad_norm": 3.468223810195923, |
| "kl": 0.45489892959594724, |
| "learning_rate": 9.512731021699245e-07, |
| "loss": -0.0056, |
| "reward": 1.580666732788086, |
| "reward_std": 0.41472728848457335, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5459583520889282, |
| "step": 780, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.10067678079940379, |
| "clip_ratio/high_mean": 0.013439147116150707, |
| "clip_ratio/low_mean": 0.023053765966324136, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03649291144683957, |
| "completion_length": 72.04750137329101, |
| "epoch": 0.15176255883200462, |
| "grad_norm": 13.193933486938477, |
| "kl": 1.6161374658346177, |
| "learning_rate": 9.500362894977864e-07, |
| "loss": 0.0007, |
| "reward": 1.6252036333084106, |
| "reward_std": 0.3433967262506485, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5666643261909485, |
| "step": 790, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.11107501722872257, |
| "clip_ratio/high_mean": 0.01587685807608068, |
| "clip_ratio/low_mean": 0.001843169682251755, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017720027733594178, |
| "completion_length": 77.34250183105469, |
| "epoch": 0.153683603880511, |
| "grad_norm": 3.4086289405822754, |
| "kl": 0.735039034485817, |
| "learning_rate": 9.487848979765399e-07, |
| "loss": -0.0033, |
| "reward": 1.7214166164398192, |
| "reward_std": 0.3059865742921829, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6125832796096802, |
| "step": 800, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.06381021924316883, |
| "clip_ratio/high_mean": 0.012221441417932511, |
| "clip_ratio/low_mean": 0.002595777277019806, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014817218482494354, |
| "completion_length": 78.94500045776367, |
| "epoch": 0.15560464892901738, |
| "grad_norm": 2.894174098968506, |
| "kl": 0.9337424471974373, |
| "learning_rate": 9.475189732116677e-07, |
| "loss": -0.0074, |
| "reward": 1.5309076070785523, |
| "reward_std": 0.36832110285758973, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.5201413094997406, |
| "step": 810, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0614451477304101, |
| "clip_ratio/high_mean": 0.011137601570226252, |
| "clip_ratio/low_mean": 0.015545779425883666, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02668338119983673, |
| "completion_length": 80.46750030517578, |
| "epoch": 0.15752569397752378, |
| "grad_norm": 1.5945316553115845, |
| "kl": 1.666656306385994, |
| "learning_rate": 9.462385613382997e-07, |
| "loss": -0.0138, |
| "reward": 1.4196115970611571, |
| "reward_std": 0.3273743912577629, |
| "rewards/code_format_reward": 0.9625, |
| "rewards/code_reward": 0.4691807866096497, |
| "step": 820, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0729204102884978, |
| "clip_ratio/high_mean": 0.011435226618777961, |
| "clip_ratio/low_mean": 0.0035716916667297483, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015006918273866177, |
| "completion_length": 83.92250061035156, |
| "epoch": 0.15944673902603015, |
| "grad_norm": 3.7898244857788086, |
| "kl": 3.157607713341713, |
| "learning_rate": 9.449437090195312e-07, |
| "loss": 0.6488, |
| "reward": 1.5506922006607056, |
| "reward_std": 0.3165741294622421, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.532533586025238, |
| "step": 830, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.22014709915965797, |
| "clip_ratio/high_mean": 0.030960237560793757, |
| "clip_ratio/low_mean": 0.008386016800068318, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03934625396504998, |
| "completion_length": 79.59750213623047, |
| "epoch": 0.16136778407453656, |
| "grad_norm": 3.164461851119995, |
| "kl": 0.48004563301801684, |
| "learning_rate": 9.436344634447226e-07, |
| "loss": 0.0002, |
| "reward": 1.4315959692001343, |
| "reward_std": 0.2676436066627502, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.4714229583740234, |
| "step": 840, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.20247683776542544, |
| "clip_ratio/high_mean": 0.040387283614836636, |
| "clip_ratio/low_mean": 0.0031327656004577877, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04352005030959845, |
| "completion_length": 81.20750274658204, |
| "epoch": 0.16328882912304293, |
| "grad_norm": 3.2722160816192627, |
| "kl": 0.8405016213655472, |
| "learning_rate": 9.42310872327779e-07, |
| "loss": -0.0002, |
| "reward": 1.550826621055603, |
| "reward_std": 0.4091781198978424, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.5322882652282714, |
| "step": 850, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.061589781753718854, |
| "clip_ratio/high_mean": 0.011824411456473172, |
| "clip_ratio/low_mean": 0.011703617853345349, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023528029827866705, |
| "completion_length": 62.61500244140625, |
| "epoch": 0.1652098741715493, |
| "grad_norm": 0.2732953727245331, |
| "kl": 1.4307941138744353, |
| "learning_rate": 9.409729839054123e-07, |
| "loss": 0.0075, |
| "reward": 1.5864750623703003, |
| "reward_std": 0.2073097825050354, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5473000288009644, |
| "step": 860, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.1379000276327133, |
| "clip_ratio/high_mean": 0.02470994950272143, |
| "clip_ratio/low_mean": 0.004926441749557853, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029636391997337343, |
| "completion_length": 77.2400032043457, |
| "epoch": 0.1671309192200557, |
| "grad_norm": 3.488050699234009, |
| "kl": 0.9351878672838211, |
| "learning_rate": 9.396208469353826e-07, |
| "loss": -0.0059, |
| "reward": 1.5735363721847535, |
| "reward_std": 0.3392061233520508, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.5436432063579559, |
| "step": 870, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.07835716316476464, |
| "clip_ratio/high_mean": 0.014919109572656453, |
| "clip_ratio/low_mean": 0.006504692946327851, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.021423802757635713, |
| "completion_length": 74.78000183105469, |
| "epoch": 0.1690519642685621, |
| "grad_norm": 5.493437767028809, |
| "kl": 1.060418888926506, |
| "learning_rate": 9.382545106947214e-07, |
| "loss": -0.0036, |
| "reward": 1.745260238647461, |
| "reward_std": 0.297343048453331, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.6254426181316376, |
| "step": 880, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.12418304020538926, |
| "clip_ratio/high_mean": 0.022332211420871318, |
| "clip_ratio/low_mean": 0.022319327194418294, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04465153906494379, |
| "completion_length": 84.53250122070312, |
| "epoch": 0.1709730093170685, |
| "grad_norm": 5.462327480316162, |
| "kl": 1.5445073664188385, |
| "learning_rate": 9.368740249779358e-07, |
| "loss": 0.0049, |
| "reward": 1.473905611038208, |
| "reward_std": 0.33463606536388396, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.49351527690887453, |
| "step": 890, |
| "zero_std_ratio": 0.25 |
| }, |
| { |
| "clip_ratio/high_max": 0.08285986992996186, |
| "clip_ratio/high_mean": 0.015584854045300744, |
| "clip_ratio/low_mean": 0.002020698119304143, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017605551629094406, |
| "completion_length": 85.78250122070312, |
| "epoch": 0.17289405436557487, |
| "grad_norm": 3.7394657135009766, |
| "kl": 1.2308152213692665, |
| "learning_rate": 9.354794400951942e-07, |
| "loss": 0.0006, |
| "reward": 1.3064285874366761, |
| "reward_std": 0.3360040634870529, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.40852679312229156, |
| "step": 900, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.06636467641219497, |
| "clip_ratio/high_mean": 0.01088127460097894, |
| "clip_ratio/low_mean": 0.005357642179296818, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01623891657218337, |
| "completion_length": 86.17000122070313, |
| "epoch": 0.17481509941408127, |
| "grad_norm": 3.883023977279663, |
| "kl": 0.5634948700666428, |
| "learning_rate": 9.340708068704917e-07, |
| "loss": -0.0132, |
| "reward": 1.6946633338928223, |
| "reward_std": 0.2633577108383179, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6004566550254822, |
| "step": 910, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.12004003385081888, |
| "clip_ratio/high_mean": 0.01987670698435977, |
| "clip_ratio/low_mean": 0.00857236894662492, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028449075785465537, |
| "completion_length": 83.18000030517578, |
| "epoch": 0.17673614446258765, |
| "grad_norm": 5.860812187194824, |
| "kl": 1.0160879641771317, |
| "learning_rate": 9.326481766397991e-07, |
| "loss": -0.0011, |
| "reward": 1.5558514595031738, |
| "reward_std": 0.28839708790183066, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5344882309436798, |
| "step": 920, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.06109805963933468, |
| "clip_ratio/high_mean": 0.00847023066598922, |
| "clip_ratio/low_mean": 0.004858631710521877, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01332886223681271, |
| "completion_length": 85.00750122070312, |
| "epoch": 0.17865718951109402, |
| "grad_norm": 2.287473440170288, |
| "kl": 0.629003182053566, |
| "learning_rate": 9.312116012491916e-07, |
| "loss": -0.0155, |
| "reward": 1.3984088182449341, |
| "reward_std": 0.38690108954906466, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.45451690554618834, |
| "step": 930, |
| "zero_std_ratio": 0.275 |
| }, |
| { |
| "clip_ratio/high_max": 0.11440350348129869, |
| "clip_ratio/high_mean": 0.021249773760791867, |
| "clip_ratio/low_mean": 0.010212704542209395, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03146247826516628, |
| "completion_length": 85.56500244140625, |
| "epoch": 0.18057823455960043, |
| "grad_norm": 2.5915870666503906, |
| "kl": 0.6908730089664459, |
| "learning_rate": 9.297611330529588e-07, |
| "loss": -0.0019, |
| "reward": 1.5472615003585815, |
| "reward_std": 0.34995803236961365, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.529568213224411, |
| "step": 940, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.11480946252122522, |
| "clip_ratio/high_mean": 0.021491143060848115, |
| "clip_ratio/low_mean": 0.007519157652859576, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029010300803929568, |
| "completion_length": 72.10000152587891, |
| "epoch": 0.1824992796081068, |
| "grad_norm": 1.5689059495925903, |
| "kl": 0.7929495573043823, |
| "learning_rate": 9.282968249116975e-07, |
| "loss": -0.0054, |
| "reward": 1.8428637742996217, |
| "reward_std": 0.2614489495754242, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6745568513870239, |
| "step": 950, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.3971266824752092, |
| "clip_ratio/high_mean": 0.05282264268025756, |
| "clip_ratio/low_mean": 0.004530514683574438, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.05735315615311265, |
| "completion_length": 70.86750030517578, |
| "epoch": 0.1844203246566132, |
| "grad_norm": 3.4463512897491455, |
| "kl": 0.8312035664916039, |
| "learning_rate": 9.268187301903852e-07, |
| "loss": 0.0003, |
| "reward": 1.6929683208465576, |
| "reward_std": 0.2562918782234192, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.600546681880951, |
| "step": 960, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.1673737466800958, |
| "clip_ratio/high_mean": 0.03157579629332759, |
| "clip_ratio/low_mean": 0.012752554472535848, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.044328349828720096, |
| "completion_length": 76.92749938964843, |
| "epoch": 0.18634136970511958, |
| "grad_norm": 3.0548853874206543, |
| "kl": 0.6291002959012986, |
| "learning_rate": 9.253269027564339e-07, |
| "loss": -0.005, |
| "reward": 1.4119353413581848, |
| "reward_std": 0.33177118599414823, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.46065517961978913, |
| "step": 970, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.15495819319039583, |
| "clip_ratio/high_mean": 0.022329012653790413, |
| "clip_ratio/low_mean": 0.006486268152366392, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02881528080906719, |
| "completion_length": 68.58250122070312, |
| "epoch": 0.18826241475362598, |
| "grad_norm": 7.065835952758789, |
| "kl": 1.0375685960054397, |
| "learning_rate": 9.238213969777292e-07, |
| "loss": -0.0046, |
| "reward": 1.6331373691558837, |
| "reward_std": 0.2626490265130997, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.5703186750411987, |
| "step": 980, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.10045178183354438, |
| "clip_ratio/high_mean": 0.020599483215482904, |
| "clip_ratio/low_mean": 0.007835417747264728, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02843490142840892, |
| "completion_length": 71.47500076293946, |
| "epoch": 0.19018345980213236, |
| "grad_norm": 4.533353328704834, |
| "kl": 2.011890631914139, |
| "learning_rate": 9.223022677206474e-07, |
| "loss": -0.0001, |
| "reward": 1.7676753044128417, |
| "reward_std": 0.25886805951595304, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6382126212120056, |
| "step": 990, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.12670890614390373, |
| "clip_ratio/high_mean": 0.022856980562210083, |
| "clip_ratio/low_mean": 0.016935013599868397, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.039791994355618955, |
| "completion_length": 70.40500106811524, |
| "epoch": 0.19210450485063874, |
| "grad_norm": 9.587749481201172, |
| "kl": 1.1125446915626527, |
| "learning_rate": 9.207695703480562e-07, |
| "loss": -0.0049, |
| "reward": 1.5464402914047242, |
| "reward_std": 0.30552313327789304, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5257201135158539, |
| "step": 1000, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.13173274043947458, |
| "clip_ratio/high_mean": 0.021644592471420764, |
| "clip_ratio/low_mean": 0.01016495683870744, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03180954959243536, |
| "completion_length": 81.64500122070312, |
| "epoch": 0.19402554989914514, |
| "grad_norm": 61.59896469116211, |
| "kl": 1.3899411320686341, |
| "learning_rate": 9.192233607172973e-07, |
| "loss": 0.0117, |
| "reward": 1.5586263418197632, |
| "reward_std": 0.32884465754032133, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5327506422996521, |
| "step": 1010, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.38223748579621314, |
| "clip_ratio/high_mean": 0.05293128285557032, |
| "clip_ratio/low_mean": 0.008536407171050087, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.06146768992766738, |
| "completion_length": 75.7425033569336, |
| "epoch": 0.19594659494765151, |
| "grad_norm": 0.8699261546134949, |
| "kl": 2.267198386788368, |
| "learning_rate": 9.17663695178151e-07, |
| "loss": 0.0007, |
| "reward": 1.4393709778785706, |
| "reward_std": 0.19248414039611816, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.4724979490041733, |
| "step": 1020, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.05449964143335819, |
| "clip_ratio/high_mean": 0.008484689320903271, |
| "clip_ratio/low_mean": 0.0017167545520351268, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010201443906407804, |
| "completion_length": 74.80750045776367, |
| "epoch": 0.19786763999615792, |
| "grad_norm": 3.8721530437469482, |
| "kl": 1.034875027090311, |
| "learning_rate": 9.160906305707814e-07, |
| "loss": -0.0065, |
| "reward": 1.6229804277420044, |
| "reward_std": 0.21886643767356873, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5624276876449585, |
| "step": 1030, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.057783479290083054, |
| "clip_ratio/high_mean": 0.008794186974409968, |
| "clip_ratio/low_mean": 0.01261859169753734, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02141277852933854, |
| "completion_length": 80.94500122070312, |
| "epoch": 0.1997886850446643, |
| "grad_norm": 2.0369646549224854, |
| "kl": 0.47016064152121545, |
| "learning_rate": 9.145042242236667e-07, |
| "loss": -0.0016, |
| "reward": 1.5200274467468262, |
| "reward_std": 0.2379522889852524, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.5147012054920197, |
| "step": 1040, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.05080003601033241, |
| "clip_ratio/high_mean": 0.0081847107532667, |
| "clip_ratio/low_mean": 0.003685746184783056, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011870456766337157, |
| "completion_length": 86.39750213623047, |
| "epoch": 0.2017097300931707, |
| "grad_norm": 1.86152184009552, |
| "kl": 0.9119557231664658, |
| "learning_rate": 9.129045339515085e-07, |
| "loss": -0.0025, |
| "reward": 1.338998556137085, |
| "reward_std": 0.29172809422016144, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.42481178045272827, |
| "step": 1050, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.11181259918957949, |
| "clip_ratio/high_mean": 0.01702371232677251, |
| "clip_ratio/low_mean": 0.003983464353950694, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.021007176581770183, |
| "completion_length": 89.0250015258789, |
| "epoch": 0.20363077514167707, |
| "grad_norm": 1.664932370185852, |
| "kl": 1.7415984645485878, |
| "learning_rate": 9.112916180531254e-07, |
| "loss": -0.0009, |
| "reward": 1.6867451906204223, |
| "reward_std": 0.26216842532157897, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.5971225798130035, |
| "step": 1060, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.1619036693125963, |
| "clip_ratio/high_mean": 0.02605230761691928, |
| "clip_ratio/low_mean": 0.011786457896232606, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03783876644447446, |
| "completion_length": 80.52750091552734, |
| "epoch": 0.20555182019018345, |
| "grad_norm": 3.1480722427368164, |
| "kl": 2.3309426337480543, |
| "learning_rate": 9.096655353093286e-07, |
| "loss": -0.0108, |
| "reward": 1.7797099113464356, |
| "reward_std": 0.3243818938732147, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6429799437522888, |
| "step": 1070, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.145719656907022, |
| "clip_ratio/high_mean": 0.02472380215767771, |
| "clip_ratio/low_mean": 0.01881317695369944, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.043536979146301745, |
| "completion_length": 75.46750183105469, |
| "epoch": 0.20747286523868985, |
| "grad_norm": 4.7426347732543945, |
| "kl": 0.7767296731472015, |
| "learning_rate": 9.080263449807788e-07, |
| "loss": 0.0042, |
| "reward": 1.5128322124481202, |
| "reward_std": 0.26058112680912016, |
| "rewards/code_format_reward": 0.9662500023841858, |
| "rewards/code_reward": 0.514853572845459, |
| "step": 1080, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.04860758520662785, |
| "clip_ratio/high_mean": 0.00921072952914983, |
| "clip_ratio/low_mean": 0.013458288778201677, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.022669017571024595, |
| "completion_length": 77.22750244140624, |
| "epoch": 0.20939391028719623, |
| "grad_norm": 2.2836835384368896, |
| "kl": 0.6794285923242569, |
| "learning_rate": 9.063741068058278e-07, |
| "loss": -0.0028, |
| "reward": 1.5665315628051757, |
| "reward_std": 0.23656646013259888, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5398283064365387, |
| "step": 1090, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.2074673067778349, |
| "clip_ratio/high_mean": 0.036228268034756185, |
| "clip_ratio/low_mean": 0.003734398238157155, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.039962667226791385, |
| "completion_length": 91.18000030517578, |
| "epoch": 0.21131495533570263, |
| "grad_norm": 7.916996002197266, |
| "kl": 1.0919141083955766, |
| "learning_rate": 9.0470888099834e-07, |
| "loss": 0.1666, |
| "reward": 1.68690767288208, |
| "reward_std": 0.32907233834266664, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5984537959098816, |
| "step": 1100, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.16652454435825348, |
| "clip_ratio/high_mean": 0.027045656740665436, |
| "clip_ratio/low_mean": 0.006342244842380751, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03338790265843272, |
| "completion_length": 80.02000122070312, |
| "epoch": 0.213236000384209, |
| "grad_norm": 24.34583282470703, |
| "kl": 1.00138920545578, |
| "learning_rate": 9.030307282454995e-07, |
| "loss": -0.0023, |
| "reward": 1.6111816883087158, |
| "reward_std": 0.24880893230438234, |
| "rewards/code_format_reward": 0.9724999904632569, |
| "rewards/code_reward": 0.5624658226966858, |
| "step": 1110, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.18198216175660492, |
| "clip_ratio/high_mean": 0.02493738690391183, |
| "clip_ratio/low_mean": 0.004894328210502863, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029831714881584048, |
| "completion_length": 71.95, |
| "epoch": 0.2151570454327154, |
| "grad_norm": 2.7608304023742676, |
| "kl": 0.971074515581131, |
| "learning_rate": 9.013397097055971e-07, |
| "loss": -0.0022, |
| "reward": 1.6884326457977294, |
| "reward_std": 0.3369467526674271, |
| "rewards/code_format_reward": 0.9712499856948853, |
| "rewards/code_reward": 0.6014038324356079, |
| "step": 1120, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.16543546952307225, |
| "clip_ratio/high_mean": 0.02493141880258918, |
| "clip_ratio/low_mean": 0.007064808573340997, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03199622761458158, |
| "completion_length": 72.50500030517578, |
| "epoch": 0.21707809048122179, |
| "grad_norm": 7.147952556610107, |
| "kl": 6.163409499824047, |
| "learning_rate": 8.996358870058017e-07, |
| "loss": 0.0081, |
| "reward": 1.5753276348114014, |
| "reward_std": 0.2175431028008461, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.5395387947559357, |
| "step": 1130, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.05989155264105648, |
| "clip_ratio/high_mean": 0.009021314003621227, |
| "clip_ratio/low_mean": 0.014251881884410978, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023273196443915366, |
| "completion_length": 74.57750091552734, |
| "epoch": 0.21899913552972816, |
| "grad_norm": 17.58907699584961, |
| "kl": 0.9839092344045639, |
| "learning_rate": 8.979193222399154e-07, |
| "loss": -0.0006, |
| "reward": 1.570918822288513, |
| "reward_std": 0.27486068904399874, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5420219123363494, |
| "step": 1140, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.23600016683340072, |
| "clip_ratio/high_mean": 0.04525289600715041, |
| "clip_ratio/low_mean": 0.00799154011765495, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.05324443739373237, |
| "completion_length": 71.89750061035156, |
| "epoch": 0.22092018057823457, |
| "grad_norm": 8.010896682739258, |
| "kl": 1.0768774889409543, |
| "learning_rate": 8.961900779661095e-07, |
| "loss": 0.0139, |
| "reward": 1.5848765134811402, |
| "reward_std": 0.21965934410691262, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5458757638931274, |
| "step": 1150, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.10782922431826591, |
| "clip_ratio/high_mean": 0.014393238560296595, |
| "clip_ratio/low_mean": 0.0046036563231609765, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018996895058080554, |
| "completion_length": 78.2925018310547, |
| "epoch": 0.22284122562674094, |
| "grad_norm": 3.7750465869903564, |
| "kl": 0.5210637584328651, |
| "learning_rate": 8.944482172046448e-07, |
| "loss": -0.0065, |
| "reward": 1.6227028608322143, |
| "reward_std": 0.2484603613615036, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.5660388946533204, |
| "step": 1160, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.13120641289278864, |
| "clip_ratio/high_mean": 0.019719564472325146, |
| "clip_ratio/low_mean": 0.00696407729992643, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.026683641644194723, |
| "completion_length": 81.64000091552734, |
| "epoch": 0.22476227067524734, |
| "grad_norm": 1.1691230535507202, |
| "kl": 0.5908193171024323, |
| "learning_rate": 8.926938034355751e-07, |
| "loss": -0.0008, |
| "reward": 1.6598936080932618, |
| "reward_std": 0.3073273479938507, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.5830717980861664, |
| "step": 1170, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.26425148695707323, |
| "clip_ratio/high_mean": 0.03642228813841939, |
| "clip_ratio/low_mean": 0.0025068818649742752, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03892916943877935, |
| "completion_length": 83.06500244140625, |
| "epoch": 0.22668331572375372, |
| "grad_norm": 5.047176361083984, |
| "kl": 0.8601905956864357, |
| "learning_rate": 8.90926900596434e-07, |
| "loss": 0.019, |
| "reward": 1.6030859470367431, |
| "reward_std": 0.18358819633722306, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.5549804508686066, |
| "step": 1180, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.20188620835542678, |
| "clip_ratio/high_mean": 0.03365288833156228, |
| "clip_ratio/low_mean": 0.012162915989756584, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04581580460071564, |
| "completion_length": 80.93500061035157, |
| "epoch": 0.2286043607722601, |
| "grad_norm": 3.431043863296509, |
| "kl": 3.284740853309631, |
| "learning_rate": 8.891475730799039e-07, |
| "loss": -0.0024, |
| "reward": 1.719798493385315, |
| "reward_std": 0.2678588882088661, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.6127117216587067, |
| "step": 1190, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.13805483505129815, |
| "clip_ratio/high_mean": 0.02111883880570531, |
| "clip_ratio/low_mean": 0.002508872369071469, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023627711273729802, |
| "completion_length": 87.33250274658204, |
| "epoch": 0.2305254058207665, |
| "grad_norm": 4.731442928314209, |
| "kl": 1.1696231275796891, |
| "learning_rate": 8.873558857314706e-07, |
| "loss": -0.0053, |
| "reward": 1.7580220222473144, |
| "reward_std": 0.28411929309368134, |
| "rewards/code_format_reward": 0.9900000095367432, |
| "rewards/code_reward": 0.6315110087394714, |
| "step": 1200, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.07043634681031108, |
| "clip_ratio/high_mean": 0.009235845855437219, |
| "clip_ratio/low_mean": 0.017453379271319135, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02668922524899244, |
| "completion_length": 86.74250030517578, |
| "epoch": 0.23244645086927287, |
| "grad_norm": 23.686250686645508, |
| "kl": 1.7613270074129104, |
| "learning_rate": 8.855519038470587e-07, |
| "loss": 0.91, |
| "reward": 1.8096629619598388, |
| "reward_std": 0.2700611263513565, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6592064738273621, |
| "step": 1210, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.1193816315382719, |
| "clip_ratio/high_mean": 0.01799508691765368, |
| "clip_ratio/low_mean": 0.0052341839407745285, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023229270869342143, |
| "completion_length": 91.73750152587891, |
| "epoch": 0.23436749591777928, |
| "grad_norm": 5.015241622924805, |
| "kl": 87723751.16166303, |
| "learning_rate": 8.83735693170653e-07, |
| "loss": 178666.875, |
| "reward": 1.5409840583801269, |
| "reward_std": 0.3586106300354004, |
| "rewards/code_format_reward": 0.9687500119209289, |
| "rewards/code_reward": 0.5283045113086701, |
| "step": 1220, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.15788686936721205, |
| "clip_ratio/high_mean": 0.02180835944600403, |
| "clip_ratio/low_mean": 0.004957044991897419, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.026765404315665365, |
| "completion_length": 83.70250091552734, |
| "epoch": 0.23628854096628565, |
| "grad_norm": 2.7140953540802, |
| "kl": 0.755669391900301, |
| "learning_rate": 8.81907319891902e-07, |
| "loss": -0.0099, |
| "reward": 1.8449480056762695, |
| "reward_std": 0.28006095588207247, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6755990028381348, |
| "step": 1230, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.12416752465069295, |
| "clip_ratio/high_mean": 0.01972346901893616, |
| "clip_ratio/low_mean": 0.01847981174942106, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0382032809779048, |
| "completion_length": 91.47000274658203, |
| "epoch": 0.23820958601479206, |
| "grad_norm": 10.781957626342773, |
| "kl": 1.0129390999674797, |
| "learning_rate": 8.800668506437059e-07, |
| "loss": 0.0011, |
| "reward": 1.6923505306243896, |
| "reward_std": 0.3265227422118187, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.6014877319335937, |
| "step": 1240, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.12042212830856443, |
| "clip_ratio/high_mean": 0.017916655144654216, |
| "clip_ratio/low_mean": 0.007017276567057707, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02493393179029226, |
| "completion_length": 76.6675018310547, |
| "epoch": 0.24013063106329843, |
| "grad_norm": 47.773136138916016, |
| "kl": 1.4071896970272064, |
| "learning_rate": 8.782143524997882e-07, |
| "loss": 0.0018, |
| "reward": 1.6722928285598755, |
| "reward_std": 0.25374017357826234, |
| "rewards/code_format_reward": 0.9824999809265137, |
| "rewards/code_reward": 0.5905213832855225, |
| "step": 1250, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.08169625541195273, |
| "clip_ratio/high_mean": 0.013112110400106758, |
| "clip_ratio/low_mean": 0.003914138658728916, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01702624891186133, |
| "completion_length": 78.23750152587891, |
| "epoch": 0.2420516761118048, |
| "grad_norm": 2688.99462890625, |
| "kl": 9.395949372649193, |
| "learning_rate": 8.76349892972251e-07, |
| "loss": 0.1943, |
| "reward": 1.5601455688476562, |
| "reward_std": 0.3348282665014267, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5372602701187134, |
| "step": 1260, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.13095853393897414, |
| "clip_ratio/high_mean": 0.018921413994394242, |
| "clip_ratio/low_mean": 0.018763081403449178, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03768449563067407, |
| "completion_length": 76.1500015258789, |
| "epoch": 0.2439727211603112, |
| "grad_norm": 3.0777931213378906, |
| "kl": 1.7352074533700943, |
| "learning_rate": 8.744735400091154e-07, |
| "loss": 0.0055, |
| "reward": 1.633968448638916, |
| "reward_std": 0.23277063071727752, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.5713592231273651, |
| "step": 1270, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.15694143967702984, |
| "clip_ratio/high_mean": 0.026766782545018943, |
| "clip_ratio/low_mean": 0.010570818380801938, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03733760174363852, |
| "completion_length": 76.13500213623047, |
| "epoch": 0.2458937662088176, |
| "grad_norm": 2.8748385906219482, |
| "kl": 3.007472372055054, |
| "learning_rate": 8.725853619918444e-07, |
| "loss": 0.0249, |
| "reward": 1.4643328666687012, |
| "reward_std": 0.2899716466665268, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.48716638684272767, |
| "step": 1280, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.07734788609668612, |
| "clip_ratio/high_mean": 0.013521577988285571, |
| "clip_ratio/low_mean": 0.002974278874171432, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01649585694540292, |
| "completion_length": 77.6050033569336, |
| "epoch": 0.247814811257324, |
| "grad_norm": 4.51137638092041, |
| "kl": 0.6521440967917442, |
| "learning_rate": 8.706854277328507e-07, |
| "loss": -0.0065, |
| "reward": 1.663088607788086, |
| "reward_std": 0.29463320076465604, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.5843567848205566, |
| "step": 1290, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.12926983460783958, |
| "clip_ratio/high_mean": 0.016393666993826626, |
| "clip_ratio/low_mean": 0.024948839796707034, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04134250609204173, |
| "completion_length": 74.63750305175782, |
| "epoch": 0.24973585630583037, |
| "grad_norm": 7.019649982452393, |
| "kl": 0.6837658904492855, |
| "learning_rate": 8.687738064729902e-07, |
| "loss": -0.0022, |
| "reward": 1.6927862167358398, |
| "reward_std": 0.14656674191355706, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5973306179046631, |
| "step": 1300, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.15073961750604212, |
| "clip_ratio/high_mean": 0.024888798157917336, |
| "clip_ratio/low_mean": 0.004707413475262001, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02959621128393337, |
| "completion_length": 79.17500152587891, |
| "epoch": 0.25165690135433677, |
| "grad_norm": 3.9428677558898926, |
| "kl": 1.0088127315044404, |
| "learning_rate": 8.668505678790368e-07, |
| "loss": 0.7445, |
| "reward": 1.5962260961532593, |
| "reward_std": 0.22741070687770842, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.5528005361557007, |
| "step": 1310, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.0862931152805686, |
| "clip_ratio/high_mean": 0.016994312894530593, |
| "clip_ratio/low_mean": 0.0031913593309582213, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.020185671979561448, |
| "completion_length": 79.30500183105468, |
| "epoch": 0.25357794640284315, |
| "grad_norm": 2.810743808746338, |
| "kl": 2.0237294919788837, |
| "learning_rate": 8.649157820411451e-07, |
| "loss": -0.0028, |
| "reward": 1.6300202369689942, |
| "reward_std": 0.2859074264764786, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5712601006031036, |
| "step": 1320, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.14902311654295772, |
| "clip_ratio/high_mean": 0.02855427504691761, |
| "clip_ratio/low_mean": 0.012185945303644984, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.040740220062434676, |
| "completion_length": 70.88000030517578, |
| "epoch": 0.2554989914513495, |
| "grad_norm": 4.68557071685791, |
| "kl": 1.2288852274417876, |
| "learning_rate": 8.629695194702949e-07, |
| "loss": -0.0057, |
| "reward": 1.4114359855651855, |
| "reward_std": 0.2626632884144783, |
| "rewards/code_format_reward": 0.9625, |
| "rewards/code_reward": 0.46509301066398623, |
| "step": 1330, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.11323303133249282, |
| "clip_ratio/high_mean": 0.016216285666450857, |
| "clip_ratio/low_mean": 0.0045135776337701826, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02072986289858818, |
| "completion_length": 71.99250030517578, |
| "epoch": 0.2574200364998559, |
| "grad_norm": 43.944698333740234, |
| "kl": 1.446278090775013, |
| "learning_rate": 8.610118510957221e-07, |
| "loss": 0.0112, |
| "reward": 1.5807109117507934, |
| "reward_std": 0.23466840982437134, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5469179153442383, |
| "step": 1340, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.20504833161830902, |
| "clip_ratio/high_mean": 0.029384778672829272, |
| "clip_ratio/low_mean": 0.006734570109983906, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03611934892833233, |
| "completion_length": 69.60750198364258, |
| "epoch": 0.25934108154836233, |
| "grad_norm": 3.4515652656555176, |
| "kl": 1.288391387462616, |
| "learning_rate": 8.59042848262334e-07, |
| "loss": 0.0022, |
| "reward": 1.7648874998092652, |
| "reward_std": 0.29008678793907167, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.6340062260627747, |
| "step": 1350, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.18540791552513838, |
| "clip_ratio/high_mean": 0.030647353292442857, |
| "clip_ratio/low_mean": 0.0048290589373209515, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03547641267068684, |
| "completion_length": 73.8150016784668, |
| "epoch": 0.2612621265968687, |
| "grad_norm": 24.974191665649414, |
| "kl": 1.361786951869726, |
| "learning_rate": 8.570625827281077e-07, |
| "loss": -0.0015, |
| "reward": 1.6352276086807251, |
| "reward_std": 0.20483867302536965, |
| "rewards/code_format_reward": 0.9712500095367431, |
| "rewards/code_reward": 0.5748012781143188, |
| "step": 1360, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.25138766765594484, |
| "clip_ratio/high_mean": 0.043486443860456345, |
| "clip_ratio/low_mean": 0.006613140180706978, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.05009958455339074, |
| "completion_length": 85.41999969482421, |
| "epoch": 0.2631831716453751, |
| "grad_norm": 0.2826422452926636, |
| "kl": 1.1484392315149308, |
| "learning_rate": 8.550711266614774e-07, |
| "loss": -0.0015, |
| "reward": 1.5049166679382324, |
| "reward_std": 0.17118329852819442, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.5090208292007447, |
| "step": 1370, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.10418513733893633, |
| "clip_ratio/high_mean": 0.017387184244580568, |
| "clip_ratio/low_mean": 0.006483422458404675, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023870606115087865, |
| "completion_length": 78.00750274658203, |
| "epoch": 0.26510421669388146, |
| "grad_norm": 0.43826720118522644, |
| "kl": 0.5077251173555851, |
| "learning_rate": 8.530685526387023e-07, |
| "loss": 0.0071, |
| "reward": 1.5417476654052735, |
| "reward_std": 0.2806018695235252, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5271238267421723, |
| "step": 1380, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.12633447310654447, |
| "clip_ratio/high_mean": 0.01944113611098146, |
| "clip_ratio/low_mean": 0.02114583211950958, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04058696813735878, |
| "completion_length": 69.89499969482422, |
| "epoch": 0.26702526174238783, |
| "grad_norm": 3.222648859024048, |
| "kl": 0.8532382689416409, |
| "learning_rate": 8.510549336412227e-07, |
| "loss": 0.2832, |
| "reward": 1.4325429320335388, |
| "reward_std": 0.23379142954945564, |
| "rewards/code_format_reward": 0.95625, |
| "rewards/code_reward": 0.47720896899700166, |
| "step": 1390, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.10978957340121269, |
| "clip_ratio/high_mean": 0.015348212420940399, |
| "clip_ratio/low_mean": 0.00886362442979589, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024211836606264116, |
| "completion_length": 74.32000198364258, |
| "epoch": 0.26894630679089426, |
| "grad_norm": 511.98333740234375, |
| "kl": 6.762348529696465, |
| "learning_rate": 8.490303430529996e-07, |
| "loss": 0.0097, |
| "reward": 1.5433219909667968, |
| "reward_std": 0.3002948135137558, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.5269734919071197, |
| "step": 1400, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.021737607452087103, |
| "clip_ratio/high_mean": 0.004128801400656812, |
| "clip_ratio/low_mean": 0.008135353482794016, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012264154804870486, |
| "completion_length": 70.88250122070312, |
| "epoch": 0.27086735183940064, |
| "grad_norm": 4.558300018310547, |
| "kl": 1.0645984336733818, |
| "learning_rate": 8.469948546578406e-07, |
| "loss": -0.002, |
| "reward": 1.711915636062622, |
| "reward_std": 0.23479849100112915, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6090827941894531, |
| "step": 1410, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.31115832179784775, |
| "clip_ratio/high_mean": 0.04542893636971712, |
| "clip_ratio/low_mean": 0.004167796808178537, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04959673266857863, |
| "completion_length": 82.51750335693359, |
| "epoch": 0.272788396887907, |
| "grad_norm": 26.85635757446289, |
| "kl": 0.6633755072951317, |
| "learning_rate": 8.449485426367113e-07, |
| "loss": -0.0044, |
| "reward": 1.8086278200149537, |
| "reward_std": 0.25109012275934217, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6577514052391052, |
| "step": 1420, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.21433292645961047, |
| "clip_ratio/high_mean": 0.027504962938837706, |
| "clip_ratio/low_mean": 0.007746222103014589, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03525118476245552, |
| "completion_length": 69.12750015258788, |
| "epoch": 0.2747094419364134, |
| "grad_norm": 39.272727966308594, |
| "kl": 2.1152508199214934, |
| "learning_rate": 8.428914815650318e-07, |
| "loss": 56.6465, |
| "reward": 1.5950207233428955, |
| "reward_std": 0.25626782774925233, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.5518853664398193, |
| "step": 1430, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.14325670124962925, |
| "clip_ratio/high_mean": 0.02268084152601659, |
| "clip_ratio/low_mean": 0.006528474338119849, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029209316370543092, |
| "completion_length": 67.30000076293945, |
| "epoch": 0.2766304869849198, |
| "grad_norm": 4.287910461425781, |
| "kl": 1.2686308354139328, |
| "learning_rate": 8.408237464099576e-07, |
| "loss": 9.8201, |
| "reward": 1.6364605188369752, |
| "reward_std": 0.22813104093074799, |
| "rewards/code_format_reward": 0.9749999880790711, |
| "rewards/code_reward": 0.5744802415370941, |
| "step": 1440, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.18851536950096487, |
| "clip_ratio/high_mean": 0.024719347018981354, |
| "clip_ratio/low_mean": 0.013444452191470191, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.038163799053290856, |
| "completion_length": 82.13250274658203, |
| "epoch": 0.2785515320334262, |
| "grad_norm": 0.4786536991596222, |
| "kl": 8.468844538927078, |
| "learning_rate": 8.387454125276494e-07, |
| "loss": 0.0456, |
| "reward": 1.7758944988250733, |
| "reward_std": 0.1511917643249035, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6410722196102142, |
| "step": 1450, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.15984937213361264, |
| "clip_ratio/high_mean": 0.025054804515093565, |
| "clip_ratio/low_mean": 0.01257994698244147, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03763475017622113, |
| "completion_length": 79.66000213623047, |
| "epoch": 0.2804725770819326, |
| "grad_norm": 3.223284959793091, |
| "kl": 1.7015444114804268, |
| "learning_rate": 8.366565556605258e-07, |
| "loss": 0.0276, |
| "reward": 1.5976650953292846, |
| "reward_std": 0.341750779747963, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.5566450238227845, |
| "step": 1460, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.27182711616624144, |
| "clip_ratio/high_mean": 0.040798351392732, |
| "clip_ratio/low_mean": 0.002200227712455671, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04299857785226777, |
| "completion_length": 79.22250213623047, |
| "epoch": 0.28239362213043895, |
| "grad_norm": 1.4845157861709595, |
| "kl": 1.693036738038063, |
| "learning_rate": 8.345572519345031e-07, |
| "loss": -0.0017, |
| "reward": 1.7161717653274535, |
| "reward_std": 0.2422049015760422, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.612460857629776, |
| "step": 1470, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.17126517184078693, |
| "clip_ratio/high_mean": 0.025960111571475864, |
| "clip_ratio/low_mean": 0.00444280517695006, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.030402917158789934, |
| "completion_length": 83.31750183105468, |
| "epoch": 0.2843146671789453, |
| "grad_norm": 5.96829080581665, |
| "kl": 0.574289733916521, |
| "learning_rate": 8.324475778562209e-07, |
| "loss": -0.0061, |
| "reward": 1.7776363611221313, |
| "reward_std": 0.2358689785003662, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6428806602954864, |
| "step": 1480, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.2015096817165613, |
| "clip_ratio/high_mean": 0.03324723746627569, |
| "clip_ratio/low_mean": 0.00480144299363019, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.038048680778592824, |
| "completion_length": 73.28000106811524, |
| "epoch": 0.28623571222745176, |
| "grad_norm": 6.496949672698975, |
| "kl": 0.6653359919786453, |
| "learning_rate": 8.30327610310254e-07, |
| "loss": 0.0021, |
| "reward": 1.6191941976547242, |
| "reward_std": 0.31718442738056185, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.5639720797538758, |
| "step": 1490, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.15795932533219456, |
| "clip_ratio/high_mean": 0.02212390162749216, |
| "clip_ratio/low_mean": 0.00480329486890696, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.026927196700125933, |
| "completion_length": 73.78000106811524, |
| "epoch": 0.28815675727595813, |
| "grad_norm": 5.75892972946167, |
| "kl": 0.46196936070919037, |
| "learning_rate": 8.281974265563108e-07, |
| "loss": -0.0045, |
| "reward": 1.7829506158828736, |
| "reward_std": 0.17953601479530334, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.642725282907486, |
| "step": 1500, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.24582542856223882, |
| "clip_ratio/high_mean": 0.030850262753665446, |
| "clip_ratio/low_mean": 0.005616182333324104, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03646644388791174, |
| "completion_length": 77.69500198364258, |
| "epoch": 0.2900778023244645, |
| "grad_norm": 326340576.0, |
| "kl": 0.605505321919918, |
| "learning_rate": 8.260571042264166e-07, |
| "loss": 8518.9961, |
| "reward": 1.7113344192504882, |
| "reward_std": 0.18693218380212784, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6087921977043151, |
| "step": 1510, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.22759323129430414, |
| "clip_ratio/high_mean": 0.03405714362161234, |
| "clip_ratio/low_mean": 0.0032101303557283247, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03726727362954989, |
| "completion_length": 75.53250122070312, |
| "epoch": 0.2919988473729709, |
| "grad_norm": 2.2893807888031006, |
| "kl": 0.5214515089988708, |
| "learning_rate": 8.23906721322086e-07, |
| "loss": 0.0027, |
| "reward": 1.6311777591705323, |
| "reward_std": 0.17696685791015626, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5690263509750366, |
| "step": 1520, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.06725322343409061, |
| "clip_ratio/high_mean": 0.010706762981135398, |
| "clip_ratio/low_mean": 0.0018884234530560206, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012595186498947442, |
| "completion_length": 78.90999908447266, |
| "epoch": 0.29391989242147726, |
| "grad_norm": 2.6211440563201904, |
| "kl": 0.5930808052420616, |
| "learning_rate": 8.217463562114786e-07, |
| "loss": -0.0035, |
| "reward": 1.7637510299682617, |
| "reward_std": 0.209340962767601, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.6365630030632019, |
| "step": 1530, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.06629493543878198, |
| "clip_ratio/high_mean": 0.012000571249518543, |
| "clip_ratio/low_mean": 0.010053297760896385, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.022053868882358073, |
| "completion_length": 77.66250152587891, |
| "epoch": 0.2958409374699837, |
| "grad_norm": 0.5937472581863403, |
| "kl": 0.6556157968938351, |
| "learning_rate": 8.195760876265438e-07, |
| "loss": 0.0023, |
| "reward": 1.4144308805465697, |
| "reward_std": 0.12647379338741302, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.461590439081192, |
| "step": 1540, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.2611659773625433, |
| "clip_ratio/high_mean": 0.05069012229796499, |
| "clip_ratio/low_mean": 0.009917778367525897, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.06060790033079684, |
| "completion_length": 80.60250091552734, |
| "epoch": 0.29776198251849006, |
| "grad_norm": 7.297484874725342, |
| "kl": 2.139972834289074, |
| "learning_rate": 8.173959946601519e-07, |
| "loss": 0.0662, |
| "reward": 1.6416264057159424, |
| "reward_std": 0.3118141442537308, |
| "rewards/code_format_reward": 0.9749999880790711, |
| "rewards/code_reward": 0.5770631790161133, |
| "step": 1550, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.14441705606877803, |
| "clip_ratio/high_mean": 0.023728324193507434, |
| "clip_ratio/low_mean": 0.005098688977886923, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028827012795954943, |
| "completion_length": 77.17750244140625, |
| "epoch": 0.29968302756699644, |
| "grad_norm": 5.614815711975098, |
| "kl": 0.5137595549225807, |
| "learning_rate": 8.152061567632108e-07, |
| "loss": -0.0057, |
| "reward": 1.5097593545913697, |
| "reward_std": 0.29559260606765747, |
| "rewards/code_format_reward": 0.9575000047683716, |
| "rewards/code_reward": 0.5155046641826629, |
| "step": 1560, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.03715956890955567, |
| "clip_ratio/high_mean": 0.006024846772197634, |
| "clip_ratio/low_mean": 0.009319488028995692, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015344334836117923, |
| "completion_length": 76.34000091552734, |
| "epoch": 0.3016040726155028, |
| "grad_norm": 5.059381008148193, |
| "kl": 0.8711868159472942, |
| "learning_rate": 8.130066537417707e-07, |
| "loss": -0.0003, |
| "reward": 1.4149085521697997, |
| "reward_std": 0.19155050422996284, |
| "rewards/code_format_reward": 0.9749999880790711, |
| "rewards/code_reward": 0.463704252243042, |
| "step": 1570, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.0935845285654068, |
| "clip_ratio/high_mean": 0.013573423656634987, |
| "clip_ratio/low_mean": 0.00990565216197865, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023479075590148567, |
| "completion_length": 83.97500152587891, |
| "epoch": 0.30352511766400925, |
| "grad_norm": 2.025956869125366, |
| "kl": 0.9980318561196327, |
| "learning_rate": 8.10797565754116e-07, |
| "loss": -0.0041, |
| "reward": 1.5444376945495606, |
| "reward_std": 0.19510383605957032, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.525031316280365, |
| "step": 1580, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.11659459788352251, |
| "clip_ratio/high_mean": 0.016526972700376064, |
| "clip_ratio/low_mean": 0.0030368489184184, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019563821679912507, |
| "completion_length": 90.33000335693359, |
| "epoch": 0.3054461627125156, |
| "grad_norm": 4.901747703552246, |
| "kl": 0.6650052145123482, |
| "learning_rate": 8.085789733078439e-07, |
| "loss": 0.9063, |
| "reward": 1.6000897407531738, |
| "reward_std": 0.20618843138217927, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5556698679924011, |
| "step": 1590, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.1246914654970169, |
| "clip_ratio/high_mean": 0.018419789243489505, |
| "clip_ratio/low_mean": 0.0033823222620412707, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.021802110970020293, |
| "completion_length": 82.78250122070312, |
| "epoch": 0.307367207761022, |
| "grad_norm": 16365.4453125, |
| "kl": 83.84930176734925, |
| "learning_rate": 8.063509572569303e-07, |
| "loss": 0.4123, |
| "reward": 1.8164207458496093, |
| "reward_std": 0.25260339230298995, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6613353252410888, |
| "step": 1600, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.18187000900506972, |
| "clip_ratio/high_mean": 0.026620355295017363, |
| "clip_ratio/low_mean": 0.011157544914749452, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03777789976447821, |
| "completion_length": 72.65250244140626, |
| "epoch": 0.3092882528095284, |
| "grad_norm": 2.8136842250823975, |
| "kl": 0.9565572030842304, |
| "learning_rate": 8.041135987987831e-07, |
| "loss": 0.0037, |
| "reward": 1.7599462985992431, |
| "reward_std": 0.26825075447559354, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6324730753898621, |
| "step": 1610, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.03404317735694349, |
| "clip_ratio/high_mean": 0.006068735342705622, |
| "clip_ratio/low_mean": 0.010824382931605214, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016893118200823665, |
| "completion_length": 78.07500305175782, |
| "epoch": 0.31120929785803475, |
| "grad_norm": 31.179058074951172, |
| "kl": 0.560398967564106, |
| "learning_rate": 8.018669794712835e-07, |
| "loss": -0.0011, |
| "reward": 1.5130140781402588, |
| "reward_std": 0.2716240629553795, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5115070700645447, |
| "step": 1620, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.06218870538286865, |
| "clip_ratio/high_mean": 0.008549430634593591, |
| "clip_ratio/low_mean": 0.007052442076383158, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01560187318827957, |
| "completion_length": 83.56500091552735, |
| "epoch": 0.3131303429065412, |
| "grad_norm": 0.6899747252464294, |
| "kl": 0.7204694971442223, |
| "learning_rate": 7.996111811498138e-07, |
| "loss": 0.0031, |
| "reward": 1.687961721420288, |
| "reward_std": 0.19512347355484963, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.5958558440208435, |
| "step": 1630, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.17274489336414262, |
| "clip_ratio/high_mean": 0.021967002666497138, |
| "clip_ratio/low_mean": 0.009596503502689303, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03156350784411188, |
| "completion_length": 80.9175033569336, |
| "epoch": 0.31505138795504756, |
| "grad_norm": 2.105334758758545, |
| "kl": 0.8054538488388061, |
| "learning_rate": 7.97346286044274e-07, |
| "loss": -0.0058, |
| "reward": 1.3176400899887084, |
| "reward_std": 0.20478213280439378, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.41350752413272857, |
| "step": 1640, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.16534482885617763, |
| "clip_ratio/high_mean": 0.02735080250131432, |
| "clip_ratio/low_mean": 0.0035748321075516286, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.030925634037703275, |
| "completion_length": 74.01250228881835, |
| "epoch": 0.31697243300355393, |
| "grad_norm": 184916.921875, |
| "kl": 28.671802641451357, |
| "learning_rate": 7.950723766960857e-07, |
| "loss": 5.579, |
| "reward": 1.6360910892486573, |
| "reward_std": 0.2874180316925049, |
| "rewards/code_format_reward": 0.9687500119209289, |
| "rewards/code_reward": 0.5758580267429352, |
| "step": 1650, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.10983106552157551, |
| "clip_ratio/high_mean": 0.016536441215430388, |
| "clip_ratio/low_mean": 0.011150279239518567, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.027686719762277788, |
| "completion_length": 84.17750244140625, |
| "epoch": 0.3188934780520603, |
| "grad_norm": 219305424.0, |
| "kl": 106.82060827612877, |
| "learning_rate": 7.927895359751835e-07, |
| "loss": 5248.6121, |
| "reward": 1.5329812049865723, |
| "reward_std": 0.22349740117788314, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5221156060695649, |
| "step": 1660, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.13622083119116724, |
| "clip_ratio/high_mean": 0.01933064509066753, |
| "clip_ratio/low_mean": 0.005038347843219526, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024368993006646633, |
| "completion_length": 80.39500274658204, |
| "epoch": 0.3208145231005667, |
| "grad_norm": 9.519110679626465, |
| "kl": 0.7214748501777649, |
| "learning_rate": 7.904978470769959e-07, |
| "loss": -0.0025, |
| "reward": 1.6617871284484864, |
| "reward_std": 0.27498180270195005, |
| "rewards/code_format_reward": 0.95625, |
| "rewards/code_reward": 0.5918310403823852, |
| "step": 1670, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.09761472065001726, |
| "clip_ratio/high_mean": 0.01911984165199101, |
| "clip_ratio/low_mean": 0.010301339952275158, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02942118220962584, |
| "completion_length": 74.54750213623046, |
| "epoch": 0.3227355681490731, |
| "grad_norm": 6.143461227416992, |
| "kl": 0.7205829441547393, |
| "learning_rate": 7.881973935194124e-07, |
| "loss": 0.0015, |
| "reward": 1.4262179613113404, |
| "reward_std": 0.26740061640739443, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.4696714758872986, |
| "step": 1680, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.07396706650033594, |
| "clip_ratio/high_mean": 0.011737752065528184, |
| "clip_ratio/low_mean": 0.005250315659213811, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016988067945931107, |
| "completion_length": 75.27500228881836, |
| "epoch": 0.3246566131975795, |
| "grad_norm": 2.337491989135742, |
| "kl": 68.4789316162467, |
| "learning_rate": 7.858882591397403e-07, |
| "loss": 0.3045, |
| "reward": 1.527750849723816, |
| "reward_std": 0.26877219378948214, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5163754165172577, |
| "step": 1690, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.28693441725336016, |
| "clip_ratio/high_mean": 0.04205623795860447, |
| "clip_ratio/low_mean": 0.009473194915335626, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.051529432012466715, |
| "completion_length": 84.14500274658204, |
| "epoch": 0.32657765824608587, |
| "grad_norm": 20.964569091796875, |
| "kl": 0.5620399042963982, |
| "learning_rate": 7.835705280916488e-07, |
| "loss": -0.0051, |
| "reward": 1.615627408027649, |
| "reward_std": 0.2002291887998581, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.5590636849403381, |
| "step": 1700, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.2242162274196744, |
| "clip_ratio/high_mean": 0.036464582500047985, |
| "clip_ratio/low_mean": 0.010222097241785378, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.046686679660342636, |
| "completion_length": 78.56000061035157, |
| "epoch": 0.32849870329459224, |
| "grad_norm": 3.2044875621795654, |
| "kl": 0.7747909784317016, |
| "learning_rate": 7.812442848421032e-07, |
| "loss": -0.0006, |
| "reward": 1.6169416427612304, |
| "reward_std": 0.24999960064888, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5612833142280579, |
| "step": 1710, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.10125078996643425, |
| "clip_ratio/high_mean": 0.019883562461473048, |
| "clip_ratio/low_mean": 0.014126901775307487, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.034010464209131896, |
| "completion_length": 73.05000152587891, |
| "epoch": 0.3304197483430986, |
| "grad_norm": 735.9865112304688, |
| "kl": 2.3181345582008364, |
| "learning_rate": 7.789096141682851e-07, |
| "loss": 0.1213, |
| "reward": 1.371981406211853, |
| "reward_std": 0.17790164202451705, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.44317818284034727, |
| "step": 1720, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.15698121464811265, |
| "clip_ratio/high_mean": 0.026607585436431692, |
| "clip_ratio/low_mean": 0.004372719774255529, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.030980303999967873, |
| "completion_length": 78.5425033569336, |
| "epoch": 0.33234079339160505, |
| "grad_norm": 2.3281009197235107, |
| "kl": 1.7815167903900146, |
| "learning_rate": 7.765666011545045e-07, |
| "loss": 0.4359, |
| "reward": 1.669968068599701, |
| "reward_std": 0.18121034651994705, |
| "rewards/code_format_reward": 0.9737499833106995, |
| "rewards/code_reward": 0.5915465235710144, |
| "step": 1730, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.1189429596066475, |
| "clip_ratio/high_mean": 0.021151045989245176, |
| "clip_ratio/low_mean": 0.002452358941081911, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023603405337780714, |
| "completion_length": 69.71000289916992, |
| "epoch": 0.3342618384401114, |
| "grad_norm": 1720.8326416015625, |
| "kl": 0.7967777937650681, |
| "learning_rate": 7.742153311890971e-07, |
| "loss": 0.0982, |
| "reward": 1.5440645456314086, |
| "reward_std": 0.18595425188541412, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5292197823524475, |
| "step": 1740, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.08902034647762776, |
| "clip_ratio/high_mean": 0.012681722827255725, |
| "clip_ratio/low_mean": 0.00311334275174886, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015795065369457007, |
| "completion_length": 74.49249954223633, |
| "epoch": 0.3361828834886178, |
| "grad_norm": 0.09847641736268997, |
| "kl": 0.8014414094388485, |
| "learning_rate": 7.718558899613143e-07, |
| "loss": 0.0099, |
| "reward": 1.5567015647888183, |
| "reward_std": 0.14754890371114016, |
| "rewards/code_format_reward": 0.9649999976158142, |
| "rewards/code_reward": 0.5371007978916168, |
| "step": 1750, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.15779653917998077, |
| "clip_ratio/high_mean": 0.030520046106539668, |
| "clip_ratio/low_mean": 0.009007267560809851, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03952731299214065, |
| "completion_length": 77.64000091552734, |
| "epoch": 0.3381039285371242, |
| "grad_norm": 16.5263729095459, |
| "kl": 0.7359155111014843, |
| "learning_rate": 7.69488363458199e-07, |
| "loss": -0.0085, |
| "reward": 1.477712869644165, |
| "reward_std": 0.26145162880420686, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.49041891694068906, |
| "step": 1760, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.17542534926906228, |
| "clip_ratio/high_mean": 0.025472976046148687, |
| "clip_ratio/low_mean": 0.005083448148798198, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.030556425044778734, |
| "completion_length": 78.76000061035157, |
| "epoch": 0.3400249735856306, |
| "grad_norm": 2.440377950668335, |
| "kl": 1.2570879265666008, |
| "learning_rate": 7.671128379614524e-07, |
| "loss": -0.0029, |
| "reward": 1.697490382194519, |
| "reward_std": 0.21552397906780243, |
| "rewards/code_format_reward": 0.9887499809265137, |
| "rewards/code_reward": 0.6015576839447021, |
| "step": 1770, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.03777246242389083, |
| "clip_ratio/high_mean": 0.005805602658074349, |
| "clip_ratio/low_mean": 0.006219673785381019, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012025276734493672, |
| "completion_length": 78.01500091552734, |
| "epoch": 0.341946018634137, |
| "grad_norm": 3.58803129196167, |
| "kl": 1.3505164757370949, |
| "learning_rate": 7.647294000442899e-07, |
| "loss": -0.0008, |
| "reward": 1.3937680006027222, |
| "reward_std": 0.1832626909017563, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.44907149076461794, |
| "step": 1780, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.08561003021895885, |
| "clip_ratio/high_mean": 0.011109948102966883, |
| "clip_ratio/low_mean": 0.0035756964149186387, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014685644480050542, |
| "completion_length": 76.20749969482422, |
| "epoch": 0.34386706368264336, |
| "grad_norm": 10.503286361694336, |
| "kl": 0.552098847925663, |
| "learning_rate": 7.623381365682855e-07, |
| "loss": -0.0015, |
| "reward": 1.6644479036331177, |
| "reward_std": 0.22849067896604539, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5847239375114441, |
| "step": 1790, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.06243175007402897, |
| "clip_ratio/high_mean": 0.009089326043613255, |
| "clip_ratio/low_mean": 0.005161185140605084, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014250511419959366, |
| "completion_length": 69.70000076293945, |
| "epoch": 0.34578810873114973, |
| "grad_norm": 4.685351371765137, |
| "kl": 0.3103115826845169, |
| "learning_rate": 7.599391346802063e-07, |
| "loss": -0.0003, |
| "reward": 1.8390909910202027, |
| "reward_std": 0.20207120031118392, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6726704835891724, |
| "step": 1800, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.046840774989686904, |
| "clip_ratio/high_mean": 0.007519985581166111, |
| "clip_ratio/low_mean": 0.004676173024927266, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01219615869631525, |
| "completion_length": 80.15500183105469, |
| "epoch": 0.3477091537796561, |
| "grad_norm": 21886460.0, |
| "kl": 0.48781016543507577, |
| "learning_rate": 7.575324818088367e-07, |
| "loss": 517.7405, |
| "reward": 1.6558839797973632, |
| "reward_std": 0.2796541228890419, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5845044732093811, |
| "step": 1810, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.18512438922189176, |
| "clip_ratio/high_mean": 0.0357341198658105, |
| "clip_ratio/low_mean": 0.0033004880184307694, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03903460723813623, |
| "completion_length": 78.84000091552734, |
| "epoch": 0.34963019882816254, |
| "grad_norm": 9.198795318603516, |
| "kl": 4.244446061551571, |
| "learning_rate": 7.551182656617924e-07, |
| "loss": 0.0031, |
| "reward": 1.5848650455474853, |
| "reward_std": 0.17606763169169426, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.5458700299263001, |
| "step": 1820, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.07551750033162534, |
| "clip_ratio/high_mean": 0.013169253122759983, |
| "clip_ratio/low_mean": 0.001537335959437769, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014706589409615844, |
| "completion_length": 82.8800033569336, |
| "epoch": 0.3515512438766689, |
| "grad_norm": 0.724766731262207, |
| "kl": 0.9274087265133858, |
| "learning_rate": 7.526965742223234e-07, |
| "loss": 0.0013, |
| "reward": 1.5606717586517334, |
| "reward_std": 0.2877893716096878, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5343983888626098, |
| "step": 1830, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.1388332260772586, |
| "clip_ratio/high_mean": 0.021653852658346295, |
| "clip_ratio/low_mean": 0.008576209528837354, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03023006208240986, |
| "completion_length": 74.26250076293945, |
| "epoch": 0.3534722889251753, |
| "grad_norm": 5.426670074462891, |
| "kl": 0.7045004338026046, |
| "learning_rate": 7.502674957461079e-07, |
| "loss": -0.007, |
| "reward": 1.5688656568527222, |
| "reward_std": 0.30554552264511586, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5384953856468201, |
| "step": 1840, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.07899991576559842, |
| "clip_ratio/high_mean": 0.013301478006178513, |
| "clip_ratio/low_mean": 0.01124582380289212, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024547301628626884, |
| "completion_length": 74.45500106811524, |
| "epoch": 0.35539333397368167, |
| "grad_norm": 2.5104761123657227, |
| "kl": 0.6198086604475975, |
| "learning_rate": 7.478311187580363e-07, |
| "loss": -0.0071, |
| "reward": 1.5550098896026612, |
| "reward_std": 0.21109988391399384, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.52937992811203, |
| "step": 1850, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.063601403683424, |
| "clip_ratio/high_mean": 0.010639100335538387, |
| "clip_ratio/low_mean": 0.00778028266504407, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018419382628053427, |
| "completion_length": 71.92500152587891, |
| "epoch": 0.35731437902218804, |
| "grad_norm": 3.805928945541382, |
| "kl": 1.6179959252476692, |
| "learning_rate": 7.453875320489842e-07, |
| "loss": 0.3, |
| "reward": 1.4410953760147094, |
| "reward_std": 0.19501519501209258, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.47523519992828367, |
| "step": 1860, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.10860318611375988, |
| "clip_ratio/high_mean": 0.018746975070098416, |
| "clip_ratio/low_mean": 0.008747255423804745, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02749423016794026, |
| "completion_length": 69.9375015258789, |
| "epoch": 0.3592354240706945, |
| "grad_norm": 2.388782501220703, |
| "kl": 0.5952992506325245, |
| "learning_rate": 7.429368246725772e-07, |
| "loss": 0.0443, |
| "reward": 1.6972971916198731, |
| "reward_std": 0.17401356399059295, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6008361041545868, |
| "step": 1870, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.08630747124552726, |
| "clip_ratio/high_mean": 0.012746809562668205, |
| "clip_ratio/low_mean": 0.010304910433478653, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02305172006599605, |
| "completion_length": 70.83000183105469, |
| "epoch": 0.36115646911920085, |
| "grad_norm": 16.255178451538086, |
| "kl": 0.8730347856879235, |
| "learning_rate": 7.40479085941945e-07, |
| "loss": 0.0036, |
| "reward": 1.467816424369812, |
| "reward_std": 0.17535984218120576, |
| "rewards/code_format_reward": 0.9925000071525574, |
| "rewards/code_reward": 0.48578319549560545, |
| "step": 1880, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.26251301234588026, |
| "clip_ratio/high_mean": 0.03827818045392632, |
| "clip_ratio/low_mean": 0.005526873719645664, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04380505495937541, |
| "completion_length": 64.74750213623047, |
| "epoch": 0.3630775141677072, |
| "grad_norm": 4.061140060424805, |
| "kl": 0.8530658036470413, |
| "learning_rate": 7.380144054264669e-07, |
| "loss": 0.0197, |
| "reward": 1.498781108856201, |
| "reward_std": 0.17463037073612214, |
| "rewards/code_format_reward": 0.9600000023841858, |
| "rewards/code_reward": 0.509390527009964, |
| "step": 1890, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.24850870491936802, |
| "clip_ratio/high_mean": 0.04144583061570302, |
| "clip_ratio/low_mean": 0.00702623330289498, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04847206423291937, |
| "completion_length": 75.8375015258789, |
| "epoch": 0.3649985592162136, |
| "grad_norm": 3.4472062587738037, |
| "kl": 1.6324397973716258, |
| "learning_rate": 7.355428729485071e-07, |
| "loss": -0.001, |
| "reward": 1.6619214057922362, |
| "reward_std": 0.18103656098246573, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.5840856909751893, |
| "step": 1900, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.09173821061849594, |
| "clip_ratio/high_mean": 0.014921509474515916, |
| "clip_ratio/low_mean": 0.002157307107700035, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017078816797584294, |
| "completion_length": 62.185000610351565, |
| "epoch": 0.36691960426472003, |
| "grad_norm": 2.0225422382354736, |
| "kl": 184.02759787738324, |
| "learning_rate": 7.330645785801417e-07, |
| "loss": 2.9496, |
| "reward": 1.7410502433776855, |
| "reward_std": 0.10668236091732979, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6217751204967499, |
| "step": 1910, |
| "zero_std_ratio": 0.75 |
| }, |
| { |
| "clip_ratio/high_max": 0.16933906488120556, |
| "clip_ratio/high_mean": 0.02619449864141643, |
| "clip_ratio/low_mean": 0.014137339405715465, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04033183753490448, |
| "completion_length": 79.30000152587891, |
| "epoch": 0.3688406493132264, |
| "grad_norm": 2.6208443641662598, |
| "kl": 1.235317513346672, |
| "learning_rate": 7.305796126398758e-07, |
| "loss": -0.0012, |
| "reward": 1.5036948204040528, |
| "reward_std": 0.20645264089107512, |
| "rewards/code_format_reward": 0.9762499928474426, |
| "rewards/code_reward": 0.5077848553657531, |
| "step": 1920, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.2661599090555683, |
| "clip_ratio/high_mean": 0.03600101897318382, |
| "clip_ratio/low_mean": 0.009155643907433841, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.045156662538647654, |
| "completion_length": 78.10000152587891, |
| "epoch": 0.3707616943617328, |
| "grad_norm": 8.953734397888184, |
| "kl": 0.6204134523868561, |
| "learning_rate": 7.280880656893518e-07, |
| "loss": 0.0025, |
| "reward": 1.4915935516357421, |
| "reward_std": 0.2376121073961258, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.501109266281128, |
| "step": 1930, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.15203024838119744, |
| "clip_ratio/high_mean": 0.023713350854814054, |
| "clip_ratio/low_mean": 0.004282052081543952, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02799540273845196, |
| "completion_length": 74.42500076293945, |
| "epoch": 0.37268273941023916, |
| "grad_norm": 11.845942497253418, |
| "kl": 0.5031724810600281, |
| "learning_rate": 7.255900285300496e-07, |
| "loss": 0.5255, |
| "reward": 1.6400779724121093, |
| "reward_std": 0.22267285138368606, |
| "rewards/code_format_reward": 0.9649999856948852, |
| "rewards/code_reward": 0.5787889719009399, |
| "step": 1940, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.09135808227583767, |
| "clip_ratio/high_mean": 0.012801296508405358, |
| "clip_ratio/low_mean": 0.01690869364247192, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02970999013632536, |
| "completion_length": 69.52000198364257, |
| "epoch": 0.37460378445874554, |
| "grad_norm": 6.7441229820251465, |
| "kl": 1.2024895504117012, |
| "learning_rate": 7.230855921999769e-07, |
| "loss": 44.3651, |
| "reward": 1.6912511348724366, |
| "reward_std": 0.17418113350868225, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5981255412101746, |
| "step": 1950, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.07453169417567551, |
| "clip_ratio/high_mean": 0.009913802641676739, |
| "clip_ratio/low_mean": 0.003736039294744842, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013649841863662004, |
| "completion_length": 74.01250228881835, |
| "epoch": 0.37652482950725197, |
| "grad_norm": 4.616723537445068, |
| "kl": 0.6156632959842682, |
| "learning_rate": 7.205748479703515e-07, |
| "loss": -0.0005, |
| "reward": 1.846400761604309, |
| "reward_std": 0.17167636156082153, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6757004141807557, |
| "step": 1960, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.09189570704475045, |
| "clip_ratio/high_mean": 0.013587052945513278, |
| "clip_ratio/low_mean": 0.004667519498616457, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0182545724324882, |
| "completion_length": 64.46750030517578, |
| "epoch": 0.37844587455575834, |
| "grad_norm": 0.17748567461967468, |
| "kl": 0.4286219261586666, |
| "learning_rate": 7.180578873422757e-07, |
| "loss": -0.0046, |
| "reward": 1.612094521522522, |
| "reward_std": 0.10822201184928418, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.5576097548007966, |
| "step": 1970, |
| "zero_std_ratio": 0.725 |
| }, |
| { |
| "clip_ratio/high_max": 0.2088342323899269, |
| "clip_ratio/high_mean": 0.028434151923283933, |
| "clip_ratio/low_mean": 0.005974846053868532, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.034408997558057305, |
| "completion_length": 69.26750106811524, |
| "epoch": 0.3803669196042647, |
| "grad_norm": 6.238914966583252, |
| "kl": 0.7256933867931366, |
| "learning_rate": 7.155348020434001e-07, |
| "loss": -0.0046, |
| "reward": 1.469704508781433, |
| "reward_std": 0.24035734832286834, |
| "rewards/code_format_reward": 0.9799999833106995, |
| "rewards/code_reward": 0.4898522675037384, |
| "step": 1980, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.05789460870437324, |
| "clip_ratio/high_mean": 0.007717460609273985, |
| "clip_ratio/low_mean": 0.003460834617726505, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011178295104764402, |
| "completion_length": 70.19000244140625, |
| "epoch": 0.3822879646527711, |
| "grad_norm": 8.066108703613281, |
| "kl": 1.1788517452776432, |
| "learning_rate": 7.130056840245824e-07, |
| "loss": -0.0005, |
| "reward": 1.5026792764663697, |
| "reward_std": 0.2312860034406185, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5022771418094635, |
| "step": 1990, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.07602796133141965, |
| "clip_ratio/high_mean": 0.012856367122731171, |
| "clip_ratio/low_mean": 0.0035519548939191735, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016408322553616017, |
| "completion_length": 66.4625015258789, |
| "epoch": 0.38420900970127747, |
| "grad_norm": 3.559206962585449, |
| "kl": 1.225260878354311, |
| "learning_rate": 7.104706254565358e-07, |
| "loss": -0.003, |
| "reward": 1.742388916015625, |
| "reward_std": 0.12480423972010612, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.623069453239441, |
| "step": 2000, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.11271043051965535, |
| "clip_ratio/high_mean": 0.017727556044701488, |
| "clip_ratio/low_mean": 0.005613272835034877, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02334082857705653, |
| "completion_length": 77.1650001525879, |
| "epoch": 0.3861300547497839, |
| "grad_norm": 3.4077274799346924, |
| "kl": 0.8489379599690438, |
| "learning_rate": 7.07929718726469e-07, |
| "loss": 0.0403, |
| "reward": 1.5602745056152343, |
| "reward_std": 0.2609230324625969, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.5338872492313385, |
| "step": 2010, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.2993799396790564, |
| "clip_ratio/high_mean": 0.041865267558023334, |
| "clip_ratio/low_mean": 0.006948894041124731, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04881416228599846, |
| "completion_length": 74.0150016784668, |
| "epoch": 0.3880510997982903, |
| "grad_norm": 3.2043685913085938, |
| "kl": 6.086115422844887, |
| "learning_rate": 7.053830564347206e-07, |
| "loss": 2.2989, |
| "reward": 1.5310536623001099, |
| "reward_std": 0.19302123934030532, |
| "rewards/code_format_reward": 0.9837500095367432, |
| "rewards/code_reward": 0.5195893287658692, |
| "step": 2020, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.06591402762569487, |
| "clip_ratio/high_mean": 0.009311116795288399, |
| "clip_ratio/low_mean": 0.0017412514251191169, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01105236830189824, |
| "completion_length": 73.44250106811523, |
| "epoch": 0.38997214484679665, |
| "grad_norm": 2.137256622314453, |
| "kl": 3.9139866441488267, |
| "learning_rate": 7.028307313913838e-07, |
| "loss": 0.0061, |
| "reward": 1.8796703815460205, |
| "reward_std": 0.12868851274251938, |
| "rewards/code_format_reward": 0.9974999904632569, |
| "rewards/code_reward": 0.6904601573944091, |
| "step": 2030, |
| "zero_std_ratio": 0.775 |
| }, |
| { |
| "clip_ratio/high_max": 0.24738994101062417, |
| "clip_ratio/high_mean": 0.03705689987400547, |
| "clip_ratio/low_mean": 0.007423648721305654, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04448054819367826, |
| "completion_length": 67.09500198364258, |
| "epoch": 0.39189318989530303, |
| "grad_norm": 5.504507541656494, |
| "kl": 1.4878595262765884, |
| "learning_rate": 7.002728366129242e-07, |
| "loss": 0.0166, |
| "reward": 1.8640715599060058, |
| "reward_std": 0.22610510736703873, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.6870357990264893, |
| "step": 2040, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.09283696161583066, |
| "clip_ratio/high_mean": 0.014592013147193938, |
| "clip_ratio/low_mean": 0.0040809189551509915, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01867293259128928, |
| "completion_length": 72.88500137329102, |
| "epoch": 0.3938142349438094, |
| "grad_norm": 1.877032995223999, |
| "kl": 2.3534633785486223, |
| "learning_rate": 6.977094653187891e-07, |
| "loss": 0.3364, |
| "reward": 1.5182712078094482, |
| "reward_std": 0.19934598058462144, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5163230776786805, |
| "step": 2050, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.047755516692996026, |
| "clip_ratio/high_mean": 0.007312651420943439, |
| "clip_ratio/low_mean": 0.0007527987050707452, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008065450168214739, |
| "completion_length": 67.76500091552734, |
| "epoch": 0.39573527999231584, |
| "grad_norm": 1.7954281568527222, |
| "kl": 2.4329017847776413, |
| "learning_rate": 6.95140710928012e-07, |
| "loss": 206.5648, |
| "reward": 1.3761554956436157, |
| "reward_std": 0.21033956706523896, |
| "rewards/code_format_reward": 0.9762499928474426, |
| "rewards/code_reward": 0.44401525855064394, |
| "step": 2060, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.07075442476198077, |
| "clip_ratio/high_mean": 0.009443573304452002, |
| "clip_ratio/low_mean": 0.003901358728762716, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.013344932324253022, |
| "completion_length": 68.6150016784668, |
| "epoch": 0.3976563250408222, |
| "grad_norm": 1.3921815156936646, |
| "kl": 0.6283935949206352, |
| "learning_rate": 6.925666670558062e-07, |
| "loss": 1.5274, |
| "reward": 1.4756604433059692, |
| "reward_std": 0.2542987480759621, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.49189271330833434, |
| "step": 2070, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.09334485791623592, |
| "clip_ratio/high_mean": 0.015712386509403587, |
| "clip_ratio/low_mean": 0.005205962993204594, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02091834945604205, |
| "completion_length": 75.06750183105468, |
| "epoch": 0.3995773700893286, |
| "grad_norm": 1.3997697830200195, |
| "kl": 0.5330163806676864, |
| "learning_rate": 6.899874275101538e-07, |
| "loss": -0.0031, |
| "reward": 1.7522424459457397, |
| "reward_std": 0.1803124487400055, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.6286212205886841, |
| "step": 2080, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.08916364926844836, |
| "clip_ratio/high_mean": 0.014017748599871992, |
| "clip_ratio/low_mean": 0.003948131998186, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01796588017605245, |
| "completion_length": 78.19750213623047, |
| "epoch": 0.40149841513783496, |
| "grad_norm": 2296.336669921875, |
| "kl": 1.0256180852651595, |
| "learning_rate": 6.874030862883879e-07, |
| "loss": 0.0318, |
| "reward": 1.2450440883636475, |
| "reward_std": 0.22890471369028093, |
| "rewards/code_format_reward": 0.9775000095367432, |
| "rewards/code_reward": 0.3781470343470573, |
| "step": 2090, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.22327043637633323, |
| "clip_ratio/high_mean": 0.04789549903944135, |
| "clip_ratio/low_mean": 0.00559167112223804, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.053487171232700345, |
| "completion_length": 70.61250152587891, |
| "epoch": 0.4034194601863414, |
| "grad_norm": 3.2615253925323486, |
| "kl": 8.218332803249359, |
| "learning_rate": 6.848137375737652e-07, |
| "loss": 0.0058, |
| "reward": 1.6430699110031128, |
| "reward_std": 0.21420457661151887, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.5793474376201629, |
| "step": 2100, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.1533806946128607, |
| "clip_ratio/high_mean": 0.02256658235564828, |
| "clip_ratio/low_mean": 0.002787484592408873, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.025354066491127016, |
| "completion_length": 74.33999938964844, |
| "epoch": 0.40534050523484777, |
| "grad_norm": 4.315516471862793, |
| "kl": 1.0426696628332137, |
| "learning_rate": 6.822194757320354e-07, |
| "loss": 0.0019, |
| "reward": 1.6090970516204834, |
| "reward_std": 0.1758709292858839, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.5561110019683838, |
| "step": 2110, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.1336930485442281, |
| "clip_ratio/high_mean": 0.021989132883027195, |
| "clip_ratio/low_mean": 0.0070218192064203325, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0290109527297318, |
| "completion_length": 73.0250015258789, |
| "epoch": 0.40726155028335415, |
| "grad_norm": 18.143117904663086, |
| "kl": 0.4288759011775255, |
| "learning_rate": 6.796203953080007e-07, |
| "loss": 0.0005, |
| "reward": 1.72017080783844, |
| "reward_std": 0.22243313789367675, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6144603788852692, |
| "step": 2120, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.08061643750406802, |
| "clip_ratio/high_mean": 0.011467291257577016, |
| "clip_ratio/low_mean": 0.011395246715983376, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.022862538695335388, |
| "completion_length": 68.66250152587891, |
| "epoch": 0.4091825953318605, |
| "grad_norm": 1.0005404949188232, |
| "kl": 0.47304695919156076, |
| "learning_rate": 6.770165910220709e-07, |
| "loss": 0.0006, |
| "reward": 1.4831626653671264, |
| "reward_std": 0.1916220799088478, |
| "rewards/code_format_reward": 0.9837499856948853, |
| "rewards/code_reward": 0.4956438183784485, |
| "step": 2130, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.06393485199660062, |
| "clip_ratio/high_mean": 0.011905963439494372, |
| "clip_ratio/low_mean": 0.0023792986408807336, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01428526220843196, |
| "completion_length": 74.32250137329102, |
| "epoch": 0.4111036403803669, |
| "grad_norm": 2.491830825805664, |
| "kl": 2.213325946778059, |
| "learning_rate": 6.744081577668115e-07, |
| "loss": 0.1532, |
| "reward": 1.7680244207382203, |
| "reward_std": 0.18317916095256806, |
| "rewards/code_format_reward": 0.9687499880790711, |
| "rewards/code_reward": 0.6418246865272522, |
| "step": 2140, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.03965856842696667, |
| "clip_ratio/high_mean": 0.00730013819411397, |
| "clip_ratio/low_mean": 0.0031650666729547083, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010465204739011824, |
| "completion_length": 73.1050018310547, |
| "epoch": 0.41302468542887333, |
| "grad_norm": 0.353427916765213, |
| "kl": 0.2898652456700802, |
| "learning_rate": 6.717951906034856e-07, |
| "loss": -0.0015, |
| "reward": 1.6113624095916748, |
| "reward_std": 0.09930019937455654, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5591186702251434, |
| "step": 2150, |
| "zero_std_ratio": 0.725 |
| }, |
| { |
| "clip_ratio/high_max": 0.03382167350500822, |
| "clip_ratio/high_mean": 0.005409902473911643, |
| "clip_ratio/low_mean": 0.0024156818573828785, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007825584337115287, |
| "completion_length": 68.12750091552735, |
| "epoch": 0.4149457304773797, |
| "grad_norm": 3.9950575828552246, |
| "kl": 0.789361334592104, |
| "learning_rate": 6.691777847585883e-07, |
| "loss": 0.048, |
| "reward": 1.5698497295379639, |
| "reward_std": 0.1552659712731838, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.5417998552322387, |
| "step": 2160, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.026565033989027143, |
| "clip_ratio/high_mean": 0.004212364956038073, |
| "clip_ratio/low_mean": 0.0013839059392921627, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005596270889509469, |
| "completion_length": 70.80999984741212, |
| "epoch": 0.4168667755258861, |
| "grad_norm": 1.3910998106002808, |
| "kl": 1.4257395297288895, |
| "learning_rate": 6.665560356203784e-07, |
| "loss": 0.8731, |
| "reward": 1.4512264728546143, |
| "reward_std": 0.14117379933595658, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.47748821377754214, |
| "step": 2170, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.09546168451197445, |
| "clip_ratio/high_mean": 0.01459201174438931, |
| "clip_ratio/low_mean": 0.006060798710677773, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.020652810629690065, |
| "completion_length": 67.89000091552734, |
| "epoch": 0.41878782057439246, |
| "grad_norm": 0.6732813715934753, |
| "kl": 1.1321026906371117, |
| "learning_rate": 6.639300387353999e-07, |
| "loss": -0.0002, |
| "reward": 1.3501636981964111, |
| "reward_std": 0.21670444533228875, |
| "rewards/code_format_reward": 0.9924999833106994, |
| "rewards/code_reward": 0.42695685029029845, |
| "step": 2180, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.11407874876167626, |
| "clip_ratio/high_mean": 0.01725804756570142, |
| "clip_ratio/low_mean": 0.0015681478500482627, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018826195126166567, |
| "completion_length": 68.7525016784668, |
| "epoch": 0.42070886562289883, |
| "grad_norm": 1.5759879350662231, |
| "kl": 0.4211964398622513, |
| "learning_rate": 6.612998898050014e-07, |
| "loss": -0.0021, |
| "reward": 1.7485667228698731, |
| "reward_std": 0.16526954025030136, |
| "rewards/code_format_reward": 0.9612500071525574, |
| "rewards/code_reward": 0.6339708626270294, |
| "step": 2190, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.10751554854214192, |
| "clip_ratio/high_mean": 0.013745604571886361, |
| "clip_ratio/low_mean": 0.010064921525190585, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.023810526612214743, |
| "completion_length": 62.88500137329102, |
| "epoch": 0.42262991067140526, |
| "grad_norm": 2.4066975116729736, |
| "kl": 0.7549011036753654, |
| "learning_rate": 6.586656846818477e-07, |
| "loss": 0.2999, |
| "reward": 1.6932018756866456, |
| "reward_std": 0.1608109436929226, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5991009473800659, |
| "step": 2200, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.019488381547853352, |
| "clip_ratio/high_mean": 0.003436583065195009, |
| "clip_ratio/low_mean": 0.002801175639615394, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062377589056268334, |
| "completion_length": 72.55250244140625, |
| "epoch": 0.42455095571991164, |
| "grad_norm": 2.0696611404418945, |
| "kl": 5.306586292386055, |
| "learning_rate": 6.56027519366427e-07, |
| "loss": 0.011, |
| "reward": 1.611876368522644, |
| "reward_std": 0.1603232156485319, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.5596881568431854, |
| "step": 2210, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.05246827639639377, |
| "clip_ratio/high_mean": 0.00732308179140091, |
| "clip_ratio/low_mean": 0.0034836977836675944, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010806779406266287, |
| "completion_length": 64.31750183105468, |
| "epoch": 0.426472000768418, |
| "grad_norm": 0.12577353417873383, |
| "kl": 0.5850224502384662, |
| "learning_rate": 6.533854900035516e-07, |
| "loss": -0.0015, |
| "reward": 1.7735862731933594, |
| "reward_std": 0.13040905613452197, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6399181246757507, |
| "step": 2220, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.24315445288084447, |
| "clip_ratio/high_mean": 0.031706276966724546, |
| "clip_ratio/low_mean": 0.011593326600268484, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04329960328759626, |
| "completion_length": 72.80000152587891, |
| "epoch": 0.4283930458169244, |
| "grad_norm": 4.765016078948975, |
| "kl": 1.5887107208371163, |
| "learning_rate": 6.507396928788548e-07, |
| "loss": 0.0023, |
| "reward": 1.6477301597595215, |
| "reward_std": 0.12887158915400504, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5801151037216187, |
| "step": 2230, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.044854282308369874, |
| "clip_ratio/high_mean": 0.007485381804872304, |
| "clip_ratio/low_mean": 0.0028356918206554837, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01032107327482663, |
| "completion_length": 66.63000183105468, |
| "epoch": 0.4303140908654308, |
| "grad_norm": 1.5923104286193848, |
| "kl": 0.9431760296225548, |
| "learning_rate": 6.480902244152813e-07, |
| "loss": -0.0021, |
| "reward": 1.4723083972930908, |
| "reward_std": 0.13776133116334677, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.48865418434143065, |
| "step": 2240, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.08558401605114341, |
| "clip_ratio/high_mean": 0.01418596402509138, |
| "clip_ratio/low_mean": 0.005716345021210145, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019902308681048454, |
| "completion_length": 67.75250015258788, |
| "epoch": 0.4322351359139372, |
| "grad_norm": 4.213563442230225, |
| "kl": 0.7182839468121529, |
| "learning_rate": 6.454371811695732e-07, |
| "loss": -0.0032, |
| "reward": 1.5263491868972778, |
| "reward_std": 0.215225650370121, |
| "rewards/code_format_reward": 0.975000011920929, |
| "rewards/code_reward": 0.51942458152771, |
| "step": 2250, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.17924602022394537, |
| "clip_ratio/high_mean": 0.02314122476382181, |
| "clip_ratio/low_mean": 0.006780697987414897, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029921922995708884, |
| "completion_length": 67.31500091552735, |
| "epoch": 0.43415618096244357, |
| "grad_norm": 2.018653392791748, |
| "kl": 0.644180704653263, |
| "learning_rate": 6.427806598287522e-07, |
| "loss": -0.0031, |
| "reward": 1.8284268617630004, |
| "reward_std": 0.1590463936328888, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.6657759308815002, |
| "step": 2260, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.26562999933958054, |
| "clip_ratio/high_mean": 0.04080731603316963, |
| "clip_ratio/low_mean": 0.002605196795775555, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.043412512401118875, |
| "completion_length": 64.91250076293946, |
| "epoch": 0.43607722601094995, |
| "grad_norm": 2.8014633655548096, |
| "kl": 1.4193657219409943, |
| "learning_rate": 6.401207572065942e-07, |
| "loss": 0.0075, |
| "reward": 1.6795406818389893, |
| "reward_std": 0.1340640414506197, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.5913328170776367, |
| "step": 2270, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.15035496577620505, |
| "clip_ratio/high_mean": 0.021555275144055485, |
| "clip_ratio/low_mean": 0.007308500797080342, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028863775311037898, |
| "completion_length": 83.20750122070312, |
| "epoch": 0.4379982710594563, |
| "grad_norm": 5.3116655349731445, |
| "kl": 1.7165004715323449, |
| "learning_rate": 6.374575702401019e-07, |
| "loss": -0.0031, |
| "reward": 1.694450354576111, |
| "reward_std": 0.2935485541820526, |
| "rewards/code_format_reward": 0.9650000095367431, |
| "rewards/code_reward": 0.6059751749038697, |
| "step": 2280, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.05178634703624994, |
| "clip_ratio/high_mean": 0.007199086344917305, |
| "clip_ratio/low_mean": 0.004959188599605114, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012158275028923526, |
| "completion_length": 68.35500106811523, |
| "epoch": 0.43991931610796275, |
| "grad_norm": 11.67419719696045, |
| "kl": 0.8460408747196198, |
| "learning_rate": 6.347911959859725e-07, |
| "loss": -0.0013, |
| "reward": 1.6080287456512452, |
| "reward_std": 0.2270718976855278, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.5615143775939941, |
| "step": 2290, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.07622572851832957, |
| "clip_ratio/high_mean": 0.011604995708330535, |
| "clip_ratio/low_mean": 0.0013341609621420503, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012939156647189521, |
| "completion_length": 68.30750274658203, |
| "epoch": 0.44184036115646913, |
| "grad_norm": 332.7762451171875, |
| "kl": 0.7540152728557586, |
| "learning_rate": 6.321217316170599e-07, |
| "loss": 0.1015, |
| "reward": 1.4850183725357056, |
| "reward_std": 0.1393202841281891, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.49469670057296755, |
| "step": 2300, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.16746083926409483, |
| "clip_ratio/high_mean": 0.02103413282893598, |
| "clip_ratio/low_mean": 0.0068577720652683635, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.027891904639545828, |
| "completion_length": 64.55000152587891, |
| "epoch": 0.4437614062049755, |
| "grad_norm": 0.36056017875671387, |
| "kl": 0.4329931303858757, |
| "learning_rate": 6.294492744188335e-07, |
| "loss": 0.0002, |
| "reward": 1.4963040232658387, |
| "reward_std": 0.07247132882475853, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.502214539051056, |
| "step": 2310, |
| "zero_std_ratio": 0.725 |
| }, |
| { |
| "clip_ratio/high_max": 0.05429213300812989, |
| "clip_ratio/high_mean": 0.007803994990536012, |
| "clip_ratio/low_mean": 0.008226435555843636, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01603043078503106, |
| "completion_length": 69.78750228881836, |
| "epoch": 0.4456824512534819, |
| "grad_norm": 0.1676941215991974, |
| "kl": 0.276796979829669, |
| "learning_rate": 6.267739217858329e-07, |
| "loss": -0.0028, |
| "reward": 1.7269956827163697, |
| "reward_std": 0.1742506742477417, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6156853199005127, |
| "step": 2320, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.03961263382807374, |
| "clip_ratio/high_mean": 0.00831791803939268, |
| "clip_ratio/low_mean": 0.008615480939624831, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016933399019762874, |
| "completion_length": 70.70000228881835, |
| "epoch": 0.44760349630198826, |
| "grad_norm": 6.724217891693115, |
| "kl": 0.544577070325613, |
| "learning_rate": 6.240957712181186e-07, |
| "loss": -0.0041, |
| "reward": 1.3949034690856934, |
| "reward_std": 0.21950918734073638, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.45307670831680297, |
| "step": 2330, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.2168802363506984, |
| "clip_ratio/high_mean": 0.03705684195374488, |
| "clip_ratio/low_mean": 0.0028890643618069587, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03994590537622571, |
| "completion_length": 74.0625, |
| "epoch": 0.4495245413504947, |
| "grad_norm": 3.073554277420044, |
| "kl": 0.6307030320167542, |
| "learning_rate": 6.214149203177182e-07, |
| "loss": -0.0002, |
| "reward": 1.679004979133606, |
| "reward_std": 0.1860196329653263, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.5916899800300598, |
| "step": 2340, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0932310588657856, |
| "clip_ratio/high_mean": 0.014429462677799165, |
| "clip_ratio/low_mean": 0.0065534046734683216, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.020982867432758213, |
| "completion_length": 67.18000183105468, |
| "epoch": 0.45144558639900106, |
| "grad_norm": 3595.65576171875, |
| "kl": 1.140541896224022, |
| "learning_rate": 6.187314667850697e-07, |
| "loss": 0.1447, |
| "reward": 1.4676954984664916, |
| "reward_std": 0.20568167939782142, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.4875977456569672, |
| "step": 2350, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.036702672578394414, |
| "clip_ratio/high_mean": 0.006752843782305717, |
| "clip_ratio/low_mean": 0.008269340678816661, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015022184286499396, |
| "completion_length": 80.57000122070312, |
| "epoch": 0.45336663144750744, |
| "grad_norm": 2.759171724319458, |
| "kl": 10.568821829557418, |
| "learning_rate": 6.160455084154613e-07, |
| "loss": 1.8532, |
| "reward": 1.4545687198638917, |
| "reward_std": 0.23069845288991928, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.4813468337059021, |
| "step": 2360, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.08144733654335141, |
| "clip_ratio/high_mean": 0.014263840962667019, |
| "clip_ratio/low_mean": 0.0019872021744959056, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01625104369595647, |
| "completion_length": 71.55750045776367, |
| "epoch": 0.4552876764960138, |
| "grad_norm": 1.9088038206100464, |
| "kl": 1.3571255028247833, |
| "learning_rate": 6.133571430954667e-07, |
| "loss": 0.0026, |
| "reward": 1.5344175338745116, |
| "reward_std": 0.16607576459646226, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.5237712502479553, |
| "step": 2370, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.10026950519531966, |
| "clip_ratio/high_mean": 0.01315019663888961, |
| "clip_ratio/low_mean": 0.00221524270309601, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015365439187735318, |
| "completion_length": 72.40750122070312, |
| "epoch": 0.4572087215445202, |
| "grad_norm": 4.301158428192139, |
| "kl": 0.6290791854262352, |
| "learning_rate": 6.106664687993782e-07, |
| "loss": -0.0032, |
| "reward": 1.5749263525009156, |
| "reward_std": 0.16429235637187958, |
| "rewards/code_format_reward": 0.9724999785423278, |
| "rewards/code_reward": 0.5443381488323211, |
| "step": 2380, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.10308512919582427, |
| "clip_ratio/high_mean": 0.016338009486207738, |
| "clip_ratio/low_mean": 0.0017032683303114028, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018041277857264504, |
| "completion_length": 76.43750228881837, |
| "epoch": 0.4591297665930266, |
| "grad_norm": 6.198258876800537, |
| "kl": 408884378.2116049, |
| "learning_rate": 6.079735835856362e-07, |
| "loss": 1157747.0, |
| "reward": 1.5280384778976441, |
| "reward_std": 0.19424125757068395, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.517456728219986, |
| "step": 2390, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.24319633510895072, |
| "clip_ratio/high_mean": 0.037530579004669565, |
| "clip_ratio/low_mean": 0.004886501970031531, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04241708111949265, |
| "completion_length": 74.5425018310547, |
| "epoch": 0.461050811641533, |
| "grad_norm": 5.885474681854248, |
| "kl": 1.4351533338427545, |
| "learning_rate": 6.052785855932548e-07, |
| "loss": 0.123, |
| "reward": 1.4949720859527589, |
| "reward_std": 0.20392217636108398, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5002985119819641, |
| "step": 2400, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.23103972100652753, |
| "clip_ratio/high_mean": 0.0305588347138837, |
| "clip_ratio/low_mean": 0.002339675696566701, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03289851036388427, |
| "completion_length": 70.01750106811524, |
| "epoch": 0.4629718566900394, |
| "grad_norm": 0.8806352615356445, |
| "kl": 1.6503019407391548, |
| "learning_rate": 6.025815730382463e-07, |
| "loss": 0.8832, |
| "reward": 1.6588483333587647, |
| "reward_std": 0.19124363958835602, |
| "rewards/code_format_reward": 0.9725000143051148, |
| "rewards/code_reward": 0.5862991452217102, |
| "step": 2410, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.02757756725186482, |
| "clip_ratio/high_mean": 0.005332520017691422, |
| "clip_ratio/low_mean": 0.019763218611478804, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.025095738274103496, |
| "completion_length": 71.59250183105469, |
| "epoch": 0.46489290173854575, |
| "grad_norm": 1.2440141439437866, |
| "kl": 2.751401698589325, |
| "learning_rate": 5.998826442100412e-07, |
| "loss": 362174.725, |
| "reward": 1.5159764885902405, |
| "reward_std": 0.1902527991682291, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5129882216453552, |
| "step": 2420, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.2530499072512612, |
| "clip_ratio/high_mean": 0.03376921496528666, |
| "clip_ratio/low_mean": 0.0062858725665137175, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.04005508716509212, |
| "completion_length": 76.0425018310547, |
| "epoch": 0.4668139467870522, |
| "grad_norm": 66.4449234008789, |
| "kl": 2164149.3861157326, |
| "learning_rate": 5.971818974679065e-07, |
| "loss": 2449736.0, |
| "reward": 1.6650853157043457, |
| "reward_std": 0.24712301939725875, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.585355132818222, |
| "step": 2430, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.2950001623481512, |
| "clip_ratio/high_mean": 0.042542998865246776, |
| "clip_ratio/low_mean": 0.0068845050991512835, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.049427504313644025, |
| "completion_length": 75.27000198364257, |
| "epoch": 0.46873499183555856, |
| "grad_norm": 2.206911563873291, |
| "kl": 11.237105096876622, |
| "learning_rate": 5.944794312373607e-07, |
| "loss": 0.0298, |
| "reward": 1.7914002895355225, |
| "reward_std": 0.22826257348060608, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.649450159072876, |
| "step": 2440, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.07424517879262567, |
| "clip_ratio/high_mean": 0.010772422759328038, |
| "clip_ratio/low_mean": 0.010833968574297614, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02160639046342112, |
| "completion_length": 71.40500183105469, |
| "epoch": 0.47065603688406493, |
| "grad_norm": 76503500980224.0, |
| "kl": 393.06428125053645, |
| "learning_rate": 5.917753440065869e-07, |
| "loss": 909725593.6, |
| "reward": 1.4975883960723877, |
| "reward_std": 0.28928079828619957, |
| "rewards/code_format_reward": 0.9612499833106994, |
| "rewards/code_reward": 0.5084816813468933, |
| "step": 2450, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.14294563261792065, |
| "clip_ratio/high_mean": 0.019953654275741427, |
| "clip_ratio/low_mean": 0.004493102640844881, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024446757195983083, |
| "completion_length": 79.72250061035156, |
| "epoch": 0.4725770819325713, |
| "grad_norm": 0.778223991394043, |
| "kl": 2.2069298341870307, |
| "learning_rate": 5.89069734322844e-07, |
| "loss": -0.0085, |
| "reward": 1.5203648328781127, |
| "reward_std": 0.1896197520196438, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5173698782920837, |
| "step": 2460, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.0473501511849463, |
| "clip_ratio/high_mean": 0.006591684772865846, |
| "clip_ratio/low_mean": 0.0004718510695965961, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007063536025816575, |
| "completion_length": 76.70500183105469, |
| "epoch": 0.4744981269810777, |
| "grad_norm": 0.5978448390960693, |
| "kl": 0.6427325546741486, |
| "learning_rate": 5.863627007888745e-07, |
| "loss": 0.0007, |
| "reward": 1.7259918212890626, |
| "reward_std": 0.1515914086252451, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.618620878458023, |
| "step": 2470, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.048674007039517166, |
| "clip_ratio/high_mean": 0.010258768184576184, |
| "clip_ratio/low_mean": 0.012727768435433972, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.022986536961980164, |
| "completion_length": 78.06500244140625, |
| "epoch": 0.4764191720295841, |
| "grad_norm": 4.168500900268555, |
| "kl": 0.5699560895562172, |
| "learning_rate": 5.836543420593119e-07, |
| "loss": -0.0011, |
| "reward": 1.6060274362564086, |
| "reward_std": 0.2864475339651108, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.557388699054718, |
| "step": 2480, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.014699839102104307, |
| "clip_ratio/high_mean": 0.0019118846452329309, |
| "clip_ratio/low_mean": 0.0005179177765967325, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0024298023723531514, |
| "completion_length": 85.17000122070313, |
| "epoch": 0.4783402170780905, |
| "grad_norm": 4.149423599243164, |
| "kl": 1.3347756370902062, |
| "learning_rate": 5.809447568370843e-07, |
| "loss": 0.0102, |
| "reward": 1.621114158630371, |
| "reward_std": 0.21484595835208892, |
| "rewards/code_format_reward": 0.9774999856948853, |
| "rewards/code_reward": 0.5661820948123932, |
| "step": 2490, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.029943293944234027, |
| "clip_ratio/high_mean": 0.006927184848609613, |
| "clip_ratio/low_mean": 0.0035072380007477475, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010434422721300508, |
| "completion_length": 83.86250228881836, |
| "epoch": 0.48026126212659687, |
| "grad_norm": 5.97049617767334, |
| "kl": 4.178053397685289, |
| "learning_rate": 5.782340438698185e-07, |
| "loss": -0.0063, |
| "reward": 1.6789068222045898, |
| "reward_std": 0.25779220163822175, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5903908908367157, |
| "step": 2500, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.08102847400587052, |
| "clip_ratio/high_mean": 0.01392527524731122, |
| "clip_ratio/low_mean": 0.0045509199095249645, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018476195022230968, |
| "completion_length": 83.11250152587891, |
| "epoch": 0.48218230717510324, |
| "grad_norm": 5.283038139343262, |
| "kl": 1.111867392808199, |
| "learning_rate": 5.755223019462401e-07, |
| "loss": 17.941, |
| "reward": 1.577300524711609, |
| "reward_std": 0.22725088596343995, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5442752420902253, |
| "step": 2510, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.06463829884305597, |
| "clip_ratio/high_mean": 0.008858744835015387, |
| "clip_ratio/low_mean": 0.0054666692391037944, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01432541401591152, |
| "completion_length": 85.7625015258789, |
| "epoch": 0.4841033522236096, |
| "grad_norm": 8.200135231018066, |
| "kl": 0.4475974731147289, |
| "learning_rate": 5.728096298925745e-07, |
| "loss": -0.0057, |
| "reward": 1.5549763917922974, |
| "reward_std": 0.23400793820619584, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5331131994724274, |
| "step": 2520, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.031378739466890695, |
| "clip_ratio/high_mean": 0.00507326218066737, |
| "clip_ratio/low_mean": 0.010504274675622583, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015577536821365357, |
| "completion_length": 79.96750030517578, |
| "epoch": 0.48602439727211605, |
| "grad_norm": 2.6766583919525146, |
| "kl": 0.4622874528169632, |
| "learning_rate": 5.700961265689434e-07, |
| "loss": -0.0011, |
| "reward": 1.8167934179306031, |
| "reward_std": 0.30146218538284303, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.6621467113494873, |
| "step": 2530, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.07196793179027736, |
| "clip_ratio/high_mean": 0.013576928357360884, |
| "clip_ratio/low_mean": 0.0018521397636504845, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015429067742661572, |
| "completion_length": 87.03000030517578, |
| "epoch": 0.4879454423206224, |
| "grad_norm": 1.347899317741394, |
| "kl": 0.7047492057085037, |
| "learning_rate": 5.673818908657644e-07, |
| "loss": -0.0079, |
| "reward": 1.6893932342529296, |
| "reward_std": 0.24144218415021895, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.5981341004371643, |
| "step": 2540, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.03861275149974972, |
| "clip_ratio/high_mean": 0.005072360605117865, |
| "clip_ratio/low_mean": 0.0013027720240643248, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0063751326058991255, |
| "completion_length": 78.80750122070313, |
| "epoch": 0.4898664873691288, |
| "grad_norm": 1.7946380376815796, |
| "kl": 0.7765734851360321, |
| "learning_rate": 5.646670217001451e-07, |
| "loss": 0.004, |
| "reward": 1.8638887882232666, |
| "reward_std": 0.1732952728867531, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6831943988800049, |
| "step": 2550, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.044772564456798135, |
| "clip_ratio/high_mean": 0.008199371959199198, |
| "clip_ratio/low_mean": 0.007188984929234721, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015388356836047024, |
| "completion_length": 92.03750305175781, |
| "epoch": 0.4917875324176352, |
| "grad_norm": 8241.9619140625, |
| "kl": 3.7090125039219854, |
| "learning_rate": 5.619516180122789e-07, |
| "loss": 0.2194, |
| "reward": 1.346347188949585, |
| "reward_std": 0.3114967554807663, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.4303610801696777, |
| "step": 2560, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.136108908476308, |
| "clip_ratio/high_mean": 0.01777363264700398, |
| "clip_ratio/low_mean": 0.0005986301795928739, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018372263037599625, |
| "completion_length": 77.66500091552734, |
| "epoch": 0.4937085774661416, |
| "grad_norm": 2.8724048137664795, |
| "kl": 0.30402788892388344, |
| "learning_rate": 5.592357787618398e-07, |
| "loss": -0.0095, |
| "reward": 1.235116672515869, |
| "reward_std": 0.16121466904878617, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.3706833332777023, |
| "step": 2570, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.21411730786785482, |
| "clip_ratio/high_mean": 0.02751181152416393, |
| "clip_ratio/low_mean": 0.005141469169757329, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03265328073175624, |
| "completion_length": 77.39250030517579, |
| "epoch": 0.495629622514648, |
| "grad_norm": 3.1119463443756104, |
| "kl": 0.516096468269825, |
| "learning_rate": 5.565196029243746e-07, |
| "loss": -0.0097, |
| "reward": 1.7056148529052735, |
| "reward_std": 0.26717675626277926, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.6065573751926422, |
| "step": 2580, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.06655758274719119, |
| "clip_ratio/high_mean": 0.00869752592407167, |
| "clip_ratio/low_mean": 0.0007154849590733647, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009413010929711163, |
| "completion_length": 78.71000061035156, |
| "epoch": 0.49755066756315436, |
| "grad_norm": 9.566883087158203, |
| "kl": 6.985853771865368, |
| "learning_rate": 5.538031894876971e-07, |
| "loss": 0.0154, |
| "reward": 1.8047074317932128, |
| "reward_std": 0.2406391829252243, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.655791187286377, |
| "step": 2590, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.04153572088107467, |
| "clip_ratio/high_mean": 0.0076960999285802245, |
| "clip_ratio/low_mean": 0.00053562533139484, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008231725194491446, |
| "completion_length": 87.35249938964844, |
| "epoch": 0.49947171261166073, |
| "grad_norm": 4.163487911224365, |
| "kl": 3.02228729724884, |
| "learning_rate": 5.510866374482799e-07, |
| "loss": 0.0014, |
| "reward": 1.7271404266357422, |
| "reward_std": 0.20059744864702225, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6170076906681061, |
| "step": 2600, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.09242721796035766, |
| "clip_ratio/high_mean": 0.01333312913775444, |
| "clip_ratio/low_mean": 0.0022988114287727512, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01563194077461958, |
| "completion_length": 86.66250152587891, |
| "epoch": 0.5013927576601671, |
| "grad_norm": 1.7816847562789917, |
| "kl": 2.135231140255928, |
| "learning_rate": 5.48370045807647e-07, |
| "loss": -0.0043, |
| "reward": 1.5687429666519166, |
| "reward_std": 0.22490316033363342, |
| "rewards/code_format_reward": 0.9524999976158142, |
| "rewards/code_reward": 0.5462464988231659, |
| "step": 2610, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.10432031177915632, |
| "clip_ratio/high_mean": 0.01704162026871927, |
| "clip_ratio/low_mean": 0.0019667694039526397, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01900838967994787, |
| "completion_length": 98.58000183105469, |
| "epoch": 0.5033138027086735, |
| "grad_norm": 2.1069369316101074, |
| "kl": 2.131927290558815, |
| "learning_rate": 5.456535135687656e-07, |
| "loss": -0.0069, |
| "reward": 1.6628828048706055, |
| "reward_std": 0.23133169412612914, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.5858163475990296, |
| "step": 2620, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.04196612173691392, |
| "clip_ratio/high_mean": 0.0064837948535569016, |
| "clip_ratio/low_mean": 0.0025595034239813685, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009043298207689076, |
| "completion_length": 86.6, |
| "epoch": 0.5052348477571799, |
| "grad_norm": 15.670801162719727, |
| "kl": 2.1330361180007458, |
| "learning_rate": 5.429371397324378e-07, |
| "loss": -0.0054, |
| "reward": 1.4884859561920165, |
| "reward_std": 0.3388957381248474, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.497055447101593, |
| "step": 2630, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.036724881688132885, |
| "clip_ratio/high_mean": 0.005309284973191097, |
| "clip_ratio/low_mean": 0.003665669827023521, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00897495478275232, |
| "completion_length": 84.73250122070313, |
| "epoch": 0.5071558928056863, |
| "grad_norm": 6.480928421020508, |
| "kl": 0.9241176024079323, |
| "learning_rate": 5.402210232936934e-07, |
| "loss": -0.0009, |
| "reward": 1.792254877090454, |
| "reward_std": 0.29597480297088624, |
| "rewards/code_format_reward": 0.9974999904632569, |
| "rewards/code_reward": 0.646752405166626, |
| "step": 2640, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.109601711621508, |
| "clip_ratio/high_mean": 0.015215938963228837, |
| "clip_ratio/low_mean": 0.0034228902019094675, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.018638829072006046, |
| "completion_length": 88.82750244140625, |
| "epoch": 0.5090769378541927, |
| "grad_norm": 5.080334186553955, |
| "kl": 0.6404796183109284, |
| "learning_rate": 5.37505263238181e-07, |
| "loss": -0.0032, |
| "reward": 1.7266260623931884, |
| "reward_std": 0.27733459770679475, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.6148755311965942, |
| "step": 2650, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0589832967845723, |
| "clip_ratio/high_mean": 0.009531341239926406, |
| "clip_ratio/low_mean": 0.00046608211705461143, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00999742336862255, |
| "completion_length": 88.69250183105468, |
| "epoch": 0.510997982902699, |
| "grad_norm": 7.949027061462402, |
| "kl": 0.6428510576486588, |
| "learning_rate": 5.347899585385619e-07, |
| "loss": -0.0028, |
| "reward": 1.8208046436309815, |
| "reward_std": 0.32592435777187345, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6647772669792176, |
| "step": 2660, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.15465617645531893, |
| "clip_ratio/high_mean": 0.022943795099854468, |
| "clip_ratio/low_mean": 0.0016213681577937678, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.024565163621446118, |
| "completion_length": 87.13250274658203, |
| "epoch": 0.5129190279512055, |
| "grad_norm": 34.059959411621094, |
| "kl": 0.5654895901679993, |
| "learning_rate": 5.320752081509019e-07, |
| "loss": -0.0048, |
| "reward": 1.7013320207595826, |
| "reward_std": 0.27322621941566466, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6037909984588623, |
| "step": 2670, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.04982744911685586, |
| "clip_ratio/high_mean": 0.007483145385049283, |
| "clip_ratio/low_mean": 0.0010201202865573577, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008503265725448728, |
| "completion_length": 91.09750213623047, |
| "epoch": 0.5148400729997118, |
| "grad_norm": 3.5055086612701416, |
| "kl": 0.5736653476953506, |
| "learning_rate": 5.293611110110661e-07, |
| "loss": -0.0032, |
| "reward": 1.672940969467163, |
| "reward_std": 0.24722242057323457, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.5902204990386963, |
| "step": 2680, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.06388061246834695, |
| "clip_ratio/high_mean": 0.008427212300011888, |
| "clip_ratio/low_mean": 0.000504276818537619, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008931488974485546, |
| "completion_length": 84.06750183105468, |
| "epoch": 0.5167611180482182, |
| "grad_norm": 1.1749032735824585, |
| "kl": 0.6257337100803853, |
| "learning_rate": 5.266477660311123e-07, |
| "loss": -0.0049, |
| "reward": 1.883350706100464, |
| "reward_std": 0.1923319399356842, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6929253697395324, |
| "step": 2690, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.032716090651229025, |
| "clip_ratio/high_mean": 0.004722256149398163, |
| "clip_ratio/low_mean": 0.00025295682498835956, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0049752129940316085, |
| "completion_length": 101.05250244140625, |
| "epoch": 0.5186821630967247, |
| "grad_norm": 2.250870704650879, |
| "kl": 0.3336128618568182, |
| "learning_rate": 5.239352720956869e-07, |
| "loss": -0.0014, |
| "reward": 1.803996729850769, |
| "reward_std": 0.3182943195104599, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.6548108696937561, |
| "step": 2700, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.057230279734358194, |
| "clip_ratio/high_mean": 0.01016470161266625, |
| "clip_ratio/low_mean": 0.001601585964090191, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01176628761459142, |
| "completion_length": 92.2125015258789, |
| "epoch": 0.520603208145231, |
| "grad_norm": 1.7528822422027588, |
| "kl": 0.30482072457671167, |
| "learning_rate": 5.212237280584214e-07, |
| "loss": -0.0012, |
| "reward": 1.6862072706222535, |
| "reward_std": 0.2419889122247696, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5971661269664764, |
| "step": 2710, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.060608417179901154, |
| "clip_ratio/high_mean": 0.00894762706157053, |
| "clip_ratio/low_mean": 0.0007068538383464329, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009654480942117516, |
| "completion_length": 92.53000183105469, |
| "epoch": 0.5225242531937374, |
| "grad_norm": 274.7859802246094, |
| "kl": 1.1552282243967056, |
| "learning_rate": 5.185132327383284e-07, |
| "loss": 0.1157, |
| "reward": 1.7673757076263428, |
| "reward_std": 0.3102965742349625, |
| "rewards/code_format_reward": 0.9887499809265137, |
| "rewards/code_reward": 0.6365003228187561, |
| "step": 2720, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.15797494337894022, |
| "clip_ratio/high_mean": 0.02083307456341572, |
| "clip_ratio/low_mean": 0.009061275536078028, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02989435092313215, |
| "completion_length": 88.4500015258789, |
| "epoch": 0.5244452982422437, |
| "grad_norm": 4.456059455871582, |
| "kl": 1.3563814774155616, |
| "learning_rate": 5.158038849162024e-07, |
| "loss": 0.0014, |
| "reward": 1.5090751886367797, |
| "reward_std": 0.23531495928764343, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.5098500728607178, |
| "step": 2730, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.054899439518339935, |
| "clip_ratio/high_mean": 0.008722224002121947, |
| "clip_ratio/low_mean": 0.0002903619286371395, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009012586006429046, |
| "completion_length": 85.53750305175781, |
| "epoch": 0.5263663432907502, |
| "grad_norm": 1.951745867729187, |
| "kl": 0.5144835211336612, |
| "learning_rate": 5.130957833310177e-07, |
| "loss": -0.0017, |
| "reward": 1.7648489713668822, |
| "reward_std": 0.1646851196885109, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6355494737625123, |
| "step": 2740, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.10675688227638602, |
| "clip_ratio/high_mean": 0.016114802553784103, |
| "clip_ratio/low_mean": 0.0011672286826069466, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0172820313135162, |
| "completion_length": 97.54000244140624, |
| "epoch": 0.5282873883392566, |
| "grad_norm": 2.7782888412475586, |
| "kl": 0.484642443805933, |
| "learning_rate": 5.103890266763317e-07, |
| "loss": -0.0017, |
| "reward": 1.7005881071090698, |
| "reward_std": 0.17179570347070694, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6046690165996551, |
| "step": 2750, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.04718098001321778, |
| "clip_ratio/high_mean": 0.0069286267200368455, |
| "clip_ratio/low_mean": 0.0022616338639636522, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009190260457398836, |
| "completion_length": 91.71500091552734, |
| "epoch": 0.5302084333877629, |
| "grad_norm": 1.6982417106628418, |
| "kl": 0.40430613309144975, |
| "learning_rate": 5.076837135966868e-07, |
| "loss": -0.0001, |
| "reward": 1.7166170120239257, |
| "reward_std": 0.12425057031214237, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.6111209750175476, |
| "step": 2760, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.01563614197075367, |
| "clip_ratio/high_mean": 0.0026440696616191416, |
| "clip_ratio/low_mean": 0.0005518664722330869, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0031959360814653335, |
| "completion_length": 93.39000091552734, |
| "epoch": 0.5321294784362693, |
| "grad_norm": 0.12160471081733704, |
| "kl": 0.3728202864527702, |
| "learning_rate": 5.049799426840166e-07, |
| "loss": -0.0008, |
| "reward": 1.8690509557724, |
| "reward_std": 0.20764816105365752, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6889004349708557, |
| "step": 2770, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.08729656506329775, |
| "clip_ratio/high_mean": 0.013787648268043995, |
| "clip_ratio/low_mean": 0.0016946192088653333, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015482267551124095, |
| "completion_length": 83.49000091552735, |
| "epoch": 0.5340505234847757, |
| "grad_norm": 2.061514377593994, |
| "kl": 0.2805942878127098, |
| "learning_rate": 5.02277812474052e-07, |
| "loss": -0.0005, |
| "reward": 1.5558062076568604, |
| "reward_std": 0.18851915150880813, |
| "rewards/code_format_reward": 0.9924999833106994, |
| "rewards/code_reward": 0.529778128862381, |
| "step": 2780, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.047503374773077665, |
| "clip_ratio/high_mean": 0.007121381178149022, |
| "clip_ratio/low_mean": 0.003943280148087069, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011064661786076613, |
| "completion_length": 90.33500213623047, |
| "epoch": 0.5359715685332821, |
| "grad_norm": 2.8578014373779297, |
| "kl": 0.9348004341125489, |
| "learning_rate": 4.995774214427299e-07, |
| "loss": -0.0083, |
| "reward": 1.5787676095962524, |
| "reward_std": 0.24208036959171295, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.5412587821483612, |
| "step": 2790, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.0682177669601515, |
| "clip_ratio/high_mean": 0.010770523789688013, |
| "clip_ratio/low_mean": 0.0030192029429599644, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01378972665697802, |
| "completion_length": 97.35500030517578, |
| "epoch": 0.5378926135817885, |
| "grad_norm": 3.7523272037506104, |
| "kl": 0.49133365601301193, |
| "learning_rate": 4.968788680026062e-07, |
| "loss": 0.0019, |
| "reward": 1.8675085306167603, |
| "reward_std": 0.3084888607263565, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.6887542605400085, |
| "step": 2800, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.051895615691319105, |
| "clip_ratio/high_mean": 0.007092137623112648, |
| "clip_ratio/low_mean": 0.0009049189888173714, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007997056643944234, |
| "completion_length": 84.84000244140626, |
| "epoch": 0.5398136586302948, |
| "grad_norm": 6879.29296875, |
| "kl": 41.48030465692282, |
| "learning_rate": 4.941822504992665e-07, |
| "loss": 0.3058, |
| "reward": 1.8456867456436157, |
| "reward_std": 0.17148398756980895, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.6750308394432067, |
| "step": 2810, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.055018832255154845, |
| "clip_ratio/high_mean": 0.009382021031342447, |
| "clip_ratio/low_mean": 0.0012206103128846735, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0106026312103495, |
| "completion_length": 92.16250152587891, |
| "epoch": 0.5417347036788013, |
| "grad_norm": 1.7369046211242676, |
| "kl": 39.203208688646555, |
| "learning_rate": 4.914876672077444e-07, |
| "loss": 0.0739, |
| "reward": 1.7605399131774901, |
| "reward_std": 0.22667703181505203, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6337074398994446, |
| "step": 2820, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.07783090919256211, |
| "clip_ratio/high_mean": 0.013414820143952965, |
| "clip_ratio/low_mean": 0.004346576618263498, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017761396686546506, |
| "completion_length": 86.75750122070312, |
| "epoch": 0.5436557487273077, |
| "grad_norm": 1.3669426441192627, |
| "kl": 0.6254852950572968, |
| "learning_rate": 4.887952163289387e-07, |
| "loss": -0.0037, |
| "reward": 1.7524815320968627, |
| "reward_std": 0.18003067299723624, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.6271782517433167, |
| "step": 2830, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.032230034773238006, |
| "clip_ratio/high_mean": 0.005299290179391391, |
| "clip_ratio/low_mean": 0.002357826306251809, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007657116468180902, |
| "completion_length": 92.63250122070312, |
| "epoch": 0.545576793775814, |
| "grad_norm": 6.607495307922363, |
| "kl": 0.6308505192399025, |
| "learning_rate": 4.861049959860352e-07, |
| "loss": -0.0026, |
| "reward": 1.879476284980774, |
| "reward_std": 0.21936110258102418, |
| "rewards/code_format_reward": 0.9787499785423279, |
| "rewards/code_reward": 0.6950506567955017, |
| "step": 2840, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.07762792855501174, |
| "clip_ratio/high_mean": 0.012513539101928473, |
| "clip_ratio/low_mean": 0.0019065381304244511, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014420077262911946, |
| "completion_length": 79.06750183105468, |
| "epoch": 0.5474978388243205, |
| "grad_norm": 2.1427297592163086, |
| "kl": 0.7649303644895553, |
| "learning_rate": 4.834171042209299e-07, |
| "loss": -0.0016, |
| "reward": 1.7679643869400024, |
| "reward_std": 0.2242477983236313, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.640857207775116, |
| "step": 2850, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.10583647433668375, |
| "clip_ratio/high_mean": 0.015110303135588764, |
| "clip_ratio/low_mean": 0.0026529163093073293, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017763219110202046, |
| "completion_length": 89.09499969482422, |
| "epoch": 0.5494188838728268, |
| "grad_norm": 5.39391565322876, |
| "kl": 1.1404950305819512, |
| "learning_rate": 4.807316389906573e-07, |
| "loss": 0.0011, |
| "reward": 1.6588359355926514, |
| "reward_std": 0.23765334486961365, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5822304427623749, |
| "step": 2860, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.13843556838110088, |
| "clip_ratio/high_mean": 0.022050847904756664, |
| "clip_ratio/low_mean": 0.006549120438285172, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.028599968180060387, |
| "completion_length": 85.02750091552734, |
| "epoch": 0.5513399289213332, |
| "grad_norm": 6.328859329223633, |
| "kl": 1.3457766875624657, |
| "learning_rate": 4.780486981638194e-07, |
| "loss": 0.004, |
| "reward": 1.4554174661636352, |
| "reward_std": 0.291735103726387, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.4839587390422821, |
| "step": 2870, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.03891797037795186, |
| "clip_ratio/high_mean": 0.005045192840043455, |
| "clip_ratio/low_mean": 0.0029750549525488167, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008020247751846909, |
| "completion_length": 85.00500183105468, |
| "epoch": 0.5532609739698396, |
| "grad_norm": 3.746497392654419, |
| "kl": 1.5130328834056854, |
| "learning_rate": 4.75368379517019e-07, |
| "loss": -0.0033, |
| "reward": 1.8564167737960815, |
| "reward_std": 0.14603331089019775, |
| "rewards/code_format_reward": 0.9987499952316284, |
| "rewards/code_reward": 0.6785208344459533, |
| "step": 2880, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.23034826815128326, |
| "clip_ratio/high_mean": 0.037423617928288876, |
| "clip_ratio/low_mean": 0.0014674156729597599, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03889103210531175, |
| "completion_length": 81.40750122070312, |
| "epoch": 0.555182019018346, |
| "grad_norm": 7.282290458679199, |
| "kl": 0.5326755799353122, |
| "learning_rate": 4.7269078073129696e-07, |
| "loss": 0.0032, |
| "reward": 1.700506567955017, |
| "reward_std": 0.3424434006214142, |
| "rewards/code_format_reward": 0.9700000047683716, |
| "rewards/code_reward": 0.6077533006668091, |
| "step": 2890, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.06711816978640854, |
| "clip_ratio/high_mean": 0.008929741784231737, |
| "clip_ratio/low_mean": 0.0021304662863258273, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011060208058916032, |
| "completion_length": 75.07750244140625, |
| "epoch": 0.5571030640668524, |
| "grad_norm": 3.718710422515869, |
| "kl": 0.3807241953909397, |
| "learning_rate": 4.7001599938857204e-07, |
| "loss": -0.0016, |
| "reward": 1.6593467235565185, |
| "reward_std": 0.2742844566702843, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.5821733415126801, |
| "step": 2900, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.10134089784696698, |
| "clip_ratio/high_mean": 0.014033923938404769, |
| "clip_ratio/low_mean": 0.0036910680413711817, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01772499195067212, |
| "completion_length": 74.57250061035157, |
| "epoch": 0.5590241091153587, |
| "grad_norm": 18.590866088867188, |
| "kl": 0.9125221639871597, |
| "learning_rate": 4.673441329680844e-07, |
| "loss": 0.0044, |
| "reward": 1.6198436498641968, |
| "reward_std": 0.1470041409134865, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.5621092915534973, |
| "step": 2910, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.042256729071959855, |
| "clip_ratio/high_mean": 0.007948629459133372, |
| "clip_ratio/low_mean": 0.001496748169302009, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009445377223892137, |
| "completion_length": 77.4625015258789, |
| "epoch": 0.5609451541638651, |
| "grad_norm": 0.18645010888576508, |
| "kl": 0.4780749522149563, |
| "learning_rate": 4.6467527884284365e-07, |
| "loss": 0.0006, |
| "reward": 1.8204985857009888, |
| "reward_std": 0.19856311585754155, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.6649368166923523, |
| "step": 2920, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.1271044396329671, |
| "clip_ratio/high_mean": 0.016186495171859862, |
| "clip_ratio/low_mean": 0.0011034044640837238, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01728989938274026, |
| "completion_length": 82.72500305175781, |
| "epoch": 0.5628661992123716, |
| "grad_norm": 6.4396162033081055, |
| "kl": 0.30610462203621863, |
| "learning_rate": 4.6200953427607927e-07, |
| "loss": -0.0021, |
| "reward": 1.7915108680725098, |
| "reward_std": 0.22729050666093825, |
| "rewards/code_format_reward": 0.9700000047683716, |
| "rewards/code_reward": 0.6532554149627685, |
| "step": 2930, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.23429102210793645, |
| "clip_ratio/high_mean": 0.03006269016477745, |
| "clip_ratio/low_mean": 0.001874277341994457, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0319369669421576, |
| "completion_length": 88.97500152587891, |
| "epoch": 0.5647872442608779, |
| "grad_norm": 43.83531951904297, |
| "kl": 0.5952823750674725, |
| "learning_rate": 4.5934699641769747e-07, |
| "loss": -0.0032, |
| "reward": 1.837431001663208, |
| "reward_std": 0.3392215400934219, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.6730904817581177, |
| "step": 2940, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.06733583421446383, |
| "clip_ratio/high_mean": 0.00863017894444056, |
| "clip_ratio/low_mean": 0.005618994176620618, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014249173022108153, |
| "completion_length": 77.96000061035156, |
| "epoch": 0.5667082893093843, |
| "grad_norm": 2.642043352127075, |
| "kl": 0.56968834400177, |
| "learning_rate": 4.566877623007389e-07, |
| "loss": 0.0049, |
| "reward": 1.7328413248062133, |
| "reward_std": 0.21620932817459107, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.6229831516742707, |
| "step": 2950, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.04462944087572396, |
| "clip_ratio/high_mean": 0.007475414098007604, |
| "clip_ratio/low_mean": 0.002004683316772571, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009480097430059686, |
| "completion_length": 85.0875015258789, |
| "epoch": 0.5686293343578906, |
| "grad_norm": 3.8512065410614014, |
| "kl": 0.33709155321121215, |
| "learning_rate": 4.540319288378439e-07, |
| "loss": -0.0057, |
| "reward": 1.6900140762329101, |
| "reward_std": 0.21961634010076522, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.5978195071220398, |
| "step": 2960, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.06556220971979201, |
| "clip_ratio/high_mean": 0.01001431758631952, |
| "clip_ratio/low_mean": 0.003507485325098969, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01352180291141849, |
| "completion_length": 92.67500152587891, |
| "epoch": 0.5705503794063971, |
| "grad_norm": 2.966658592224121, |
| "kl": 0.5968067184090614, |
| "learning_rate": 4.513795928177193e-07, |
| "loss": 0.0007, |
| "reward": 1.4343469619750977, |
| "reward_std": 0.16000542044639587, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.4681109845638275, |
| "step": 2970, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.006220952537842095, |
| "clip_ratio/high_mean": 0.0009992636245442555, |
| "clip_ratio/low_mean": 0.0029718225210672244, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0039710860582999885, |
| "completion_length": 92.65750274658203, |
| "epoch": 0.5724714244549035, |
| "grad_norm": 9.493338584899902, |
| "kl": 0.5875692501664161, |
| "learning_rate": 4.4873085090161266e-07, |
| "loss": -0.0009, |
| "reward": 1.4061829090118407, |
| "reward_std": 0.20027331858873368, |
| "rewards/code_format_reward": 0.9762499928474426, |
| "rewards/code_reward": 0.45902894139289857, |
| "step": 2980, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.0330589919583872, |
| "clip_ratio/high_mean": 0.004416179939289578, |
| "clip_ratio/low_mean": 0.002115111546299886, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006531291425926611, |
| "completion_length": 79.80750274658203, |
| "epoch": 0.5743924695034098, |
| "grad_norm": 1.592423915863037, |
| "kl": 0.6846940219402313, |
| "learning_rate": 4.460857996197879e-07, |
| "loss": -0.0088, |
| "reward": 1.8656628370285033, |
| "reward_std": 0.24907293021678925, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6850189208984375, |
| "step": 2990, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.10685318629257382, |
| "clip_ratio/high_mean": 0.014238242123974487, |
| "clip_ratio/low_mean": 0.0005060926268924959, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014744334877468646, |
| "completion_length": 75.28500213623047, |
| "epoch": 0.5763135145519163, |
| "grad_norm": 11.023285865783691, |
| "kl": 1.773244822025299, |
| "learning_rate": 4.434445353680084e-07, |
| "loss": -0.0004, |
| "reward": 1.6719849348068236, |
| "reward_std": 0.23447352051734924, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5888049364089966, |
| "step": 3000, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.09909297195263207, |
| "clip_ratio/high_mean": 0.014624686987372116, |
| "clip_ratio/low_mean": 0.0008105992455966771, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015435286390129477, |
| "completion_length": 80.88750228881835, |
| "epoch": 0.5782345596004226, |
| "grad_norm": 3.5932817459106445, |
| "kl": 1.2866470351815225, |
| "learning_rate": 4.4080715440402417e-07, |
| "loss": 0.0028, |
| "reward": 1.7477641582489014, |
| "reward_std": 0.27256832718849183, |
| "rewards/code_format_reward": 0.9800000071525574, |
| "rewards/code_reward": 0.628882086277008, |
| "step": 3010, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.061281491769477725, |
| "clip_ratio/high_mean": 0.008347922342363746, |
| "clip_ratio/low_mean": 0.00354889674927108, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011896818911191076, |
| "completion_length": 75.85250091552734, |
| "epoch": 0.580155604648929, |
| "grad_norm": 4.849332809448242, |
| "kl": 0.476963010430336, |
| "learning_rate": 4.381737528440624e-07, |
| "loss": -0.0002, |
| "reward": 1.5080678462982178, |
| "reward_std": 0.1984383262693882, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.5099714159965515, |
| "step": 3020, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.02014698493294418, |
| "clip_ratio/high_mean": 0.0029024946445133535, |
| "clip_ratio/low_mean": 0.001273224765463965, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0041757194034289565, |
| "completion_length": 86.35750122070313, |
| "epoch": 0.5820766496974354, |
| "grad_norm": 5.408311367034912, |
| "kl": 1.1033611692488194, |
| "learning_rate": 4.3554442665932664e-07, |
| "loss": -0.0044, |
| "reward": 1.7480007410049438, |
| "reward_std": 0.20548871904611588, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.6321253478527069, |
| "step": 3030, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.07207567039877176, |
| "clip_ratio/high_mean": 0.010315603285562247, |
| "clip_ratio/low_mean": 0.0024311804067110644, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012746783741749822, |
| "completion_length": 87.45250091552734, |
| "epoch": 0.5839976947459418, |
| "grad_norm": 5.45907735824585, |
| "kl": 0.7388446770608426, |
| "learning_rate": 4.329192716724974e-07, |
| "loss": -0.0134, |
| "reward": 1.617799663543701, |
| "reward_std": 0.28184359073638915, |
| "rewards/code_format_reward": 0.9900000095367432, |
| "rewards/code_reward": 0.5613998055458069, |
| "step": 3040, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.03178482772782445, |
| "clip_ratio/high_mean": 0.00484326797304675, |
| "clip_ratio/low_mean": 0.0010359384352341295, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005879206501413136, |
| "completion_length": 83.70250091552734, |
| "epoch": 0.5859187397944482, |
| "grad_norm": 6.244964122772217, |
| "kl": 0.8223805136978626, |
| "learning_rate": 4.3029838355424165e-07, |
| "loss": -0.0028, |
| "reward": 1.5551699638366698, |
| "reward_std": 0.23868169337511064, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5300849676132202, |
| "step": 3050, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.031123768421821296, |
| "clip_ratio/high_mean": 0.0042093763331649825, |
| "clip_ratio/low_mean": 0.00023920949752209708, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0044485858496045695, |
| "completion_length": 90.6500015258789, |
| "epoch": 0.5878397848429545, |
| "grad_norm": 1.844166874885559, |
| "kl": 0.9453303083777428, |
| "learning_rate": 4.2768185781972433e-07, |
| "loss": 0.0038, |
| "reward": 1.7277095794677735, |
| "reward_std": 0.22161270976066588, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.6176047682762146, |
| "step": 3060, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.06112067271023989, |
| "clip_ratio/high_mean": 0.008206171146593989, |
| "clip_ratio/low_mean": 0.0006491162814199925, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008855287660844624, |
| "completion_length": 81.27750091552734, |
| "epoch": 0.589760829891461, |
| "grad_norm": 3.0321500301361084, |
| "kl": 0.4705409877002239, |
| "learning_rate": 4.2506978982512964e-07, |
| "loss": -0.0002, |
| "reward": 1.9011548519134522, |
| "reward_std": 0.2363950289785862, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.7037024021148681, |
| "step": 3070, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.028480308945290744, |
| "clip_ratio/high_mean": 0.00514335140469484, |
| "clip_ratio/low_mean": 0.0035089968900138047, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008652348211035133, |
| "completion_length": 88.05750274658203, |
| "epoch": 0.5916818749399674, |
| "grad_norm": 4.498425483703613, |
| "kl": 0.9383749194443226, |
| "learning_rate": 4.224622747641835e-07, |
| "loss": -0.0068, |
| "reward": 1.2419449806213378, |
| "reward_std": 0.1959183931350708, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.37597247362136843, |
| "step": 3080, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.02165755571331829, |
| "clip_ratio/high_mean": 0.003493850605445914, |
| "clip_ratio/low_mean": 0.0001163623295724392, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003610212981584482, |
| "completion_length": 83.10500030517578, |
| "epoch": 0.5936029199884737, |
| "grad_norm": 1.0221151113510132, |
| "kl": 1.614695566892624, |
| "learning_rate": 4.1985940766468663e-07, |
| "loss": 0.1048, |
| "reward": 1.8437815666198731, |
| "reward_std": 0.12033854126930237, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6731407642364502, |
| "step": 3090, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.05757335813250393, |
| "clip_ratio/high_mean": 0.0107182093168376, |
| "clip_ratio/low_mean": 0.004042259410198312, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014760468708118424, |
| "completion_length": 86.6625, |
| "epoch": 0.5955239650369801, |
| "grad_norm": 3.0221967697143555, |
| "kl": 0.4662696644663811, |
| "learning_rate": 4.1726128338504997e-07, |
| "loss": 0.0059, |
| "reward": 1.6797678232192994, |
| "reward_std": 0.23598156571388246, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.5930089056491852, |
| "step": 3100, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.1632944119395688, |
| "clip_ratio/high_mean": 0.02386658971372526, |
| "clip_ratio/low_mean": 0.000367270597780589, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02423386004229542, |
| "completion_length": 87.04000244140624, |
| "epoch": 0.5974450100854864, |
| "grad_norm": 3124.911865234375, |
| "kl": 1.4825018651783466, |
| "learning_rate": 4.146679966108374e-07, |
| "loss": 0.109, |
| "reward": 1.7368038177490235, |
| "reward_std": 0.2290027320384979, |
| "rewards/code_format_reward": 0.9912499785423279, |
| "rewards/code_reward": 0.620589405298233, |
| "step": 3110, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.011806602030992508, |
| "clip_ratio/high_mean": 0.00222149578621611, |
| "clip_ratio/low_mean": 0.001782867594738491, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004004363450803794, |
| "completion_length": 76.50500183105468, |
| "epoch": 0.5993660551339929, |
| "grad_norm": 5.609122276306152, |
| "kl": 1.2381610602140427, |
| "learning_rate": 4.120796418513165e-07, |
| "loss": 0.0687, |
| "reward": 1.6538613319396973, |
| "reward_std": 0.2478315144777298, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.5813056170940399, |
| "step": 3120, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.04111457797698677, |
| "clip_ratio/high_mean": 0.006102612579707056, |
| "clip_ratio/low_mean": 0.0006678692123387008, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006770481838611886, |
| "completion_length": 90.63500213623047, |
| "epoch": 0.6012871001824993, |
| "grad_norm": 1.7537983655929565, |
| "kl": 0.8379382207989693, |
| "learning_rate": 4.094963134360129e-07, |
| "loss": 3.0713, |
| "reward": 1.8111864566802978, |
| "reward_std": 0.23444892466068268, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6599682211875916, |
| "step": 3130, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.07487462717108428, |
| "clip_ratio/high_mean": 0.009757341054501012, |
| "clip_ratio/low_mean": 0.002470593445468694, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012227934325346723, |
| "completion_length": 84.99250183105468, |
| "epoch": 0.6032081452310056, |
| "grad_norm": 7.498387336730957, |
| "kl": 0.5894037500023842, |
| "learning_rate": 4.0691810551127327e-07, |
| "loss": 0.0462, |
| "reward": 1.6221882104873657, |
| "reward_std": 0.25462802946567537, |
| "rewards/code_format_reward": 0.9975000023841858, |
| "rewards/code_reward": 0.5617190957069397, |
| "step": 3140, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.015196678857319058, |
| "clip_ratio/high_mean": 0.0022096226894063875, |
| "clip_ratio/low_mean": 0.002686911600176245, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004896534324507229, |
| "completion_length": 88.44750213623047, |
| "epoch": 0.6051291902795121, |
| "grad_norm": 0.7371006011962891, |
| "kl": 1.5165767412632705, |
| "learning_rate": 4.0434511203683386e-07, |
| "loss": 0.0113, |
| "reward": 1.958918571472168, |
| "reward_std": 0.17050198167562486, |
| "rewards/code_format_reward": 0.9962499856948852, |
| "rewards/code_reward": 0.7303967714309693, |
| "step": 3150, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.047509356867522, |
| "clip_ratio/high_mean": 0.0060635729460045695, |
| "clip_ratio/low_mean": 0.0037405278504593297, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009804100578185171, |
| "completion_length": 93.71500091552734, |
| "epoch": 0.6070502353280185, |
| "grad_norm": 4.062532424926758, |
| "kl": 164.7577206812799, |
| "learning_rate": 4.017774267823967e-07, |
| "loss": 0.3479, |
| "reward": 1.8433427095413208, |
| "reward_std": 0.20897280275821686, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6760463416576385, |
| "step": 3160, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.007103513111360371, |
| "clip_ratio/high_mean": 0.0009442454349482432, |
| "clip_ratio/low_mean": 0.0005656339257257059, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015098793461220338, |
| "completion_length": 97.03500061035156, |
| "epoch": 0.6089712803765248, |
| "grad_norm": 0.3194718658924103, |
| "kl": 19.38877977654338, |
| "learning_rate": 3.9921514332421193e-07, |
| "loss": 0.1279, |
| "reward": 1.3801440358161927, |
| "reward_std": 0.26880781557410954, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.44757200181484225, |
| "step": 3170, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.027826181857381015, |
| "clip_ratio/high_mean": 0.004423137854610104, |
| "clip_ratio/low_mean": 0.000519216748944018, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004942354625381995, |
| "completion_length": 99.6375015258789, |
| "epoch": 0.6108923254250312, |
| "grad_norm": 133.27151489257812, |
| "kl": 91.96420569866896, |
| "learning_rate": 3.966583550416676e-07, |
| "loss": 284.3821, |
| "reward": 1.6065278768539428, |
| "reward_std": 0.2671674907207489, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.5598264217376709, |
| "step": 3180, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.03810381339862943, |
| "clip_ratio/high_mean": 0.005511091940570622, |
| "clip_ratio/low_mean": 0.00701818183879368, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01252927360474132, |
| "completion_length": 90.77000122070312, |
| "epoch": 0.6128133704735376, |
| "grad_norm": 2.931155204772949, |
| "kl": 4.587994083762169, |
| "learning_rate": 3.9410715511388647e-07, |
| "loss": 28143.1688, |
| "reward": 1.7186223268508911, |
| "reward_std": 0.2031429558992386, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.6139986515045166, |
| "step": 3190, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.18900979291647674, |
| "clip_ratio/high_mean": 0.025313075329177082, |
| "clip_ratio/low_mean": 0.00013794690457871183, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02545102240983397, |
| "completion_length": 88.60750122070313, |
| "epoch": 0.614734415522044, |
| "grad_norm": 3.9914708137512207, |
| "kl": 0.678571529686451, |
| "learning_rate": 3.915616365163304e-07, |
| "loss": 0.0002, |
| "reward": 1.818918228149414, |
| "reward_std": 0.24608779847621917, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.6653966069221496, |
| "step": 3200, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.05510081194806844, |
| "clip_ratio/high_mean": 0.008429678474203683, |
| "clip_ratio/low_mean": 0.0015389235399197788, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0099686019733781, |
| "completion_length": 85.05250091552735, |
| "epoch": 0.6166554605705504, |
| "grad_norm": 2.0297534465789795, |
| "kl": 0.5190044179558754, |
| "learning_rate": 3.890218920174122e-07, |
| "loss": -0.0056, |
| "reward": 1.938026785850525, |
| "reward_std": 0.2829041987657547, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.7218258857727051, |
| "step": 3210, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.04617450258228928, |
| "clip_ratio/high_mean": 0.007303895291988738, |
| "clip_ratio/low_mean": 0.002542783234093804, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009846678806934506, |
| "completion_length": 92.52000122070312, |
| "epoch": 0.6185765056190567, |
| "grad_norm": 3.2283730506896973, |
| "kl": 0.5362374372780323, |
| "learning_rate": 3.86488014175114e-07, |
| "loss": 0.0003, |
| "reward": 1.7741312742233277, |
| "reward_std": 0.20447308868169783, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6395656108856201, |
| "step": 3220, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.059750709845684466, |
| "clip_ratio/high_mean": 0.00790787541482132, |
| "clip_ratio/low_mean": 0.0012954409321537241, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009203316466300748, |
| "completion_length": 90.4375, |
| "epoch": 0.6204975506675632, |
| "grad_norm": 2.409045934677124, |
| "kl": 0.553566773980856, |
| "learning_rate": 3.8396009533361486e-07, |
| "loss": -0.0, |
| "reward": 1.6513851642608643, |
| "reward_std": 0.24081393480300903, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.580692571401596, |
| "step": 3230, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.03564520282670856, |
| "clip_ratio/high_mean": 0.004964679945260286, |
| "clip_ratio/low_mean": 0.004444090686592972, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009408770385198295, |
| "completion_length": 79.08000183105469, |
| "epoch": 0.6224185957160695, |
| "grad_norm": 7.759763717651367, |
| "kl": 1.2998816877603532, |
| "learning_rate": 3.814382276199251e-07, |
| "loss": -0.0006, |
| "reward": 1.6336610555648803, |
| "reward_std": 0.1691926121711731, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.5680804908275604, |
| "step": 3240, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.011579358880408109, |
| "clip_ratio/high_mean": 0.002202258622855879, |
| "clip_ratio/low_mean": 0.0003946456956327893, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0025969043519580735, |
| "completion_length": 88.7375, |
| "epoch": 0.6243396407645759, |
| "grad_norm": 9.489768981933594, |
| "kl": 4.286054483056068, |
| "learning_rate": 3.7892250294052853e-07, |
| "loss": 31.2761, |
| "reward": 1.8622464895248414, |
| "reward_std": 0.2547990471124649, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.6858106970787048, |
| "step": 3250, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.07659143296186813, |
| "clip_ratio/high_mean": 0.010122461079299682, |
| "clip_ratio/low_mean": 0.0019954566974774933, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012117917880095775, |
| "completion_length": 99.80750274658203, |
| "epoch": 0.6262606858130824, |
| "grad_norm": 2.884183168411255, |
| "kl": 1.2840011775493623, |
| "learning_rate": 3.764130129780341e-07, |
| "loss": 0.0383, |
| "reward": 1.6670962572097778, |
| "reward_std": 0.34920003414154055, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5907356142997742, |
| "step": 3260, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.029844516195589678, |
| "clip_ratio/high_mean": 0.004244843772175955, |
| "clip_ratio/low_mean": 0.0002169124811189249, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004461756230011815, |
| "completion_length": 100.70250091552734, |
| "epoch": 0.6281817308615887, |
| "grad_norm": 4.036985397338867, |
| "kl": 2.1118960954248904, |
| "learning_rate": 3.7390984918783286e-07, |
| "loss": 0.9419, |
| "reward": 1.6084105730056764, |
| "reward_std": 0.17128639966249465, |
| "rewards/code_format_reward": 0.9712500095367431, |
| "rewards/code_reward": 0.5613927602767944, |
| "step": 3270, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.07067356873303651, |
| "clip_ratio/high_mean": 0.00971948360092938, |
| "clip_ratio/low_mean": 0.0006290240438829642, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010348507657181472, |
| "completion_length": 88.9000015258789, |
| "epoch": 0.6301027759100951, |
| "grad_norm": 1.543152928352356, |
| "kl": 0.5742107287049294, |
| "learning_rate": 3.714131027947669e-07, |
| "loss": 0.0006, |
| "reward": 1.808586883544922, |
| "reward_std": 0.20984979271888732, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6564809083938599, |
| "step": 3280, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.060896387742832306, |
| "clip_ratio/high_mean": 0.00765781793743372, |
| "clip_ratio/low_mean": 0.01029690281720832, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017954721208661796, |
| "completion_length": 80.08500213623047, |
| "epoch": 0.6320238209586014, |
| "grad_norm": 2.127617359161377, |
| "kl": 0.6725200928747654, |
| "learning_rate": 3.689228647898034e-07, |
| "loss": 0.1143, |
| "reward": 1.678031039237976, |
| "reward_std": 0.19750893712043763, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.5921404898166657, |
| "step": 3290, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.05414980174973607, |
| "clip_ratio/high_mean": 0.007520435960032046, |
| "clip_ratio/low_mean": 0.00011696565634338185, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007637401651300025, |
| "completion_length": 92.725, |
| "epoch": 0.6339448660071079, |
| "grad_norm": 8.315914154052734, |
| "kl": 0.30459046363830566, |
| "learning_rate": 3.6643922592671904e-07, |
| "loss": -0.0066, |
| "reward": 1.5898099780082702, |
| "reward_std": 0.1832955375313759, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.5464674949645996, |
| "step": 3300, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.02745349882170558, |
| "clip_ratio/high_mean": 0.004275670822244138, |
| "clip_ratio/low_mean": 0.001036624335392844, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005312295141629874, |
| "completion_length": 86.80250091552735, |
| "epoch": 0.6358659110556143, |
| "grad_norm": 4.2797441482543945, |
| "kl": 2.398578557372093, |
| "learning_rate": 3.6396227671879267e-07, |
| "loss": 0.028, |
| "reward": 1.7730424404144287, |
| "reward_std": 0.3175764262676239, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.6387087047100067, |
| "step": 3310, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.03558065614197403, |
| "clip_ratio/high_mean": 0.004970578508800827, |
| "clip_ratio/low_mean": 0.0008951228694058955, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0058657014247728515, |
| "completion_length": 91.0875015258789, |
| "epoch": 0.6377869561041206, |
| "grad_norm": 5.376333713531494, |
| "kl": 1.4305558323860168, |
| "learning_rate": 3.614921074355067e-07, |
| "loss": 0.0034, |
| "reward": 1.7029305696487427, |
| "reward_std": 0.34837333858013153, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.6052152514457703, |
| "step": 3320, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.04346057323855348, |
| "clip_ratio/high_mean": 0.005737839776702458, |
| "clip_ratio/low_mean": 0.001675139949657023, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00741297956337803, |
| "completion_length": 88.75250396728515, |
| "epoch": 0.639708001152627, |
| "grad_norm": 2.969228744506836, |
| "kl": 0.7607076019048691, |
| "learning_rate": 3.5902880809925704e-07, |
| "loss": -0.0001, |
| "reward": 1.6762405157089233, |
| "reward_std": 0.2515918217599392, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.5909327387809753, |
| "step": 3330, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.05080293011851609, |
| "clip_ratio/high_mean": 0.006427765643456951, |
| "clip_ratio/low_mean": 0.00040708604792598634, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006834851735038683, |
| "completion_length": 88.0750015258789, |
| "epoch": 0.6416290462011334, |
| "grad_norm": 12.137472152709961, |
| "kl": 0.31881698705255984, |
| "learning_rate": 3.565724684820727e-07, |
| "loss": 3.6118, |
| "reward": 1.8916306495666504, |
| "reward_std": 0.1850387692451477, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.6973778128623962, |
| "step": 3340, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.14287711144424975, |
| "clip_ratio/high_mean": 0.019231261435197666, |
| "clip_ratio/low_mean": 0.0020263168029487134, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.021257578127551822, |
| "completion_length": 94.19000091552735, |
| "epoch": 0.6435500912496398, |
| "grad_norm": 6.10810661315918, |
| "kl": 0.8296034529805183, |
| "learning_rate": 3.541231781023436e-07, |
| "loss": -0.0004, |
| "reward": 1.6248144626617431, |
| "reward_std": 0.2219874605536461, |
| "rewards/code_format_reward": 0.9887499809265137, |
| "rewards/code_reward": 0.5652197122573852, |
| "step": 3350, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.07329725201707334, |
| "clip_ratio/high_mean": 0.009671362905646675, |
| "clip_ratio/low_mean": 0.005342914546781685, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015014277724549174, |
| "completion_length": 97.74500274658203, |
| "epoch": 0.6454711362981462, |
| "grad_norm": 2.801866054534912, |
| "kl": 0.5770246163010597, |
| "learning_rate": 3.5168102622155894e-07, |
| "loss": 0.0, |
| "reward": 1.6838999271392823, |
| "reward_std": 0.2707583636045456, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.594137442111969, |
| "step": 3360, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.005975091701839119, |
| "clip_ratio/high_mean": 0.0011488659016322344, |
| "clip_ratio/low_mean": 0.001098146109143272, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0022470120195066555, |
| "completion_length": 89.36750030517578, |
| "epoch": 0.6473921813466526, |
| "grad_norm": 34.13050079345703, |
| "kl": 2.2693535044789312, |
| "learning_rate": 3.492461018410535e-07, |
| "loss": 0.0028, |
| "reward": 1.8977232933044434, |
| "reward_std": 0.2937870219349861, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.7022991299629211, |
| "step": 3370, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.04565345844021067, |
| "clip_ratio/high_mean": 0.009023689541209023, |
| "clip_ratio/low_mean": 0.00031946374219842254, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009343153254303616, |
| "completion_length": 84.38249969482422, |
| "epoch": 0.649313226395159, |
| "grad_norm": 0.9170461893081665, |
| "kl": 108.80695619434118, |
| "learning_rate": 3.468184936987645e-07, |
| "loss": 920.5057, |
| "reward": 1.7496967315673828, |
| "reward_std": 0.2916144669055939, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.626410859823227, |
| "step": 3380, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.04991705315187574, |
| "clip_ratio/high_mean": 0.007881995162460954, |
| "clip_ratio/low_mean": 0.00031314246589317916, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008195137570146472, |
| "completion_length": 88.86000061035156, |
| "epoch": 0.6512342714436653, |
| "grad_norm": 3.084516763687134, |
| "kl": 1331.8980419039726, |
| "learning_rate": 3.4439829026599765e-07, |
| "loss": 2.6994, |
| "reward": 1.7110779523849486, |
| "reward_std": 0.22298349142074586, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.6071014523506164, |
| "step": 3390, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.03301922780228779, |
| "clip_ratio/high_mean": 0.005802097530977335, |
| "clip_ratio/low_mean": 0.002479181956732646, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00828127931599738, |
| "completion_length": 77.6500015258789, |
| "epoch": 0.6531553164921717, |
| "grad_norm": 3643.742919921875, |
| "kl": 629.580971956253, |
| "learning_rate": 3.4198557974420236e-07, |
| "loss": 1.3601, |
| "reward": 1.9027020692825318, |
| "reward_std": 0.23692196756601333, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.7038509964942932, |
| "step": 3400, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.026931732892990112, |
| "clip_ratio/high_mean": 0.004060871619731188, |
| "clip_ratio/low_mean": 0.0013453641964588313, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00540623576962389, |
| "completion_length": 81.77250213623047, |
| "epoch": 0.6550763615406782, |
| "grad_norm": 3.2221176624298096, |
| "kl": 17.398655989021062, |
| "learning_rate": 3.3958045006175804e-07, |
| "loss": 0.0552, |
| "reward": 1.7479909420013429, |
| "reward_std": 0.22741918563842772, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.6308704853057862, |
| "step": 3410, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.06737710665911437, |
| "clip_ratio/high_mean": 0.008767830353463069, |
| "clip_ratio/low_mean": 0.0005067014892119915, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009274532069684937, |
| "completion_length": 92.88500061035157, |
| "epoch": 0.6569974065891845, |
| "grad_norm": 4.063995838165283, |
| "kl": 2.0011128395795823, |
| "learning_rate": 3.3718298887077003e-07, |
| "loss": 0.0159, |
| "reward": 1.7235053777694702, |
| "reward_std": 0.2168472334742546, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.616440212726593, |
| "step": 3420, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.019622295489534737, |
| "clip_ratio/high_mean": 0.003191170998616144, |
| "clip_ratio/low_mean": 0.0015002752974396572, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0046914463368011635, |
| "completion_length": 80.16000213623047, |
| "epoch": 0.6589184516376909, |
| "grad_norm": 1.253300428390503, |
| "kl": 0.48067781031131745, |
| "learning_rate": 3.3479328354387286e-07, |
| "loss": 0.0008, |
| "reward": 1.7450715541839599, |
| "reward_std": 0.1590050458908081, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6244107484817505, |
| "step": 3430, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.03181373123079538, |
| "clip_ratio/high_mean": 0.0046242739539593455, |
| "clip_ratio/low_mean": 0.012107005770667456, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016731279762461783, |
| "completion_length": 84.32750244140625, |
| "epoch": 0.6608394966861972, |
| "grad_norm": 1.5854672193527222, |
| "kl": 0.42518851198256014, |
| "learning_rate": 3.324114211710498e-07, |
| "loss": 0.0, |
| "reward": 1.6541699171066284, |
| "reward_std": 0.1113172210752964, |
| "rewards/code_format_reward": 0.9962499856948852, |
| "rewards/code_reward": 0.5780224561691284, |
| "step": 3440, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.13535575959831475, |
| "clip_ratio/high_mean": 0.018421862670220435, |
| "clip_ratio/low_mean": 0.0012572539155371488, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019679116318002343, |
| "completion_length": 91.63250122070312, |
| "epoch": 0.6627605417347037, |
| "grad_norm": 4.593750476837158, |
| "kl": 0.7388513803482055, |
| "learning_rate": 3.300374885564553e-07, |
| "loss": -0.0, |
| "reward": 1.5408308625221252, |
| "reward_std": 0.29571940898895266, |
| "rewards/code_format_reward": 0.9749999880790711, |
| "rewards/code_reward": 0.5266654074192048, |
| "step": 3450, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.04671933995559811, |
| "clip_ratio/high_mean": 0.0062255718978121875, |
| "clip_ratio/low_mean": 0.003391482085862663, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009617053843976464, |
| "completion_length": 78.45250091552734, |
| "epoch": 0.6646815867832101, |
| "grad_norm": 2.5849409103393555, |
| "kl": 10.90581871420145, |
| "learning_rate": 3.2767157221525437e-07, |
| "loss": 0.0178, |
| "reward": 1.5087457418441772, |
| "reward_std": 0.19353876560926436, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.5074978828430176, |
| "step": 3460, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.0318772604689002, |
| "clip_ratio/high_mean": 0.004644899175036699, |
| "clip_ratio/low_mean": 0.0032211030862526967, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007866002165246754, |
| "completion_length": 75.36500091552735, |
| "epoch": 0.6666026318317164, |
| "grad_norm": 1.8117616176605225, |
| "kl": 187030.33910432606, |
| "learning_rate": 3.253137583704673e-07, |
| "loss": 374.1458, |
| "reward": 1.6825225114822389, |
| "reward_std": 0.2058879092335701, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5921987533569336, |
| "step": 3470, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.09497236199676991, |
| "clip_ratio/high_mean": 0.015650217607617378, |
| "clip_ratio/low_mean": 0.0006928690614586231, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016343086725100875, |
| "completion_length": 89.13250198364258, |
| "epoch": 0.6685236768802229, |
| "grad_norm": 5.850868225097656, |
| "kl": 0.5080707125365734, |
| "learning_rate": 3.229641329498296e-07, |
| "loss": 0.0463, |
| "reward": 1.6678599119186401, |
| "reward_std": 0.2830047011375427, |
| "rewards/code_format_reward": 0.9724999904632569, |
| "rewards/code_reward": 0.5908049464225769, |
| "step": 3480, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.050425410037860274, |
| "clip_ratio/high_mean": 0.006465365196345374, |
| "clip_ratio/low_mean": 0.0006157927738968283, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0070811579586006704, |
| "completion_length": 81.40749969482422, |
| "epoch": 0.6704447219287293, |
| "grad_norm": 10.526844024658203, |
| "kl": 1.5019532606005668, |
| "learning_rate": 3.2062278158265866e-07, |
| "loss": -0.0021, |
| "reward": 1.7323597908020019, |
| "reward_std": 0.15349715426564217, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6202423751354218, |
| "step": 3490, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.07051093662157655, |
| "clip_ratio/high_mean": 0.009594869159627706, |
| "clip_ratio/low_mean": 0.001997971232049167, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.011592840391676873, |
| "completion_length": 96.4625, |
| "epoch": 0.6723657669772356, |
| "grad_norm": 12.833992958068848, |
| "kl": 0.37389371246099473, |
| "learning_rate": 3.182897895967338e-07, |
| "loss": 0.0008, |
| "reward": 1.6037346363067626, |
| "reward_std": 0.329493448138237, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.556554788351059, |
| "step": 3500, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.017038265755400062, |
| "clip_ratio/high_mean": 0.0027443476661574095, |
| "clip_ratio/low_mean": 0.000347714369854657, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003092062100768089, |
| "completion_length": 84.05, |
| "epoch": 0.674286812025742, |
| "grad_norm": 6.119350910186768, |
| "kl": 0.4559432238340378, |
| "learning_rate": 3.15965242015187e-07, |
| "loss": 0.0298, |
| "reward": 1.6935490131378175, |
| "reward_std": 0.26772548258304596, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6002120196819305, |
| "step": 3510, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.03435098186600953, |
| "clip_ratio/high_mean": 0.005973302901838906, |
| "clip_ratio/low_mean": 0.0006568559459992684, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006630158764892258, |
| "completion_length": 95.0, |
| "epoch": 0.6762078570742484, |
| "grad_norm": 4.796656608581543, |
| "kl": 0.3851431407034397, |
| "learning_rate": 3.1364922355340346e-07, |
| "loss": 0.0214, |
| "reward": 1.8059131860733033, |
| "reward_std": 0.18592590391635894, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6554565787315368, |
| "step": 3520, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.024073917022906243, |
| "clip_ratio/high_mean": 0.0035194387339288367, |
| "clip_ratio/low_mean": 0.0002464063392835669, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0037658450222807006, |
| "completion_length": 86.9800018310547, |
| "epoch": 0.6781289021227548, |
| "grad_norm": 7.799978256225586, |
| "kl": 0.2617302156984806, |
| "learning_rate": 3.113418186159349e-07, |
| "loss": -0.0088, |
| "reward": 1.515157699584961, |
| "reward_std": 0.2593328535556793, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5138288617134095, |
| "step": 3530, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.014256173744797707, |
| "clip_ratio/high_mean": 0.002001363394083455, |
| "clip_ratio/low_mean": 0.001286455297667999, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0032878186670131982, |
| "completion_length": 93.04500274658203, |
| "epoch": 0.6800499471712612, |
| "grad_norm": 1.323721170425415, |
| "kl": 0.32287237197160723, |
| "learning_rate": 3.090431112934235e-07, |
| "loss": -0.0056, |
| "reward": 1.8219903230667114, |
| "reward_std": 0.28862411081790923, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6641201436519623, |
| "step": 3540, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0389601900940761, |
| "clip_ratio/high_mean": 0.005975415915600024, |
| "clip_ratio/low_mean": 0.0006638197373831645, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006639235676266253, |
| "completion_length": 95.84250183105469, |
| "epoch": 0.6819709922197675, |
| "grad_norm": 4.850042819976807, |
| "kl": 1.8627108559012413, |
| "learning_rate": 3.067531853595369e-07, |
| "loss": 1.6968, |
| "reward": 1.8796481132507323, |
| "reward_std": 0.13976119682192803, |
| "rewards/code_format_reward": 0.9837500095367432, |
| "rewards/code_reward": 0.6938865780830383, |
| "step": 3550, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.01971529610455036, |
| "clip_ratio/high_mean": 0.002580236754147336, |
| "clip_ratio/low_mean": 0.0005060694311396219, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0030863061954732986, |
| "completion_length": 85.99000091552735, |
| "epoch": 0.683892037268274, |
| "grad_norm": 499.5800476074219, |
| "kl": 3.8079170405864717, |
| "learning_rate": 3.0447212426791546e-07, |
| "loss": 0.0153, |
| "reward": 1.73906729221344, |
| "reward_std": 0.21255102157592773, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.622658634185791, |
| "step": 3560, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.03546578506939113, |
| "clip_ratio/high_mean": 0.005373837990919128, |
| "clip_ratio/low_mean": 0.0011442803021054714, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0065181183628737925, |
| "completion_length": 93.75249938964843, |
| "epoch": 0.6858130823167803, |
| "grad_norm": 3.144973039627075, |
| "kl": 0.7828342400491237, |
| "learning_rate": 3.022000111491309e-07, |
| "loss": 0.0001, |
| "reward": 1.8471190690994264, |
| "reward_std": 0.27725095450878146, |
| "rewards/code_format_reward": 0.9487499952316284, |
| "rewards/code_reward": 0.686372023820877, |
| "step": 3570, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.01367393396794796, |
| "clip_ratio/high_mean": 0.001831050164764747, |
| "clip_ratio/low_mean": 0.0008013980732357595, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002632448251824826, |
| "completion_length": 96.29500122070313, |
| "epoch": 0.6877341273652867, |
| "grad_norm": 3.9027657508850098, |
| "kl": 0.8669951900839805, |
| "learning_rate": 2.99936928807657e-07, |
| "loss": -0.0007, |
| "reward": 1.6410433769226074, |
| "reward_std": 0.25681858956813813, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.5736466705799103, |
| "step": 3580, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.028569919406436384, |
| "clip_ratio/high_mean": 0.0037314103537937626, |
| "clip_ratio/low_mean": 0.0012738955876557157, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005005306130624376, |
| "completion_length": 84.3875015258789, |
| "epoch": 0.6896551724137931, |
| "grad_norm": 1.8412340879440308, |
| "kl": 0.6606554225087166, |
| "learning_rate": 2.976829597188506e-07, |
| "loss": -0.0007, |
| "reward": 1.6131571292877198, |
| "reward_std": 0.15807003602385522, |
| "rewards/code_format_reward": 0.9950000047683716, |
| "rewards/code_reward": 0.5578285574913024, |
| "step": 3590, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.045477401558309795, |
| "clip_ratio/high_mean": 0.007051247591152787, |
| "clip_ratio/low_mean": 0.00021358822996262461, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007264835678506642, |
| "completion_length": 92.69250030517578, |
| "epoch": 0.6915762174622995, |
| "grad_norm": 4.787570953369141, |
| "kl": 0.2786871612071991, |
| "learning_rate": 2.9543818602594826e-07, |
| "loss": 0.0001, |
| "reward": 1.6197675943374634, |
| "reward_std": 0.2863120764493942, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.5651962697505951, |
| "step": 3600, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.047238136362284425, |
| "clip_ratio/high_mean": 0.006483453582040966, |
| "clip_ratio/low_mean": 0.0015064548759255558, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007989908382296563, |
| "completion_length": 83.11750183105468, |
| "epoch": 0.6934972625108059, |
| "grad_norm": 1.5795401334762573, |
| "kl": 0.512858135998249, |
| "learning_rate": 2.932026895370697e-07, |
| "loss": 0.0021, |
| "reward": 1.6763751983642579, |
| "reward_std": 0.12559455148875714, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.5903751432895661, |
| "step": 3610, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.031613614642992616, |
| "clip_ratio/high_mean": 0.00453935784753412, |
| "clip_ratio/low_mean": 0.0026672417909139766, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007206599647179246, |
| "completion_length": 89.80250244140625, |
| "epoch": 0.6954183075593122, |
| "grad_norm": 0.9828081130981445, |
| "kl": 2.053369848430157, |
| "learning_rate": 2.909765517222392e-07, |
| "loss": -0.0015, |
| "reward": 1.6560463190078736, |
| "reward_std": 0.2526627391576767, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5820856630802155, |
| "step": 3620, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.00962083850754425, |
| "clip_ratio/high_mean": 0.0013845860186847859, |
| "clip_ratio/low_mean": 0.00100141861839802, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0023860046290792524, |
| "completion_length": 93.0125015258789, |
| "epoch": 0.6973393526078187, |
| "grad_norm": 1.4326051473617554, |
| "kl": 0.7425350762903691, |
| "learning_rate": 2.887598537104141e-07, |
| "loss": 0.017, |
| "reward": 1.608488416671753, |
| "reward_std": 0.18181688338518143, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.557056725025177, |
| "step": 3630, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.10694800971541554, |
| "clip_ratio/high_mean": 0.016866487907827833, |
| "clip_ratio/low_mean": 0.000146488708560355, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.017012976511614396, |
| "completion_length": 86.96750183105469, |
| "epoch": 0.6992603976563251, |
| "grad_norm": 5.3714776039123535, |
| "kl": 0.5909165881574154, |
| "learning_rate": 2.8655267628653044e-07, |
| "loss": 0.0005, |
| "reward": 1.6461472749710082, |
| "reward_std": 0.22788509875535964, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.5765111327171326, |
| "step": 3640, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.015923467138782142, |
| "clip_ratio/high_mean": 0.0022047571546863765, |
| "clip_ratio/low_mean": 0.0014544774603564292, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0036592346790712328, |
| "completion_length": 91.53250122070312, |
| "epoch": 0.7011814427048314, |
| "grad_norm": 7.581000328063965, |
| "kl": 3.2652564592659474, |
| "learning_rate": 2.8435509988855683e-07, |
| "loss": -0.0019, |
| "reward": 1.6843700885772706, |
| "reward_std": 0.20299706608057022, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.5937475442886353, |
| "step": 3650, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.04569347179494798, |
| "clip_ratio/high_mean": 0.00580012007849291, |
| "clip_ratio/low_mean": 0.003195645064988639, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008995765156578272, |
| "completion_length": 82.49500122070313, |
| "epoch": 0.7031024877533378, |
| "grad_norm": 10.031012535095215, |
| "kl": 0.3446802504360676, |
| "learning_rate": 2.821672046045642e-07, |
| "loss": -0.003, |
| "reward": 1.9148546934127808, |
| "reward_std": 0.15906044691801072, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.7089898109436035, |
| "step": 3660, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.01652005296200514, |
| "clip_ratio/high_mean": 0.0032194001134485005, |
| "clip_ratio/low_mean": 0.0004348491333075799, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003654249230748974, |
| "completion_length": 87.48000030517578, |
| "epoch": 0.7050235328018442, |
| "grad_norm": 4.5817551612854, |
| "kl": 0.5381794683635235, |
| "learning_rate": 2.799890701698068e-07, |
| "loss": -0.0018, |
| "reward": 1.4432553768157959, |
| "reward_std": 0.19258553311228752, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.4744401514530182, |
| "step": 3670, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.03230769606307149, |
| "clip_ratio/high_mean": 0.004254726751241833, |
| "clip_ratio/low_mean": 0.0003341716161230579, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0045888983644545075, |
| "completion_length": 91.96000366210937, |
| "epoch": 0.7069445778503506, |
| "grad_norm": 3.1825077533721924, |
| "kl": 0.5493438571691514, |
| "learning_rate": 2.7782077596381596e-07, |
| "loss": 0.0032, |
| "reward": 1.8943065643310546, |
| "reward_std": 0.22485891729593277, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6990282416343689, |
| "step": 3680, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.017006529681384563, |
| "clip_ratio/high_mean": 0.0026059710187837483, |
| "clip_ratio/low_mean": 0.00022266755404416473, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0028286385582759976, |
| "completion_length": 92.6875015258789, |
| "epoch": 0.708865622898857, |
| "grad_norm": 3.126534938812256, |
| "kl": 2.302929486706853, |
| "learning_rate": 2.7566240100750794e-07, |
| "loss": 0.0024, |
| "reward": 1.6279277324676513, |
| "reward_std": 0.3058730036020279, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.568026351928711, |
| "step": 3690, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.020053896540775894, |
| "clip_ratio/high_mean": 0.0029980215302202852, |
| "clip_ratio/low_mean": 0.0004860887274844572, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003484110155841336, |
| "completion_length": 97.92250061035156, |
| "epoch": 0.7107866679473633, |
| "grad_norm": 4.224461555480957, |
| "kl": 4.42233342602849, |
| "learning_rate": 2.735140239603034e-07, |
| "loss": -0.0003, |
| "reward": 1.960454559326172, |
| "reward_std": 0.24239360094070433, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.7349147796630859, |
| "step": 3700, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.02752018291503191, |
| "clip_ratio/high_mean": 0.005125764373224229, |
| "clip_ratio/low_mean": 0.00023403638042509555, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005359800613950938, |
| "completion_length": 101.37250061035157, |
| "epoch": 0.7127077129958698, |
| "grad_norm": 4.285885334014893, |
| "kl": 0.952894814312458, |
| "learning_rate": 2.713757231172611e-07, |
| "loss": -0.0013, |
| "reward": 1.6773537874221802, |
| "reward_std": 0.2778655707836151, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5927394092082977, |
| "step": 3710, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.021348989009857176, |
| "clip_ratio/high_mean": 0.0030060237273573875, |
| "clip_ratio/low_mean": 0.0013181588088627904, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004324182611890137, |
| "completion_length": 95.14250183105469, |
| "epoch": 0.7146287580443761, |
| "grad_norm": 2.7202091217041016, |
| "kl": 2.8931914918124675, |
| "learning_rate": 2.692475764062245e-07, |
| "loss": -0.0021, |
| "reward": 1.8867613315582275, |
| "reward_std": 0.18746355026960373, |
| "rewards/code_format_reward": 0.9987499952316284, |
| "rewards/code_reward": 0.6936931371688843, |
| "step": 3720, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.007143327506491914, |
| "clip_ratio/high_mean": 0.0009208801442582626, |
| "clip_ratio/low_mean": 0.00037053466949146243, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001291414822480874, |
| "completion_length": 94.1875015258789, |
| "epoch": 0.7165498030928825, |
| "grad_norm": 2.7853496074676514, |
| "kl": 0.6755535811185837, |
| "learning_rate": 2.6712966138498174e-07, |
| "loss": -0.003, |
| "reward": 1.723927640914917, |
| "reward_std": 0.2750594407320023, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.6163387894630432, |
| "step": 3730, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.019027433777227997, |
| "clip_ratio/high_mean": 0.002618219889700413, |
| "clip_ratio/low_mean": 0.0018478537182090803, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004466073628282175, |
| "completion_length": 102.04000091552734, |
| "epoch": 0.718470848141389, |
| "grad_norm": 5.998534202575684, |
| "kl": 0.9062080264091492, |
| "learning_rate": 2.650220552384391e-07, |
| "loss": 0.0289, |
| "reward": 1.8737354516983031, |
| "reward_std": 0.34540517926216124, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6912427186965943, |
| "step": 3740, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.056439303827937694, |
| "clip_ratio/high_mean": 0.007310985976073425, |
| "clip_ratio/low_mean": 0.0005420514833531342, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007853037484164816, |
| "completion_length": 92.48250122070313, |
| "epoch": 0.7203918931898953, |
| "grad_norm": 5.3343424797058105, |
| "kl": 0.3819971337914467, |
| "learning_rate": 2.6292483477580816e-07, |
| "loss": -0.011, |
| "reward": 1.672910475730896, |
| "reward_std": 0.2516419067978859, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5920802116394043, |
| "step": 3750, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.13834233868401496, |
| "clip_ratio/high_mean": 0.018591971611022017, |
| "clip_ratio/low_mean": 0.0006771487518562935, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.019269120390526952, |
| "completion_length": 99.33000030517579, |
| "epoch": 0.7223129382384017, |
| "grad_norm": 1.4892189502716064, |
| "kl": 0.9441468060016632, |
| "learning_rate": 2.6083807642780644e-07, |
| "loss": -0.0084, |
| "reward": 1.5579908847808839, |
| "reward_std": 0.272139647603035, |
| "rewards/code_format_reward": 0.9787500023841857, |
| "rewards/code_reward": 0.5343079507350922, |
| "step": 3760, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.023900310718454422, |
| "clip_ratio/high_mean": 0.005545906673069112, |
| "clip_ratio/low_mean": 0.0007872088695876301, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006333115603774786, |
| "completion_length": 90.84000396728516, |
| "epoch": 0.724233983286908, |
| "grad_norm": 12.181316375732422, |
| "kl": 8.179486125707626, |
| "learning_rate": 2.5876185624387225e-07, |
| "loss": 0.0398, |
| "reward": 1.743166995048523, |
| "reward_std": 0.3216101437807083, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6259585380554199, |
| "step": 3770, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.00846199265215546, |
| "clip_ratio/high_mean": 0.0012625553936231881, |
| "clip_ratio/low_mean": 0.00030621195983258074, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001568767352728173, |
| "completion_length": 118.35750122070313, |
| "epoch": 0.7261550283354145, |
| "grad_norm": 1.6517783403396606, |
| "kl": 0.968211068212986, |
| "learning_rate": 2.5669624988939287e-07, |
| "loss": 0.1551, |
| "reward": 1.7871047019958497, |
| "reward_std": 0.21420088410377502, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.644802349805832, |
| "step": 3780, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.02817701958119869, |
| "clip_ratio/high_mean": 0.0037564294645562766, |
| "clip_ratio/low_mean": 0.011859719056519679, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.015616148672415875, |
| "completion_length": 93.14750213623047, |
| "epoch": 0.7280760733839209, |
| "grad_norm": 11.322369575500488, |
| "kl": 0.45075275003910065, |
| "learning_rate": 2.5464133264294705e-07, |
| "loss": -0.0008, |
| "reward": 1.662767267227173, |
| "reward_std": 0.24967537969350814, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.5848211228847504, |
| "step": 3790, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.05006286900024861, |
| "clip_ratio/high_mean": 0.007249254969065077, |
| "clip_ratio/low_mean": 0.00040258544613607227, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007651840391918086, |
| "completion_length": 110.32750396728515, |
| "epoch": 0.7299971184324272, |
| "grad_norm": 16.862590789794922, |
| "kl": 0.3901309326291084, |
| "learning_rate": 2.5259717939356175e-07, |
| "loss": -0.0019, |
| "reward": 1.7777814149856568, |
| "reward_std": 0.25982470586895945, |
| "rewards/code_format_reward": 0.987500011920929, |
| "rewards/code_reward": 0.6420157194137573, |
| "step": 3800, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.007159786019474268, |
| "clip_ratio/high_mean": 0.0011859470629133283, |
| "clip_ratio/low_mean": 0.0021440873795654626, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0033300343551672996, |
| "completion_length": 96.07000122070312, |
| "epoch": 0.7319181634809336, |
| "grad_norm": 2.4953460693359375, |
| "kl": 0.3146058402955532, |
| "learning_rate": 2.505638646379831e-07, |
| "loss": -0.0042, |
| "reward": 1.7296765804290772, |
| "reward_std": 0.3011175274848938, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6189007639884949, |
| "step": 3810, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.03548359724227339, |
| "clip_ratio/high_mean": 0.004679994014441036, |
| "clip_ratio/low_mean": 0.00017329893162241206, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004853292935877107, |
| "completion_length": 101.32000122070312, |
| "epoch": 0.7338392085294401, |
| "grad_norm": 3.954063892364502, |
| "kl": 0.34448319524526594, |
| "learning_rate": 2.485414624779603e-07, |
| "loss": -0.0051, |
| "reward": 1.690654444694519, |
| "reward_std": 0.24299487322568894, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.6000146985054016, |
| "step": 3820, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.00636189088691026, |
| "clip_ratio/high_mean": 0.0008543322241166606, |
| "clip_ratio/low_mean": 0.00028777473780792204, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0011421069371863267, |
| "completion_length": 94.74000244140625, |
| "epoch": 0.7357602535779464, |
| "grad_norm": 1.0420587062835693, |
| "kl": 0.28902386128902435, |
| "learning_rate": 2.4653004661754703e-07, |
| "loss": 0.0021, |
| "reward": 1.929768443107605, |
| "reward_std": 0.19695264101028442, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.7173841595649719, |
| "step": 3830, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0385974693344906, |
| "clip_ratio/high_mean": 0.0054892279236810285, |
| "clip_ratio/low_mean": 0.0004371934803202748, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005926421421463601, |
| "completion_length": 100.23250122070313, |
| "epoch": 0.7376812986264528, |
| "grad_norm": 6.22709846496582, |
| "kl": 0.39053357392549515, |
| "learning_rate": 2.445296903604131e-07, |
| "loss": -0.0123, |
| "reward": 1.7683161497116089, |
| "reward_std": 0.4236398935317993, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.6413455486297608, |
| "step": 3840, |
| "zero_std_ratio": 0.3 |
| }, |
| { |
| "clip_ratio/high_max": 0.013776408764533699, |
| "clip_ratio/high_mean": 0.0019065461441641674, |
| "clip_ratio/low_mean": 0.0035487653221935034, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005455311315017752, |
| "completion_length": 91.36000213623046, |
| "epoch": 0.7396023436749591, |
| "grad_norm": 3.84639573097229, |
| "kl": 9.267435324192046, |
| "learning_rate": 2.4254046660717555e-07, |
| "loss": 0.0107, |
| "reward": 1.7194789409637452, |
| "reward_std": 0.23012096285820008, |
| "rewards/code_format_reward": 0.98125, |
| "rewards/code_reward": 0.6144269347190857, |
| "step": 3850, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.019276025268482044, |
| "clip_ratio/high_mean": 0.0034578723403683397, |
| "clip_ratio/low_mean": 0.0028569042566232382, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006314776389626786, |
| "completion_length": 96.3250015258789, |
| "epoch": 0.7415233887234656, |
| "grad_norm": 4.765519142150879, |
| "kl": 0.5375766545534134, |
| "learning_rate": 2.4056244785273895e-07, |
| "loss": -0.0038, |
| "reward": 1.713827419281006, |
| "reward_std": 0.28884910941123965, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6112887144088746, |
| "step": 3860, |
| "zero_std_ratio": 0.35 |
| }, |
| { |
| "clip_ratio/high_max": 0.06692883024225012, |
| "clip_ratio/high_mean": 0.008779459849756676, |
| "clip_ratio/low_mean": 0.0002708235711907037, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009050283460237552, |
| "completion_length": 103.41250152587891, |
| "epoch": 0.743444433771972, |
| "grad_norm": 2.68007493019104, |
| "kl": 0.34222877621650694, |
| "learning_rate": 2.3859570618365614e-07, |
| "loss": -0.0009, |
| "reward": 1.74418466091156, |
| "reward_std": 0.20953620076179505, |
| "rewards/code_format_reward": 0.9912499785423279, |
| "rewards/code_reward": 0.6242798089981079, |
| "step": 3870, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.01482260066550225, |
| "clip_ratio/high_mean": 0.0023997865355340764, |
| "clip_ratio/low_mean": 0.00038790585967944934, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002787692387937568, |
| "completion_length": 98.42250061035156, |
| "epoch": 0.7453654788204783, |
| "grad_norm": 4.816893100738525, |
| "kl": 0.4661983668804169, |
| "learning_rate": 2.366403132754995e-07, |
| "loss": -0.0019, |
| "reward": 1.6338875532150268, |
| "reward_std": 0.21452725008130075, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5697562634944916, |
| "step": 3880, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.02494101980701089, |
| "clip_ratio/high_mean": 0.003492716047912836, |
| "clip_ratio/low_mean": 0.00024301124794874341, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0037357273045927285, |
| "completion_length": 97.44000091552735, |
| "epoch": 0.7472865238689848, |
| "grad_norm": 82.46282958984375, |
| "kl": 0.5981974095106125, |
| "learning_rate": 2.3469634039024927e-07, |
| "loss": 0.0024, |
| "reward": 1.8161945581436156, |
| "reward_std": 0.17759706005454062, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6621597528457641, |
| "step": 3890, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0019452353473752737, |
| "clip_ratio/high_mean": 0.00039936143439263106, |
| "clip_ratio/low_mean": 0.0002315789126441814, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0006309403397608548, |
| "completion_length": 94.07750091552734, |
| "epoch": 0.7492075689174911, |
| "grad_norm": 6.090396404266357, |
| "kl": 0.8421477146446705, |
| "learning_rate": 2.3276385837369632e-07, |
| "loss": 0.014, |
| "reward": 1.4471250534057618, |
| "reward_std": 0.25895166750997306, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.4773125171661377, |
| "step": 3900, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.02143551183398813, |
| "clip_ratio/high_mean": 0.002903820894425735, |
| "clip_ratio/low_mean": 0.00011704202042892576, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0030208629119442775, |
| "completion_length": 89.32750091552734, |
| "epoch": 0.7511286139659975, |
| "grad_norm": 7.675207614898682, |
| "kl": 4.630686198174954, |
| "learning_rate": 2.3084293765286074e-07, |
| "loss": 0.0109, |
| "reward": 1.7639801740646361, |
| "reward_std": 0.32505679726600645, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6360525727272034, |
| "step": 3910, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.007216949050780385, |
| "clip_ratio/high_mean": 0.0012314463703660295, |
| "clip_ratio/low_mean": 0.000596191274235025, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0018276376475114375, |
| "completion_length": 93.16250152587891, |
| "epoch": 0.7530496590145039, |
| "grad_norm": 3.4967644214630127, |
| "kl": 0.9979558669030666, |
| "learning_rate": 2.2893364823342454e-07, |
| "loss": 0.0016, |
| "reward": 1.5569410085678101, |
| "reward_std": 0.2807903170585632, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.5365955173969269, |
| "step": 3920, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.01850514723919332, |
| "clip_ratio/high_mean": 0.003044746146770194, |
| "clip_ratio/low_mean": 0.0006967324326978997, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0037414785125292837, |
| "completion_length": 95.95500183105469, |
| "epoch": 0.7549707040630103, |
| "grad_norm": 2.8742544651031494, |
| "kl": 0.44021010398864746, |
| "learning_rate": 2.270360596971809e-07, |
| "loss": -0.0037, |
| "reward": 1.823073673248291, |
| "reward_std": 0.24968771934509276, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.663411819934845, |
| "step": 3930, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.015126590803265571, |
| "clip_ratio/high_mean": 0.0023361636558547616, |
| "clip_ratio/low_mean": 0.00015787699958309532, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002494040655437857, |
| "completion_length": 91.19500122070312, |
| "epoch": 0.7568917491115167, |
| "grad_norm": 3.40413236618042, |
| "kl": 0.386103405430913, |
| "learning_rate": 2.2515024119949826e-07, |
| "loss": -0.011, |
| "reward": 1.5718731164932251, |
| "reward_std": 0.2807211749255657, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.5415615499019623, |
| "step": 3940, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.015597179555334151, |
| "clip_ratio/high_mean": 0.0027747701620683073, |
| "clip_ratio/low_mean": 0.00042166481143794953, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003196435049176216, |
| "completion_length": 98.425, |
| "epoch": 0.758812794160023, |
| "grad_norm": 4.560734272003174, |
| "kl": 0.4831135801970959, |
| "learning_rate": 2.2327626146679974e-07, |
| "loss": -0.0022, |
| "reward": 1.7759766340255738, |
| "reward_std": 0.2547271862626076, |
| "rewards/code_format_reward": 0.9625, |
| "rewards/code_reward": 0.6473633050918579, |
| "step": 3950, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.008683394081890583, |
| "clip_ratio/high_mean": 0.0011145618045702577, |
| "clip_ratio/low_mean": 0.0009394719265401364, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0020540336496196686, |
| "completion_length": 102.31250305175782, |
| "epoch": 0.7607338392085294, |
| "grad_norm": 0.1577247530221939, |
| "kl": 1.2770531885325909, |
| "learning_rate": 2.2141418879405855e-07, |
| "loss": 0.0032, |
| "reward": 1.7324957370758056, |
| "reward_std": 0.19914634823799132, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.6199978470802308, |
| "step": 3960, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.004086668835952878, |
| "clip_ratio/high_mean": 0.0005708287237212062, |
| "clip_ratio/low_mean": 2.821670495904982e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0005990454228594899, |
| "completion_length": 95.21750335693359, |
| "epoch": 0.7626548842570359, |
| "grad_norm": 268.0164794921875, |
| "kl": 3.985953611135483, |
| "learning_rate": 2.1956409104230986e-07, |
| "loss": 0.0127, |
| "reward": 1.7277408480644225, |
| "reward_std": 0.19516595900058747, |
| "rewards/code_format_reward": 0.9737500071525573, |
| "rewards/code_reward": 0.6204329133033752, |
| "step": 3970, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.02115430913399905, |
| "clip_ratio/high_mean": 0.003100222998182289, |
| "clip_ratio/low_mean": 0.00045731081045232715, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0035575337868067438, |
| "completion_length": 99.47250213623047, |
| "epoch": 0.7645759293055422, |
| "grad_norm": 4.087578773498535, |
| "kl": 0.2619202695786953, |
| "learning_rate": 2.1772603563617603e-07, |
| "loss": -0.0024, |
| "reward": 1.6976868152618407, |
| "reward_std": 0.31094631999731065, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.6041558861732483, |
| "step": 3980, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.022111613873858006, |
| "clip_ratio/high_mean": 0.0033171431292430497, |
| "clip_ratio/low_mean": 0.00019350402581039817, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003510647150687873, |
| "completion_length": 93.09000091552734, |
| "epoch": 0.7664969743540486, |
| "grad_norm": 2.557553291320801, |
| "kl": 0.4590821463614702, |
| "learning_rate": 2.1590008956141137e-07, |
| "loss": -0.0014, |
| "reward": 1.7825278520584107, |
| "reward_std": 0.26515288949012755, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.6440764307975769, |
| "step": 3990, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.03076116186566651, |
| "clip_ratio/high_mean": 0.004437833256088197, |
| "clip_ratio/low_mean": 0.0004819675668841228, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004919800782226957, |
| "completion_length": 89.73500061035156, |
| "epoch": 0.7684180194025549, |
| "grad_norm": 2.5422067642211914, |
| "kl": 0.26607592329382895, |
| "learning_rate": 2.1408631936245908e-07, |
| "loss": 0.0026, |
| "reward": 1.8288384914398192, |
| "reward_std": 0.2508297085762024, |
| "rewards/code_format_reward": 0.9837499856948853, |
| "rewards/code_reward": 0.6684817314147949, |
| "step": 4000, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.020826040930114687, |
| "clip_ratio/high_mean": 0.0040985049330629405, |
| "clip_ratio/low_mean": 0.000369196553947404, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004467701492831111, |
| "completion_length": 97.69500122070312, |
| "epoch": 0.7703390644510614, |
| "grad_norm": 2.079371929168701, |
| "kl": 0.3304180882871151, |
| "learning_rate": 2.122847911400278e-07, |
| "loss": 0.0019, |
| "reward": 1.693557620048523, |
| "reward_std": 0.21333991810679437, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5977162718772888, |
| "step": 4010, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.009364375309087337, |
| "clip_ratio/high_mean": 0.0013745424774242565, |
| "clip_ratio/low_mean": 0.0020853754234849476, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0034599179547512905, |
| "completion_length": 94.00750274658203, |
| "epoch": 0.7722601094995678, |
| "grad_norm": 3.2660512924194336, |
| "kl": 0.6432372182607651, |
| "learning_rate": 2.1049557054868082e-07, |
| "loss": 0.0073, |
| "reward": 1.8483120203018188, |
| "reward_std": 0.316910046339035, |
| "rewards/code_format_reward": 0.9649999856948852, |
| "rewards/code_reward": 0.6829060018062592, |
| "step": 4020, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.08980275879148394, |
| "clip_ratio/high_mean": 0.011746273408061825, |
| "clip_ratio/low_mean": 0.000331929670937825, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012078202966949902, |
| "completion_length": 92.7925018310547, |
| "epoch": 0.7741811545480741, |
| "grad_norm": 3.004549503326416, |
| "kl": 0.74478175714612, |
| "learning_rate": 2.0871872279444554e-07, |
| "loss": -0.0021, |
| "reward": 1.7010861873626708, |
| "reward_std": 0.25111902356147764, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.6071055889129638, |
| "step": 4030, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0778543038177304, |
| "clip_ratio/high_mean": 0.00988141688721953, |
| "clip_ratio/low_mean": 0.0002543082577176392, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.01013572499359725, |
| "completion_length": 105.63250122070312, |
| "epoch": 0.7761021995965806, |
| "grad_norm": 6.268821716308594, |
| "kl": 0.32837071269750595, |
| "learning_rate": 2.0695431263243512e-07, |
| "loss": -0.0003, |
| "reward": 1.716653084754944, |
| "reward_std": 0.2870768278837204, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6108265280723572, |
| "step": 4040, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.07360692555084825, |
| "clip_ratio/high_mean": 0.009302017895970493, |
| "clip_ratio/low_mean": 0.0003425976261496544, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009644615522120148, |
| "completion_length": 91.73750152587891, |
| "epoch": 0.7780232446450869, |
| "grad_norm": 4.801341533660889, |
| "kl": 13.291237189993263, |
| "learning_rate": 2.052024043644897e-07, |
| "loss": 0.0294, |
| "reward": 1.7232446193695068, |
| "reward_std": 0.24269133806228638, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6134972870349884, |
| "step": 4050, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.012731208954937756, |
| "clip_ratio/high_mean": 0.00180651948612649, |
| "clip_ratio/low_mean": 0.00015854310477152466, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019650626258226112, |
| "completion_length": 92.22000274658203, |
| "epoch": 0.7799442896935933, |
| "grad_norm": 0.6561126112937927, |
| "kl": 0.4966626279056072, |
| "learning_rate": 2.0346306183683254e-07, |
| "loss": 0.0001, |
| "reward": 1.8969059467315674, |
| "reward_std": 0.33292114436626435, |
| "rewards/code_format_reward": 0.9800000071525574, |
| "rewards/code_reward": 0.7034529447555542, |
| "step": 4060, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.015003547444939614, |
| "clip_ratio/high_mean": 0.002088976529194042, |
| "clip_ratio/low_mean": 0.0003269152017310262, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002415891730925068, |
| "completion_length": 88.61250152587891, |
| "epoch": 0.7818653347420997, |
| "grad_norm": 3.062511920928955, |
| "kl": 27.40203034952283, |
| "learning_rate": 2.0173634843774363e-07, |
| "loss": 0.0554, |
| "reward": 1.7011754512786865, |
| "reward_std": 0.3188599109649658, |
| "rewards/code_format_reward": 0.981250011920929, |
| "rewards/code_reward": 0.6052752196788788, |
| "step": 4070, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.006837105128215626, |
| "clip_ratio/high_mean": 0.0008925169277063105, |
| "clip_ratio/low_mean": 0.0005512935545993969, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0014438104728469626, |
| "completion_length": 91.84750213623047, |
| "epoch": 0.7837863797906061, |
| "grad_norm": 3.0254440307617188, |
| "kl": 1.3981286019086838, |
| "learning_rate": 2.0002232709524897e-07, |
| "loss": 0.0033, |
| "reward": 1.6401101350784302, |
| "reward_std": 0.26853239685297015, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.5738050699234009, |
| "step": 4080, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.03983018643921241, |
| "clip_ratio/high_mean": 0.005185264609463047, |
| "clip_ratio/low_mean": 0.0019072047754889355, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007092469278723002, |
| "completion_length": 88.79250030517578, |
| "epoch": 0.7857074248391125, |
| "grad_norm": 2.8119072914123535, |
| "kl": 0.41205914914608, |
| "learning_rate": 1.983210602748279e-07, |
| "loss": -0.0029, |
| "reward": 1.9083050966262818, |
| "reward_std": 0.29446094632148745, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.7085274815559387, |
| "step": 4090, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.013765955006238072, |
| "clip_ratio/high_mean": 0.0018926289907540196, |
| "clip_ratio/low_mean": 0.0033484802523162218, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00524110905098496, |
| "completion_length": 85.72500305175781, |
| "epoch": 0.7876284698876188, |
| "grad_norm": 9.436022758483887, |
| "kl": 0.5864221028983593, |
| "learning_rate": 1.966326099771361e-07, |
| "loss": -0.0013, |
| "reward": 1.8478533029556274, |
| "reward_std": 0.2244624227285385, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6770516157150268, |
| "step": 4100, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.008409230364486575, |
| "clip_ratio/high_mean": 0.0011749810015317052, |
| "clip_ratio/low_mean": 0.00043775633239420133, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001612737326649949, |
| "completion_length": 91.16000213623047, |
| "epoch": 0.7895495149361252, |
| "grad_norm": 6.15724515914917, |
| "kl": 19.288723162561656, |
| "learning_rate": 1.9495703773574628e-07, |
| "loss": 0.0383, |
| "reward": 1.6099607944488525, |
| "reward_std": 0.30300846993923186, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5599803984165191, |
| "step": 4110, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.009142859559506177, |
| "clip_ratio/high_mean": 0.001581054090638645, |
| "clip_ratio/low_mean": 0.0003552554393536411, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019363095459993928, |
| "completion_length": 91.99500122070313, |
| "epoch": 0.7914705599846317, |
| "grad_norm": 6.634824752807617, |
| "kl": 6.53539779484272, |
| "learning_rate": 1.9329440461490576e-07, |
| "loss": 0.0342, |
| "reward": 1.647179627418518, |
| "reward_std": 0.2863168239593506, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5770273089408875, |
| "step": 4120, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.002549535338766873, |
| "clip_ratio/high_mean": 0.0003399143257411197, |
| "clip_ratio/low_mean": 0.00017536718805786223, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.000515281516709365, |
| "completion_length": 90.89250183105469, |
| "epoch": 0.793391605033138, |
| "grad_norm": 2.817605972290039, |
| "kl": 2.417458937317133, |
| "learning_rate": 1.9164477120731066e-07, |
| "loss": 0.0066, |
| "reward": 1.7660948038101196, |
| "reward_std": 0.2769928514957428, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.640859854221344, |
| "step": 4130, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.037738511635689066, |
| "clip_ratio/high_mean": 0.0050403060296957845, |
| "clip_ratio/low_mean": 0.0007518758837250061, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005792181929427898, |
| "completion_length": 96.35750122070313, |
| "epoch": 0.7953126500816444, |
| "grad_norm": 4.240172386169434, |
| "kl": 0.28425633125007155, |
| "learning_rate": 1.900081976318983e-07, |
| "loss": 0.002, |
| "reward": 1.6942025184631349, |
| "reward_std": 0.3146607309579849, |
| "rewards/code_format_reward": 0.9737499833106995, |
| "rewards/code_reward": 0.6036637306213379, |
| "step": 4140, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.005694918753579259, |
| "clip_ratio/high_mean": 0.0007544978521764279, |
| "clip_ratio/low_mean": 0.000571403895446565, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001325901737436652, |
| "completion_length": 91.79500122070313, |
| "epoch": 0.7972336951301509, |
| "grad_norm": 3.9649434089660645, |
| "kl": 0.5314306125044823, |
| "learning_rate": 1.8838474353165547e-07, |
| "loss": -0.0054, |
| "reward": 1.7638010501861572, |
| "reward_std": 0.2793388396501541, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.6362755179405213, |
| "step": 4150, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.07577090607956052, |
| "clip_ratio/high_mean": 0.009897856542374938, |
| "clip_ratio/low_mean": 0.00011142043076688424, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010009276978962589, |
| "completion_length": 94.04000244140624, |
| "epoch": 0.7991547401786572, |
| "grad_norm": 2.2340188026428223, |
| "kl": 0.524626237899065, |
| "learning_rate": 1.8677446807144554e-07, |
| "loss": -0.0045, |
| "reward": 1.7472325563430786, |
| "reward_std": 0.3027869775891304, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.6289287328720092, |
| "step": 4160, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.012572024948894978, |
| "clip_ratio/high_mean": 0.0020916348788887263, |
| "clip_ratio/low_mean": 0.00022255638323258607, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0023141912854043765, |
| "completion_length": 94.53000183105469, |
| "epoch": 0.8010757852271636, |
| "grad_norm": 10.561907768249512, |
| "kl": 2.102495136484504, |
| "learning_rate": 1.8517742993585178e-07, |
| "loss": 0.0137, |
| "reward": 1.7456205368041993, |
| "reward_std": 0.2167625606060028, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.6262477397918701, |
| "step": 4170, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.06855184989399277, |
| "clip_ratio/high_mean": 0.008778795686521335, |
| "clip_ratio/low_mean": 5.122950533404946e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008830025191855384, |
| "completion_length": 101.0425018310547, |
| "epoch": 0.8029968302756699, |
| "grad_norm": 5.673184871673584, |
| "kl": 0.428597304970026, |
| "learning_rate": 1.835936873270389e-07, |
| "loss": -0.0078, |
| "reward": 1.818405318260193, |
| "reward_std": 0.23994216322898865, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6626401782035828, |
| "step": 4180, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.002845590282231569, |
| "clip_ratio/high_mean": 0.0004983038117643446, |
| "clip_ratio/low_mean": 0.00037282529519870876, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008711291156942025, |
| "completion_length": 92.31250305175782, |
| "epoch": 0.8049178753241764, |
| "grad_norm": 6.281589508056641, |
| "kl": 0.4346353754401207, |
| "learning_rate": 1.8202329796263172e-07, |
| "loss": -0.0009, |
| "reward": 1.8768694639205932, |
| "reward_std": 0.21767425537109375, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6918722629547119, |
| "step": 4190, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.003876271191984415, |
| "clip_ratio/high_mean": 0.0004845338989980519, |
| "clip_ratio/low_mean": 0.0001560977878398262, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0006406316824723035, |
| "completion_length": 75.7275016784668, |
| "epoch": 0.8068389203726828, |
| "grad_norm": 1.0423272848129272, |
| "kl": 0.9193772681057453, |
| "learning_rate": 1.8046631907361226e-07, |
| "loss": 0.0041, |
| "reward": 1.8756553649902343, |
| "reward_std": 0.18836807161569596, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.6893901348114013, |
| "step": 4200, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.004609162057749927, |
| "clip_ratio/high_mean": 0.0007380300055956468, |
| "clip_ratio/low_mean": 0.00015018127305665985, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.000888211271376349, |
| "completion_length": 86.45750122070312, |
| "epoch": 0.8087599654211891, |
| "grad_norm": 4.096966743469238, |
| "kl": 0.45643181502819063, |
| "learning_rate": 1.7892280740223303e-07, |
| "loss": -0.004, |
| "reward": 1.5836501359939574, |
| "reward_std": 0.2258547842502594, |
| "rewards/code_format_reward": 0.9799999952316284, |
| "rewards/code_reward": 0.5468250632286071, |
| "step": 4210, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.007562826108187437, |
| "clip_ratio/high_mean": 0.0010166528285481037, |
| "clip_ratio/low_mean": 0.000701455632224679, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0017181085073389112, |
| "completion_length": 90.34250335693359, |
| "epoch": 0.8106810104696955, |
| "grad_norm": 0.29423439502716064, |
| "kl": 0.2636001568287611, |
| "learning_rate": 1.7739281919995045e-07, |
| "loss": 0.0161, |
| "reward": 1.5646157741546631, |
| "reward_std": 0.12648468129336835, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5363703727722168, |
| "step": 4220, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.02073557274416089, |
| "clip_ratio/high_mean": 0.002738419675733894, |
| "clip_ratio/low_mean": 0.001565844019933138, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004304263507947326, |
| "completion_length": 85.92750091552735, |
| "epoch": 0.8126020555182019, |
| "grad_norm": 3.8796801567077637, |
| "kl": 0.6587013073265553, |
| "learning_rate": 1.7587641022537335e-07, |
| "loss": -0.0031, |
| "reward": 1.598485040664673, |
| "reward_std": 0.23664331436157227, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5526800036430359, |
| "step": 4230, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0027086240705102684, |
| "clip_ratio/high_mean": 0.0003605463745770976, |
| "clip_ratio/low_mean": 0.0004666288397856988, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008271752245491371, |
| "completion_length": 86.6875, |
| "epoch": 0.8145231005667083, |
| "grad_norm": 6.270168781280518, |
| "kl": 3.9651204235851765, |
| "learning_rate": 1.7437363574223244e-07, |
| "loss": 0.0141, |
| "reward": 1.8213656187057494, |
| "reward_std": 0.2221561223268509, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.66474529504776, |
| "step": 4240, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.009644670388661325, |
| "clip_ratio/high_mean": 0.0013658979878528044, |
| "clip_ratio/low_mean": 0.0006750999338692055, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002040997930453159, |
| "completion_length": 86.42250061035156, |
| "epoch": 0.8164441456152147, |
| "grad_norm": 4.402440071105957, |
| "kl": 0.27487861886620524, |
| "learning_rate": 1.7288455051736474e-07, |
| "loss": -0.0005, |
| "reward": 1.6581492662429809, |
| "reward_std": 0.14444592781364918, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5825121104717255, |
| "step": 4250, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.017426467640325426, |
| "clip_ratio/high_mean": 0.0023936200188472865, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0023936200188472865, |
| "completion_length": 88.66250152587891, |
| "epoch": 0.818365190663721, |
| "grad_norm": 15.625293731689453, |
| "kl": 0.5453658372163772, |
| "learning_rate": 1.7140920881871927e-07, |
| "loss": 0.0001, |
| "reward": 1.9025921821594238, |
| "reward_std": 0.1951783686876297, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.7037960886955261, |
| "step": 4260, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.024276501801796257, |
| "clip_ratio/high_mean": 0.0037041545001557097, |
| "clip_ratio/low_mean": 0.0004929742426611483, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004197128777741454, |
| "completion_length": 94.525, |
| "epoch": 0.8202862357122275, |
| "grad_norm": 19.728607177734375, |
| "kl": 3.983573118597269, |
| "learning_rate": 1.699476644133778e-07, |
| "loss": 0.0122, |
| "reward": 1.7488954544067383, |
| "reward_std": 0.2558127373456955, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6269477069377899, |
| "step": 4270, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.00913497168221511, |
| "clip_ratio/high_mean": 0.0011987716374278535, |
| "clip_ratio/low_mean": 0.0007998564062290825, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019986280538432767, |
| "completion_length": 87.73999938964843, |
| "epoch": 0.8222072807607338, |
| "grad_norm": 4.567457675933838, |
| "kl": 0.6975361555814743, |
| "learning_rate": 1.6849997056559662e-07, |
| "loss": -0.0116, |
| "reward": 1.7202219009399413, |
| "reward_std": 0.27057143300771713, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.6169859290122985, |
| "step": 4280, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.016106344643048942, |
| "clip_ratio/high_mean": 0.002376156343962066, |
| "clip_ratio/low_mean": 5.540161509998143e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0024315579648828134, |
| "completion_length": 94.21750030517578, |
| "epoch": 0.8241283258092402, |
| "grad_norm": 17.505773544311523, |
| "kl": 1.1381098613142968, |
| "learning_rate": 1.670661800348644e-07, |
| "loss": -0.0006, |
| "reward": 1.7664429664611816, |
| "reward_std": 0.283676877617836, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.6369715094566345, |
| "step": 4290, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.05613061334006488, |
| "clip_ratio/high_mean": 0.0073514855874236675, |
| "clip_ratio/low_mean": 0.00012296391359996052, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0074744494573678825, |
| "completion_length": 94.24500122070313, |
| "epoch": 0.8260493708577467, |
| "grad_norm": 36.729576110839844, |
| "kl": 2.2444246262311935, |
| "learning_rate": 1.656463450739801e-07, |
| "loss": 0.0024, |
| "reward": 1.7431164741516114, |
| "reward_std": 0.29661422967910767, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.6268707036972045, |
| "step": 4300, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.003962649451568723, |
| "clip_ratio/high_mean": 0.0005655559070874006, |
| "clip_ratio/low_mean": 0.00023454214970115573, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008000980655197054, |
| "completion_length": 91.94250030517578, |
| "epoch": 0.827970415906253, |
| "grad_norm": 5.331088066101074, |
| "kl": 0.6558065637946129, |
| "learning_rate": 1.6424051742714851e-07, |
| "loss": 0.0002, |
| "reward": 1.76786208152771, |
| "reward_std": 0.17127570807933806, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.6361185550689697, |
| "step": 4310, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.007816581195220352, |
| "clip_ratio/high_mean": 0.001516599569004029, |
| "clip_ratio/low_mean": 4.7630388871766625e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015642299549654126, |
| "completion_length": 82.7000015258789, |
| "epoch": 0.8298914609547594, |
| "grad_norm": 9.622750282287598, |
| "kl": 0.9509772717952728, |
| "learning_rate": 1.6284874832809436e-07, |
| "loss": 0.0023, |
| "reward": 1.9346927881240845, |
| "reward_std": 0.3074748650193214, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.7189089298248291, |
| "step": 4320, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.02933923137607053, |
| "clip_ratio/high_mean": 0.004640504893905018, |
| "clip_ratio/low_mean": 0.00011013215407729149, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004750637047982309, |
| "completion_length": 88.08000183105469, |
| "epoch": 0.8318125060032657, |
| "grad_norm": 1.8961539268493652, |
| "kl": 1.2761327236890794, |
| "learning_rate": 1.614710884981951e-07, |
| "loss": 0.0002, |
| "reward": 1.5815791606903076, |
| "reward_std": 0.24661691784858703, |
| "rewards/code_format_reward": 0.9862499833106995, |
| "rewards/code_reward": 0.5442270636558533, |
| "step": 4330, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.035589413810521366, |
| "clip_ratio/high_mean": 0.005689902242738754, |
| "clip_ratio/low_mean": 0.00015636042662663384, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005846262606792152, |
| "completion_length": 89.16500244140624, |
| "epoch": 0.8337335510517722, |
| "grad_norm": 1.6006284952163696, |
| "kl": 0.6420656457543373, |
| "learning_rate": 1.6010758814463287e-07, |
| "loss": 0.0027, |
| "reward": 1.643228530883789, |
| "reward_std": 0.2129346549510956, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.5741142451763153, |
| "step": 4340, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.007347659638617188, |
| "clip_ratio/high_mean": 0.001005946182704065, |
| "clip_ratio/low_mean": 0.00028780620195902886, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0012937523904838599, |
| "completion_length": 98.85000152587891, |
| "epoch": 0.8356545961002786, |
| "grad_norm": 5.479083061218262, |
| "kl": 0.3409851986914873, |
| "learning_rate": 1.5875829695856406e-07, |
| "loss": -0.0007, |
| "reward": 1.882705855369568, |
| "reward_std": 0.22037020921707154, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.6938528895378113, |
| "step": 4350, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.032100778096355496, |
| "clip_ratio/high_mean": 0.004409284892608412, |
| "clip_ratio/low_mean": 4.9924499762710184e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004459209358901717, |
| "completion_length": 90.73500213623046, |
| "epoch": 0.8375756411487849, |
| "grad_norm": 56.100852966308594, |
| "kl": 0.22566271349787712, |
| "learning_rate": 1.5742326411330942e-07, |
| "loss": 0.0011, |
| "reward": 1.8064903020858765, |
| "reward_std": 0.1691088706254959, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.65418261885643, |
| "step": 4360, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.005500979837961495, |
| "clip_ratio/high_mean": 0.0008934643206885085, |
| "clip_ratio/low_mean": 0.0005694760067854077, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001462940318742767, |
| "completion_length": 93.03750305175781, |
| "epoch": 0.8394966861972913, |
| "grad_norm": 7.828958034515381, |
| "kl": 0.6565275602042675, |
| "learning_rate": 1.5610253826256036e-07, |
| "loss": 0.003, |
| "reward": 1.7732144832611083, |
| "reward_std": 0.33924323320388794, |
| "rewards/code_format_reward": 0.9825000047683716, |
| "rewards/code_reward": 0.6409822225570678, |
| "step": 4370, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0038046793546527625, |
| "clip_ratio/high_mean": 0.0004755849193315953, |
| "clip_ratio/low_mean": 0.0006387827248545364, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0011143676441861317, |
| "completion_length": 85.95250244140625, |
| "epoch": 0.8414177312457977, |
| "grad_norm": 3.0064802169799805, |
| "kl": 9.46174124404788, |
| "learning_rate": 1.5479616753860792e-07, |
| "loss": 0.0195, |
| "reward": 1.8130270481109618, |
| "reward_std": 0.1679749459028244, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6583885312080383, |
| "step": 4380, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.020394568890333177, |
| "clip_ratio/high_mean": 0.002549321111291647, |
| "clip_ratio/low_mean": 0.0012285682838410138, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003777889395132661, |
| "completion_length": 94.72500305175781, |
| "epoch": 0.8433387762943041, |
| "grad_norm": 8.23426342010498, |
| "kl": 0.3538756832480431, |
| "learning_rate": 1.5350419955058645e-07, |
| "loss": -0.0046, |
| "reward": 1.6075192928314208, |
| "reward_std": 0.16927714347839357, |
| "rewards/code_format_reward": 0.9962499976158142, |
| "rewards/code_reward": 0.5546970963478088, |
| "step": 4390, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.06588131491444074, |
| "clip_ratio/high_mean": 0.009102540424646578, |
| "clip_ratio/low_mean": 0.0007730728961178101, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009875613666372374, |
| "completion_length": 90.04500274658203, |
| "epoch": 0.8452598213428105, |
| "grad_norm": 7.7215776443481445, |
| "kl": 0.2362464390695095, |
| "learning_rate": 1.522266813827407e-07, |
| "loss": 0.0036, |
| "reward": 1.8586368560791016, |
| "reward_std": 0.2194239765405655, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6805683970451355, |
| "step": 4400, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.004174966411665082, |
| "clip_ratio/high_mean": 0.0007423789938911796, |
| "clip_ratio/low_mean": 7.375134955509566e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008161303412634879, |
| "completion_length": 88.90750122070312, |
| "epoch": 0.8471808663913168, |
| "grad_norm": 2.829716920852661, |
| "kl": 1.5581397600471973, |
| "learning_rate": 1.509636595927078e-07, |
| "loss": 0.003, |
| "reward": 1.9052275657653808, |
| "reward_std": 0.256375952064991, |
| "rewards/code_format_reward": 0.9787499904632568, |
| "rewards/code_reward": 0.7079262495040893, |
| "step": 4410, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.07640773041639477, |
| "clip_ratio/high_mean": 0.009898414360941387, |
| "clip_ratio/low_mean": 9.467430354561656e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00999308866157662, |
| "completion_length": 95.14500122070312, |
| "epoch": 0.8491019114398233, |
| "grad_norm": 0.3017069101333618, |
| "kl": 0.8497596487402916, |
| "learning_rate": 1.4971518020982232e-07, |
| "loss": -0.0017, |
| "reward": 1.5574845552444458, |
| "reward_std": 0.1220448928885162, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5321797609329224, |
| "step": 4420, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.02955477687064558, |
| "clip_ratio/high_mean": 0.00414732932113111, |
| "clip_ratio/low_mean": 4.42216987721622e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00419155100826174, |
| "completion_length": 99.7, |
| "epoch": 0.8510229564883296, |
| "grad_norm": 5.942404747009277, |
| "kl": 0.5168043114244938, |
| "learning_rate": 1.4848128873343773e-07, |
| "loss": -0.0003, |
| "reward": 1.6633994817733764, |
| "reward_std": 0.2619109332561493, |
| "rewards/code_format_reward": 0.9762499928474426, |
| "rewards/code_reward": 0.5876372039318085, |
| "step": 4430, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.014663098810706288, |
| "clip_ratio/high_mean": 0.0024809099428239278, |
| "clip_ratio/low_mean": 3.1672295881435276e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002512582238705363, |
| "completion_length": 100.20750122070312, |
| "epoch": 0.852944001536836, |
| "grad_norm": 3.1078836917877197, |
| "kl": 0.39873379915952684, |
| "learning_rate": 1.4726203013126844e-07, |
| "loss": 0.006, |
| "reward": 1.7631917238235473, |
| "reward_std": 0.22433922737836837, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.6350333511829376, |
| "step": 4440, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.014706605696119368, |
| "clip_ratio/high_mean": 0.00257694432802964, |
| "clip_ratio/low_mean": 0.00032811136916279795, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002905055697192438, |
| "completion_length": 99.6875, |
| "epoch": 0.8548650465853425, |
| "grad_norm": 8.708415985107422, |
| "kl": 0.4677444875240326, |
| "learning_rate": 1.4605744883775122e-07, |
| "loss": -0.0036, |
| "reward": 1.8840698957443238, |
| "reward_std": 0.2510286644101143, |
| "rewards/code_format_reward": 0.9850000143051147, |
| "rewards/code_reward": 0.6957849264144897, |
| "step": 4450, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.008664844953455032, |
| "clip_ratio/high_mean": 0.0019024941400857642, |
| "clip_ratio/low_mean": 0.002259151160251349, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004161645277054049, |
| "completion_length": 89.9000015258789, |
| "epoch": 0.8567860916338488, |
| "grad_norm": 7.514847755432129, |
| "kl": 0.3879747323691845, |
| "learning_rate": 1.4486758875242557e-07, |
| "loss": -0.0046, |
| "reward": 1.9147763013839723, |
| "reward_std": 0.2857444554567337, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.7102006316184998, |
| "step": 4460, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.01213214877061546, |
| "clip_ratio/high_mean": 0.0017360628451569937, |
| "clip_ratio/low_mean": 0.0008027118048630655, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002538774654385634, |
| "completion_length": 100.34250183105469, |
| "epoch": 0.8587071366823552, |
| "grad_norm": 4.4957380294799805, |
| "kl": 0.7120470233261585, |
| "learning_rate": 1.436924932383341e-07, |
| "loss": -0.0029, |
| "reward": 1.7210463523864745, |
| "reward_std": 0.348609185218811, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.6164606809616089, |
| "step": 4470, |
| "zero_std_ratio": 0.375 |
| }, |
| { |
| "clip_ratio/high_max": 0.04909939672797918, |
| "clip_ratio/high_mean": 0.006639666750561446, |
| "clip_ratio/low_mean": 0.0001961415633559227, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006835808313917368, |
| "completion_length": 89.65500030517578, |
| "epoch": 0.8606281817308616, |
| "grad_norm": 0.6291245818138123, |
| "kl": 0.914973171055317, |
| "learning_rate": 1.4253220512044194e-07, |
| "loss": 0.0052, |
| "reward": 1.5310453414916991, |
| "reward_std": 0.2040669571608305, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.519272655248642, |
| "step": 4480, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.023789329756982624, |
| "clip_ratio/high_mean": 0.0034216867323266344, |
| "clip_ratio/low_mean": 6.479026051238179e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003486476981197484, |
| "completion_length": 94.68500061035157, |
| "epoch": 0.862549226779368, |
| "grad_norm": 3.6435203552246094, |
| "kl": 0.24871882200241088, |
| "learning_rate": 1.4138676668407637e-07, |
| "loss": -0.004, |
| "reward": 1.7728254079818726, |
| "reward_std": 0.21846108362078667, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6386001646518707, |
| "step": 4490, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.02858473571250215, |
| "clip_ratio/high_mean": 0.004445229801058303, |
| "clip_ratio/low_mean": 0.005347848287783563, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.009793078135407996, |
| "completion_length": 94.14000091552734, |
| "epoch": 0.8644702718278744, |
| "grad_norm": 7.250815391540527, |
| "kl": 1.268965845555067, |
| "learning_rate": 1.402562196733855e-07, |
| "loss": 0.1222, |
| "reward": 1.6482325553894044, |
| "reward_std": 0.321136474609375, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5813037693500519, |
| "step": 4500, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0016861034324392675, |
| "clip_ratio/high_mean": 0.00025714511721162123, |
| "clip_ratio/low_mean": 8.047257215366699e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0003376176857273094, |
| "completion_length": 89.82750244140625, |
| "epoch": 0.8663913168763807, |
| "grad_norm": 1.5697243213653564, |
| "kl": 0.3187939524650574, |
| "learning_rate": 1.3914060528981713e-07, |
| "loss": -0.0008, |
| "reward": 1.6549904108047486, |
| "reward_std": 0.15924324840307236, |
| "rewards/code_format_reward": 0.9912499785423279, |
| "rewards/code_reward": 0.5796826839447021, |
| "step": 4510, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.005441831634379923, |
| "clip_ratio/high_mean": 0.0007462791429134086, |
| "clip_ratio/low_mean": 0.0008039395906962454, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015502187496167607, |
| "completion_length": 97.60250091552734, |
| "epoch": 0.8683123619248871, |
| "grad_norm": 2.864607334136963, |
| "kl": 0.36184127181768416, |
| "learning_rate": 1.38039964190617e-07, |
| "loss": -0.0068, |
| "reward": 1.5000358819961548, |
| "reward_std": 0.22264644205570222, |
| "rewards/code_format_reward": 0.9850000023841858, |
| "rewards/code_reward": 0.5037679553031922, |
| "step": 4520, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.045888486225157975, |
| "clip_ratio/high_mean": 0.006488210440147668, |
| "clip_ratio/low_mean": 4.380840982776135e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006532018849975429, |
| "completion_length": 107.06000061035157, |
| "epoch": 0.8702334069733936, |
| "grad_norm": 3.5723934173583984, |
| "kl": 0.21280892938375473, |
| "learning_rate": 1.369543364873474e-07, |
| "loss": 0.0008, |
| "reward": 1.8976154088974, |
| "reward_std": 0.22375442534685136, |
| "rewards/code_format_reward": 0.9737499952316284, |
| "rewards/code_reward": 0.7053701996803283, |
| "step": 4530, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.021734172268770634, |
| "clip_ratio/high_mean": 0.00285033899708651, |
| "clip_ratio/low_mean": 0.00015430593703058548, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003004644898464903, |
| "completion_length": 90.5125, |
| "epoch": 0.8721544520218999, |
| "grad_norm": 26.33332633972168, |
| "kl": 16.64756402745843, |
| "learning_rate": 1.3588376174442495e-07, |
| "loss": 0.0407, |
| "reward": 1.8465018033981324, |
| "reward_std": 0.26863393038511274, |
| "rewards/code_format_reward": 0.9900000095367432, |
| "rewards/code_reward": 0.6757509171962738, |
| "step": 4540, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.01540006476570852, |
| "clip_ratio/high_mean": 0.00195431642132462, |
| "clip_ratio/low_mean": 0.0003294220077805221, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0022837384400190785, |
| "completion_length": 91.96750183105469, |
| "epoch": 0.8740754970704063, |
| "grad_norm": 5.637061595916748, |
| "kl": 0.5615961387753486, |
| "learning_rate": 1.348282789776792e-07, |
| "loss": 0.0006, |
| "reward": 1.7335857629776001, |
| "reward_std": 0.16677757501602172, |
| "rewards/code_format_reward": 0.9712500095367431, |
| "rewards/code_reward": 0.6239803791046142, |
| "step": 4550, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.012607228197157382, |
| "clip_ratio/high_mean": 0.0017772652208805084, |
| "clip_ratio/low_mean": 0.00020470973395276816, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019819749519228934, |
| "completion_length": 90.42000122070313, |
| "epoch": 0.8759965421189126, |
| "grad_norm": 4.87404727935791, |
| "kl": 0.5051522366702557, |
| "learning_rate": 1.3378792665293032e-07, |
| "loss": -0.0007, |
| "reward": 1.8114176988601685, |
| "reward_std": 0.27143858969211576, |
| "rewards/code_format_reward": 0.9687499880790711, |
| "rewards/code_reward": 0.6635213375091553, |
| "step": 4560, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.003521555650513619, |
| "clip_ratio/high_mean": 0.0005311336179147474, |
| "clip_ratio/low_mean": 0.00039719248161418363, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0009283260951633565, |
| "completion_length": 96.31000061035157, |
| "epoch": 0.8779175871674191, |
| "grad_norm": 3.5294971466064453, |
| "kl": 0.44891551434993743, |
| "learning_rate": 1.3276274268458749e-07, |
| "loss": -0.0011, |
| "reward": 1.8015916109085084, |
| "reward_std": 0.23535949736833572, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.65454580783844, |
| "step": 4570, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.016813984792679548, |
| "clip_ratio/high_mean": 0.0026305554260034115, |
| "clip_ratio/low_mean": 0.00013278115511639044, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0027633365796646105, |
| "completion_length": 92.12000122070313, |
| "epoch": 0.8798386322159255, |
| "grad_norm": 3.3064281940460205, |
| "kl": 147.6638460204005, |
| "learning_rate": 1.3175276443426704e-07, |
| "loss": 0.3018, |
| "reward": 1.8557111263275146, |
| "reward_std": 0.21927002370357512, |
| "rewards/code_format_reward": 0.9924999833106994, |
| "rewards/code_reward": 0.6797305464744567, |
| "step": 4580, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.004338507051579654, |
| "clip_ratio/high_mean": 0.0005835221760207787, |
| "clip_ratio/low_mean": 9.975638386094943e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0006832785555161535, |
| "completion_length": 96.14250183105469, |
| "epoch": 0.8817596772644318, |
| "grad_norm": 5.933443546295166, |
| "kl": 0.7469953082501888, |
| "learning_rate": 1.3075802870943102e-07, |
| "loss": -0.0005, |
| "reward": 1.7140401601791382, |
| "reward_std": 0.32567469477653505, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.6145200908184052, |
| "step": 4590, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.008221420878544449, |
| "clip_ratio/high_mean": 0.0010466745734447613, |
| "clip_ratio/low_mean": 0.00021989296365063638, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0012665675370953978, |
| "completion_length": 96.91000213623047, |
| "epoch": 0.8836807223129383, |
| "grad_norm": 3.6585068702697754, |
| "kl": 0.2884219281375408, |
| "learning_rate": 1.2977857176204554e-07, |
| "loss": -0.0014, |
| "reward": 1.745366358757019, |
| "reward_std": 0.28437634110450744, |
| "rewards/code_format_reward": 0.9612499952316285, |
| "rewards/code_reward": 0.6323706865310669, |
| "step": 4600, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.013106013461947442, |
| "clip_ratio/high_mean": 0.001983167743310332, |
| "clip_ratio/low_mean": 0.0010545071098022162, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0030376748647540806, |
| "completion_length": 95.46000366210937, |
| "epoch": 0.8856017673614446, |
| "grad_norm": 3.166572332382202, |
| "kl": 0.7999920375645161, |
| "learning_rate": 1.2881442928725997e-07, |
| "loss": 0.0024, |
| "reward": 1.7604058027267455, |
| "reward_std": 0.1588110476732254, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6342653870582581, |
| "step": 4610, |
| "zero_std_ratio": 0.725 |
| }, |
| { |
| "clip_ratio/high_max": 0.03971324802841991, |
| "clip_ratio/high_mean": 0.005240282195154577, |
| "clip_ratio/low_mean": 0.00012449334171833472, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005364775560155977, |
| "completion_length": 91.37250213623047, |
| "epoch": 0.887522812409951, |
| "grad_norm": 1.2688926458358765, |
| "kl": 52.058202140033245, |
| "learning_rate": 1.2786563642210536e-07, |
| "loss": 0.1059, |
| "reward": 1.6578764081001283, |
| "reward_std": 0.1922210179269314, |
| "rewards/code_format_reward": 0.9724999904632569, |
| "rewards/code_reward": 0.5858131945133209, |
| "step": 4620, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.014312215382233262, |
| "clip_ratio/high_mean": 0.002295189391588792, |
| "clip_ratio/low_mean": 0.0010927254450507462, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0033879148249980062, |
| "completion_length": 92.44750061035157, |
| "epoch": 0.8894438574584574, |
| "grad_norm": 1.0473991632461548, |
| "kl": 0.48051133900880816, |
| "learning_rate": 1.269322277442151e-07, |
| "loss": 0.0015, |
| "reward": 1.8454564094543457, |
| "reward_std": 0.23949076235294342, |
| "rewards/code_format_reward": 0.9824999809265137, |
| "rewards/code_reward": 0.6771032094955445, |
| "step": 4630, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.040262592025101185, |
| "clip_ratio/high_mean": 0.005372756696306169, |
| "clip_ratio/low_mean": 0.0007898360927356407, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006162592757027597, |
| "completion_length": 84.6050018310547, |
| "epoch": 0.8913649025069638, |
| "grad_norm": 6.553028106689453, |
| "kl": 0.6895815744996071, |
| "learning_rate": 1.2601423727056346e-07, |
| "loss": -0.0001, |
| "reward": 1.6561978340148926, |
| "reward_std": 0.36703028678894045, |
| "rewards/code_format_reward": 0.975, |
| "rewards/code_reward": 0.5843489110469818, |
| "step": 4640, |
| "zero_std_ratio": 0.325 |
| }, |
| { |
| "clip_ratio/high_max": 0.06538669131696224, |
| "clip_ratio/high_mean": 0.009138646663632243, |
| "clip_ratio/low_mean": 0.0017342485502013006, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010872895480133593, |
| "completion_length": 88.39750061035156, |
| "epoch": 0.8932859475554702, |
| "grad_norm": 4.167427062988281, |
| "kl": 1.728559673577547, |
| "learning_rate": 1.2511169845622699e-07, |
| "loss": 0.0019, |
| "reward": 1.6277015209197998, |
| "reward_std": 0.21625073552131652, |
| "rewards/code_format_reward": 0.975000011920929, |
| "rewards/code_reward": 0.5701007604598999, |
| "step": 4650, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.042488472175318745, |
| "clip_ratio/high_mean": 0.005791870540997479, |
| "clip_ratio/low_mean": 2.0525451691355557e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005812396005785559, |
| "completion_length": 92.57250213623047, |
| "epoch": 0.8952069926039765, |
| "grad_norm": 6.026858806610107, |
| "kl": 0.7588046140968799, |
| "learning_rate": 1.2422464419316432e-07, |
| "loss": 0.0034, |
| "reward": 1.7008742094039917, |
| "reward_std": 0.27438378930091856, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.6079370617866516, |
| "step": 4660, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.013316971366293728, |
| "clip_ratio/high_mean": 0.0019765587523579596, |
| "clip_ratio/low_mean": 8.563735173083842e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002062196109909564, |
| "completion_length": 93.35000305175781, |
| "epoch": 0.897128037652483, |
| "grad_norm": 4.863064765930176, |
| "kl": 7.6627843722701074, |
| "learning_rate": 1.233531068090184e-07, |
| "loss": 0.011, |
| "reward": 1.8806322813034058, |
| "reward_std": 0.28162118047475815, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.6931286633014679, |
| "step": 4670, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.004586372757330537, |
| "clip_ratio/high_mean": 0.0006299379543634132, |
| "clip_ratio/low_mean": 1.7313018906861545e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0006472509849118069, |
| "completion_length": 92.36500091552735, |
| "epoch": 0.8990490827009894, |
| "grad_norm": 2.1931703090667725, |
| "kl": 0.2519014351069927, |
| "learning_rate": 1.2249711806593762e-07, |
| "loss": 0.0034, |
| "reward": 1.8040930509567261, |
| "reward_std": 0.24223610311746596, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.6561090111732483, |
| "step": 4680, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.006883417209610343, |
| "clip_ratio/high_mean": 0.0009808192204218357, |
| "clip_ratio/low_mean": 0.00029269491278682835, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001273514133208664, |
| "completion_length": 91.60500030517578, |
| "epoch": 0.9009701277494957, |
| "grad_norm": 21.0294132232666, |
| "kl": 0.25964570268988607, |
| "learning_rate": 1.2165670915941866e-07, |
| "loss": -0.0043, |
| "reward": 1.9244711637496947, |
| "reward_std": 0.1629927098751068, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.7156730651855469, |
| "step": 4690, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.010230390657670795, |
| "clip_ratio/high_mean": 0.0014585633180104196, |
| "clip_ratio/low_mean": 3.0266345129348336e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001488829671870917, |
| "completion_length": 87.84500274658203, |
| "epoch": 0.9028911727980021, |
| "grad_norm": 1.7218002080917358, |
| "kl": 16.447690600901844, |
| "learning_rate": 1.2083191071716937e-07, |
| "loss": 0.0339, |
| "reward": 1.940086579322815, |
| "reward_std": 0.16455088555812836, |
| "rewards/code_format_reward": 0.993749988079071, |
| "rewards/code_reward": 0.7216057777404785, |
| "step": 4700, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.025706328079104425, |
| "clip_ratio/high_mean": 0.003573437442537397, |
| "clip_ratio/low_mean": 3.392130311112851e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0036073587427381424, |
| "completion_length": 81.84500274658203, |
| "epoch": 0.9048122178465084, |
| "grad_norm": 0.22543705999851227, |
| "kl": 0.31741214692592623, |
| "learning_rate": 1.2002275279799288e-07, |
| "loss": -0.0056, |
| "reward": 1.8292718410491944, |
| "reward_std": 0.12828939855098725, |
| "rewards/code_format_reward": 0.9987499952316284, |
| "rewards/code_reward": 0.6649484276771546, |
| "step": 4710, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.009241180948447437, |
| "clip_ratio/high_mean": 0.0013442957555525937, |
| "clip_ratio/low_mean": 4.643963038688526e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0013907353932154365, |
| "completion_length": 95.63500213623047, |
| "epoch": 0.9067332628950149, |
| "grad_norm": 5.23514986038208, |
| "kl": 0.804936108738184, |
| "learning_rate": 1.192292648906918e-07, |
| "loss": 0.0031, |
| "reward": 1.925449275970459, |
| "reward_std": 0.2213977299630642, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.7152246475219727, |
| "step": 4720, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.021669640118489042, |
| "clip_ratio/high_mean": 0.003889294656983111, |
| "clip_ratio/low_mean": 0.00044275675172684715, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004332051414530724, |
| "completion_length": 92.25250244140625, |
| "epoch": 0.9086543079435213, |
| "grad_norm": 66.00515747070312, |
| "kl": 2.1086502872407435, |
| "learning_rate": 1.1845147591299378e-07, |
| "loss": 0.0162, |
| "reward": 1.5327723979949952, |
| "reward_std": 0.2872114762663841, |
| "rewards/code_format_reward": 0.9725000023841858, |
| "rewards/code_reward": 0.5232611894607544, |
| "step": 4730, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.006079713994404301, |
| "clip_ratio/high_mean": 0.0010439059922646265, |
| "clip_ratio/low_mean": 0.0013928397551353556, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002436745767045068, |
| "completion_length": 97.43000030517578, |
| "epoch": 0.9105753529920276, |
| "grad_norm": 2.8770546913146973, |
| "kl": 3.1866038836538793, |
| "learning_rate": 1.1768941421049768e-07, |
| "loss": 0.0069, |
| "reward": 1.7776832818984984, |
| "reward_std": 0.29561240673065187, |
| "rewards/code_format_reward": 0.9949999928474427, |
| "rewards/code_reward": 0.6400915861129761, |
| "step": 4740, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.005499497149139642, |
| "clip_ratio/high_mean": 0.0006874371436424553, |
| "clip_ratio/low_mean": 0.000482194940559566, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0011696320783812554, |
| "completion_length": 88.9500015258789, |
| "epoch": 0.9124963980405341, |
| "grad_norm": 8.540057182312012, |
| "kl": 0.9072364956140518, |
| "learning_rate": 1.1694310755564014e-07, |
| "loss": -0.0021, |
| "reward": 1.6791202545166015, |
| "reward_std": 0.326928648352623, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.595185148715973, |
| "step": 4750, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.008684736292343587, |
| "clip_ratio/high_mean": 0.001148225087672472, |
| "clip_ratio/low_mean": 0.0005504319502506405, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0016986570524750277, |
| "completion_length": 95.68250122070313, |
| "epoch": 0.9144174430890404, |
| "grad_norm": 4.539205551147461, |
| "kl": 0.860013198107481, |
| "learning_rate": 1.1621258314668402e-07, |
| "loss": 0.0, |
| "reward": 1.7214089155197143, |
| "reward_std": 0.1847836285829544, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.6188294351100921, |
| "step": 4760, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.012012088089250028, |
| "clip_ratio/high_mean": 0.0020424059097422288, |
| "clip_ratio/low_mean": 7.006222731433808e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0021124681399669496, |
| "completion_length": 93.21750183105469, |
| "epoch": 0.9163384881375468, |
| "grad_norm": 6.60590124130249, |
| "kl": 0.45315413996577264, |
| "learning_rate": 1.1549786760672676e-07, |
| "loss": -0.0013, |
| "reward": 1.7664082288742065, |
| "reward_std": 0.24015129953622819, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.6369540929794312, |
| "step": 4770, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.028257530624978246, |
| "clip_ratio/high_mean": 0.00573968501703348, |
| "clip_ratio/low_mean": 0.00028595919138751924, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006025644272449426, |
| "completion_length": 93.59000244140626, |
| "epoch": 0.9182595331860532, |
| "grad_norm": 3.693448781967163, |
| "kl": 0.5863466400653123, |
| "learning_rate": 1.1479898698273037e-07, |
| "loss": 0.0001, |
| "reward": 1.7522862911224366, |
| "reward_std": 0.24038469642400742, |
| "rewards/code_format_reward": 0.9762500047683715, |
| "rewards/code_reward": 0.6320806205272674, |
| "step": 4780, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.006044295988976956, |
| "clip_ratio/high_mean": 0.0008545084856450558, |
| "clip_ratio/low_mean": 0.0004956311546266079, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0013501396053470672, |
| "completion_length": 100.22000122070312, |
| "epoch": 0.9201805782345596, |
| "grad_norm": 17.894886016845703, |
| "kl": 0.33935268595814705, |
| "learning_rate": 1.1411596674457193e-07, |
| "loss": -0.0019, |
| "reward": 1.697510004043579, |
| "reward_std": 0.16087576895952224, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6018799901008606, |
| "step": 4790, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.005959878279827535, |
| "clip_ratio/high_mean": 0.0009170519857434556, |
| "clip_ratio/low_mean": 7.972503372002392e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0009967770281946286, |
| "completion_length": 98.54750213623046, |
| "epoch": 0.922101623283066, |
| "grad_norm": 3.242460250854492, |
| "kl": 0.46977903619408606, |
| "learning_rate": 1.1344883178411565e-07, |
| "loss": -0.0036, |
| "reward": 1.7927821159362793, |
| "reward_std": 0.24044746458530425, |
| "rewards/code_format_reward": 0.9699999809265136, |
| "rewards/code_reward": 0.6538910627365112, |
| "step": 4800, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.00756604690104723, |
| "clip_ratio/high_mean": 0.0010185762541368604, |
| "clip_ratio/low_mean": 0.00016034738000598737, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001178923639236018, |
| "completion_length": 99.49000091552735, |
| "epoch": 0.9240226683315724, |
| "grad_norm": 7.225472927093506, |
| "kl": 0.2285786397755146, |
| "learning_rate": 1.1279760641430568e-07, |
| "loss": 0.0001, |
| "reward": 1.7233760595321654, |
| "reward_std": 0.22306990921497344, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6148130118846893, |
| "step": 4810, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.011060118256136776, |
| "clip_ratio/high_mean": 0.0017047788191121072, |
| "clip_ratio/low_mean": 0.00028281604463700207, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019875948812114073, |
| "completion_length": 92.36500091552735, |
| "epoch": 0.9259437133800787, |
| "grad_norm": 4.390386581420898, |
| "kl": 0.8032988727092742, |
| "learning_rate": 1.1216231436827974e-07, |
| "loss": 0.0005, |
| "reward": 1.7829072952270508, |
| "reward_std": 0.21434771865606309, |
| "rewards/code_format_reward": 0.9862500071525574, |
| "rewards/code_reward": 0.6448911607265473, |
| "step": 4820, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.013947398256277665, |
| "clip_ratio/high_mean": 0.0018392194229818414, |
| "clip_ratio/low_mean": 0.0004060613617184572, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002245280790521065, |
| "completion_length": 103.5125015258789, |
| "epoch": 0.9278647584285852, |
| "grad_norm": 6.774899482727051, |
| "kl": 0.34710453301668165, |
| "learning_rate": 1.1154297879850462e-07, |
| "loss": 0.0003, |
| "reward": 1.7023445606231689, |
| "reward_std": 0.23593612909317016, |
| "rewards/code_format_reward": 0.96875, |
| "rewards/code_reward": 0.60898477435112, |
| "step": 4830, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.00909699429757893, |
| "clip_ratio/high_mean": 0.0014705892943311482, |
| "clip_ratio/low_mean": 0.0004355724740889855, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019061617698753253, |
| "completion_length": 91.47000274658203, |
| "epoch": 0.9297858034770915, |
| "grad_norm": 1.7924318313598633, |
| "kl": 0.5235365644097328, |
| "learning_rate": 1.1093962227593214e-07, |
| "loss": 0.0017, |
| "reward": 1.823938512802124, |
| "reward_std": 0.18318418860435487, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.6650941967964172, |
| "step": 4840, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.008868952537886799, |
| "clip_ratio/high_mean": 0.0013198618631577118, |
| "clip_ratio/low_mean": 6.896776030771435e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0013888296205550432, |
| "completion_length": 97.04750061035156, |
| "epoch": 0.9317068485255979, |
| "grad_norm": 5.492427825927734, |
| "kl": 0.27957614585757257, |
| "learning_rate": 1.1035226678917662e-07, |
| "loss": 0.0001, |
| "reward": 1.7743586778640748, |
| "reward_std": 0.19067177027463914, |
| "rewards/code_format_reward": 0.9699999928474426, |
| "rewards/code_reward": 0.6446793019771576, |
| "step": 4850, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.00021865542512387038, |
| "clip_ratio/high_mean": 2.7331928140483797e-05, |
| "clip_ratio/low_mean": 0.00022580694640055298, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0002531388745410368, |
| "completion_length": 91.65750274658203, |
| "epoch": 0.9336278935741044, |
| "grad_norm": 8.045164108276367, |
| "kl": 0.20759812816977502, |
| "learning_rate": 1.0978093374371373e-07, |
| "loss": -0.0004, |
| "reward": 1.7663999795913696, |
| "reward_std": 0.281513449549675, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.6353874802589417, |
| "step": 4860, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.02832627217285335, |
| "clip_ratio/high_mean": 0.0035600741393864155, |
| "clip_ratio/low_mean": 0.00011176664993399754, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0036718408693559466, |
| "completion_length": 84.46500244140626, |
| "epoch": 0.9355489386226107, |
| "grad_norm": 4.819484233856201, |
| "kl": 0.5664212189614772, |
| "learning_rate": 1.0922564396109993e-07, |
| "loss": -0.0008, |
| "reward": 1.7755849838256836, |
| "reward_std": 0.20761601328849794, |
| "rewards/code_format_reward": 0.9899999856948852, |
| "rewards/code_reward": 0.640292489528656, |
| "step": 4870, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.006872700434178114, |
| "clip_ratio/high_mean": 0.0009693403088022023, |
| "clip_ratio/low_mean": 3.415665923967026e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0010034969680418726, |
| "completion_length": 92.47500152587891, |
| "epoch": 0.9374699836711171, |
| "grad_norm": 2.605060338973999, |
| "kl": 0.6489929877221584, |
| "learning_rate": 1.0868641767821432e-07, |
| "loss": -0.0041, |
| "reward": 1.9151075601577758, |
| "reward_std": 0.2566168040037155, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.7113037467002868, |
| "step": 4880, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.018258474441245197, |
| "clip_ratio/high_mean": 0.003355332469800487, |
| "clip_ratio/low_mean": 0.000607103164657019, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003962435649009421, |
| "completion_length": 90.51000366210937, |
| "epoch": 0.9393910287196234, |
| "grad_norm": 4.408846378326416, |
| "kl": 0.35625301077961924, |
| "learning_rate": 1.0816327454652044e-07, |
| "loss": -0.0018, |
| "reward": 1.7154739379882813, |
| "reward_std": 0.2987362504005432, |
| "rewards/code_format_reward": 0.9612499952316285, |
| "rewards/code_reward": 0.6174244284629822, |
| "step": 4890, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.010325380798894912, |
| "clip_ratio/high_mean": 0.0015298718310077675, |
| "clip_ratio/low_mean": 0.000294900168228196, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0018247719475766645, |
| "completion_length": 100.19250183105468, |
| "epoch": 0.9413120737681299, |
| "grad_norm": 9.08279037475586, |
| "kl": 0.23486268445849418, |
| "learning_rate": 1.0765623363135061e-07, |
| "loss": -0.0011, |
| "reward": 1.5800267338752747, |
| "reward_std": 0.26311944872140886, |
| "rewards/code_format_reward": 0.9862499952316284, |
| "rewards/code_reward": 0.5434508502483368, |
| "step": 4900, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.004708675656002015, |
| "clip_ratio/high_mean": 0.0008911975004593842, |
| "clip_ratio/low_mean": 0.0001348661200609058, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001026063623430673, |
| "completion_length": 85.6300033569336, |
| "epoch": 0.9432331188166363, |
| "grad_norm": 2.5798628330230713, |
| "kl": 0.5353534445166588, |
| "learning_rate": 1.071653134112109e-07, |
| "loss": -0.0018, |
| "reward": 1.7293733358383179, |
| "reward_std": 0.23426424115896224, |
| "rewards/code_format_reward": 0.9862499833106995, |
| "rewards/code_reward": 0.6181241631507873, |
| "step": 4910, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.031931064534001054, |
| "clip_ratio/high_mean": 0.004416047394624911, |
| "clip_ratio/low_mean": 0.00037934551510261373, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0047953929373761636, |
| "completion_length": 93.08500061035156, |
| "epoch": 0.9451541638651426, |
| "grad_norm": 3.0407347679138184, |
| "kl": 0.3617399115115404, |
| "learning_rate": 1.0669053177710766e-07, |
| "loss": -0.0023, |
| "reward": 1.602178120613098, |
| "reward_std": 0.23843889832496643, |
| "rewards/code_format_reward": 0.987499988079071, |
| "rewards/code_reward": 0.5542140543460846, |
| "step": 4920, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.008546069997828453, |
| "clip_ratio/high_mean": 0.0011838132908451372, |
| "clip_ratio/low_mean": 6.596306338906288e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0012497763542341999, |
| "completion_length": 102.79500122070313, |
| "epoch": 0.947075208913649, |
| "grad_norm": 5.987438678741455, |
| "kl": 0.28761252388358116, |
| "learning_rate": 1.0623190603189566e-07, |
| "loss": 0.0011, |
| "reward": 1.5471005201339723, |
| "reward_std": 0.28855718672275543, |
| "rewards/code_format_reward": 0.9674999952316284, |
| "rewards/code_reward": 0.5316752552986145, |
| "step": 4930, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.03786106104962528, |
| "clip_ratio/high_mean": 0.005264365172479302, |
| "clip_ratio/low_mean": 0.001157468621386215, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00642183352902066, |
| "completion_length": 95.2625015258789, |
| "epoch": 0.9489962539621554, |
| "grad_norm": 4.088647842407227, |
| "kl": 9114.393886435031, |
| "learning_rate": 1.0578945288964734e-07, |
| "loss": 18.226, |
| "reward": 1.5625978589057923, |
| "reward_std": 0.22688832581043245, |
| "rewards/code_format_reward": 0.9762499928474426, |
| "rewards/code_reward": 0.5372364044189453, |
| "step": 4940, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.009068883489817381, |
| "clip_ratio/high_mean": 0.0015044378931634128, |
| "clip_ratio/low_mean": 8.003948896657675e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015844773850403726, |
| "completion_length": 88.91999969482421, |
| "epoch": 0.9509172990106618, |
| "grad_norm": 4.558302879333496, |
| "kl": 0.322134206071496, |
| "learning_rate": 1.0536318847504383e-07, |
| "loss": 0.0008, |
| "reward": 1.683999252319336, |
| "reward_std": 0.15837213546037673, |
| "rewards/code_format_reward": 0.9887500047683716, |
| "rewards/code_reward": 0.5948120951652527, |
| "step": 4950, |
| "zero_std_ratio": 0.65 |
| }, |
| { |
| "clip_ratio/high_max": 0.004135725944070146, |
| "clip_ratio/high_mean": 0.0006349837080051657, |
| "clip_ratio/low_mean": 0.0002028582151979208, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008378419290238526, |
| "completion_length": 88.58000030517579, |
| "epoch": 0.9528383440591682, |
| "grad_norm": 1.3143569231033325, |
| "kl": 0.32492467686533927, |
| "learning_rate": 1.0495312832278721e-07, |
| "loss": 0.001, |
| "reward": 1.757376217842102, |
| "reward_std": 0.18446292728185654, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.6318130671977997, |
| "step": 4960, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.004577422246802599, |
| "clip_ratio/high_mean": 0.00067823924619006, |
| "clip_ratio/low_mean": 0.00020590101485140622, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008841402595862746, |
| "completion_length": 91.77249908447266, |
| "epoch": 0.9547593891076745, |
| "grad_norm": 2.7616970539093018, |
| "kl": 0.6282597549259663, |
| "learning_rate": 1.0455928737703441e-07, |
| "loss": 0.0001, |
| "reward": 1.665701198577881, |
| "reward_std": 0.1566584974527359, |
| "rewards/code_format_reward": 0.99375, |
| "rewards/code_reward": 0.5844130754470825, |
| "step": 4970, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.012442531622946262, |
| "clip_ratio/high_mean": 0.0018589732819236815, |
| "clip_ratio/low_mean": 6.720430101267994e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019261775829363613, |
| "completion_length": 90.92250213623046, |
| "epoch": 0.956680434156181, |
| "grad_norm": 2.84462308883667, |
| "kl": 0.3018207371234894, |
| "learning_rate": 1.0418167999085259e-07, |
| "loss": 0.0041, |
| "reward": 1.7472755432128906, |
| "reward_std": 0.24319706559181214, |
| "rewards/code_format_reward": 0.9774999856948853, |
| "rewards/code_reward": 0.6292627692222595, |
| "step": 4980, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.01709002295974642, |
| "clip_ratio/high_mean": 0.002679444645764306, |
| "clip_ratio/low_mean": 0.0003698979213368148, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003049342567101121, |
| "completion_length": 92.97750091552734, |
| "epoch": 0.9586014792046873, |
| "grad_norm": 11.978320121765137, |
| "kl": 1.2294385731220245, |
| "learning_rate": 1.0382031992569592e-07, |
| "loss": 0.0036, |
| "reward": 1.739167046546936, |
| "reward_std": 0.29275294244289396, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.622708535194397, |
| "step": 4990, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.007942511793226003, |
| "clip_ratio/high_mean": 0.001185902243014425, |
| "clip_ratio/low_mean": 5.571418441832066e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0012416164390742779, |
| "completion_length": 93.31250305175782, |
| "epoch": 0.9605225242531937, |
| "grad_norm": 3.364788055419922, |
| "kl": 0.35085868686437605, |
| "learning_rate": 1.0347522035090446e-07, |
| "loss": -0.0003, |
| "reward": 1.9564055442810058, |
| "reward_std": 0.2229623466730118, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.7303902268409729, |
| "step": 5000, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.015932422177866102, |
| "clip_ratio/high_mean": 0.0028564550855662675, |
| "clip_ratio/low_mean": 0.00020086783915758134, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003057322936365381, |
| "completion_length": 96.12750091552735, |
| "epoch": 0.9624435693017002, |
| "grad_norm": 5.283419609069824, |
| "kl": 0.3115640334784985, |
| "learning_rate": 1.0314639384322356e-07, |
| "loss": -0.0037, |
| "reward": 1.6293291807174684, |
| "reward_std": 0.2581008836627007, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.5693520545959473, |
| "step": 5010, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.003986756759695708, |
| "clip_ratio/high_mean": 0.0006319725507637486, |
| "clip_ratio/low_mean": 0.000603693921584636, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0012356664577964694, |
| "completion_length": 86.04750366210938, |
| "epoch": 0.9643646143502065, |
| "grad_norm": 8.70874309539795, |
| "kl": 0.47548493221402166, |
| "learning_rate": 1.0283385238634632e-07, |
| "loss": 0.0041, |
| "reward": 1.622909712791443, |
| "reward_std": 0.2179076835513115, |
| "rewards/code_format_reward": 0.9712499976158142, |
| "rewards/code_reward": 0.5686423420906067, |
| "step": 5020, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.005893218703567982, |
| "clip_ratio/high_mean": 0.000819433806464076, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.000819433806464076, |
| "completion_length": 88.9375015258789, |
| "epoch": 0.9662856593987129, |
| "grad_norm": 6.6337199211120605, |
| "kl": 0.5933880299329758, |
| "learning_rate": 1.0253760737047606e-07, |
| "loss": -0.0043, |
| "reward": 1.7307970523834229, |
| "reward_std": 0.1557233951985836, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6172735095024109, |
| "step": 5030, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.010898534208536148, |
| "clip_ratio/high_mean": 0.0015226851450279356, |
| "clip_ratio/low_mean": 0.009100449224933981, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010623134509660303, |
| "completion_length": 87.65500183105469, |
| "epoch": 0.9682067044472192, |
| "grad_norm": 12.837902069091797, |
| "kl": 0.1521947119385004, |
| "learning_rate": 1.0225766959191187e-07, |
| "loss": 0.0007, |
| "reward": 1.766017746925354, |
| "reward_std": 0.1697022169828415, |
| "rewards/code_format_reward": 0.9924999952316285, |
| "rewards/code_reward": 0.6348838567733764, |
| "step": 5040, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.039930257271043955, |
| "clip_ratio/high_mean": 0.005228024450480007, |
| "clip_ratio/low_mean": 0.0012023555900668725, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006430380133679137, |
| "completion_length": 99.63500213623047, |
| "epoch": 0.9701277494957257, |
| "grad_norm": 3.0135834217071533, |
| "kl": 0.5389343507587909, |
| "learning_rate": 1.0199404925265473e-07, |
| "loss": -0.0011, |
| "reward": 1.5655887126922607, |
| "reward_std": 0.1425598829984665, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.5365443468093872, |
| "step": 5050, |
| "zero_std_ratio": 0.575 |
| }, |
| { |
| "clip_ratio/high_max": 0.013292990019544959, |
| "clip_ratio/high_mean": 0.0019570814620237797, |
| "clip_ratio/low_mean": 0.0004139953598496504, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0023710768204182387, |
| "completion_length": 92.15750122070312, |
| "epoch": 0.9720487945442321, |
| "grad_norm": 8.622629165649414, |
| "kl": 0.3708019584417343, |
| "learning_rate": 1.0174675596003588e-07, |
| "loss": -0.0037, |
| "reward": 1.6285043001174926, |
| "reward_std": 0.21171441301703453, |
| "rewards/code_format_reward": 0.9675000071525574, |
| "rewards/code_reward": 0.5723771452903748, |
| "step": 5060, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.011086594103835523, |
| "clip_ratio/high_mean": 0.001482552892412059, |
| "clip_ratio/low_mean": 7.31003499822691e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.001555653239483945, |
| "completion_length": 92.72000122070312, |
| "epoch": 0.9739698395927384, |
| "grad_norm": 10.519503593444824, |
| "kl": 0.42225370053201916, |
| "learning_rate": 1.0151579872636673e-07, |
| "loss": 0.0073, |
| "reward": 1.9428821086883545, |
| "reward_std": 0.2824172407388687, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.7261285543441772, |
| "step": 5070, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.02070889645256102, |
| "clip_ratio/high_mean": 0.0035216436022892593, |
| "clip_ratio/low_mean": 0.0003085655207542004, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0038302090688375756, |
| "completion_length": 105.0050048828125, |
| "epoch": 0.9758908846412448, |
| "grad_norm": 4.139841079711914, |
| "kl": 0.3159520372748375, |
| "learning_rate": 1.0130118596861028e-07, |
| "loss": -0.0044, |
| "reward": 1.6708447217941285, |
| "reward_std": 0.30501508712768555, |
| "rewards/code_format_reward": 0.9837499976158142, |
| "rewards/code_reward": 0.5894848227500915, |
| "step": 5080, |
| "zero_std_ratio": 0.4 |
| }, |
| { |
| "clip_ratio/high_max": 0.008058706868905575, |
| "clip_ratio/high_mean": 0.0012073565638274885, |
| "clip_ratio/low_mean": 0.00031636476196581496, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015237213257933036, |
| "completion_length": 84.28750152587891, |
| "epoch": 0.9778119296897512, |
| "grad_norm": 4.015879154205322, |
| "kl": 0.2918614260852337, |
| "learning_rate": 1.0110292550807451e-07, |
| "loss": -0.0012, |
| "reward": 1.7721335172653199, |
| "reward_std": 0.286711610853672, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.6385667800903321, |
| "step": 5090, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.019471552316099407, |
| "clip_ratio/high_mean": 0.0026317643467336895, |
| "clip_ratio/low_mean": 0.0003316317946882918, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0029633961850777267, |
| "completion_length": 90.81000213623047, |
| "epoch": 0.9797329747382576, |
| "grad_norm": 1.132954716682434, |
| "kl": 0.2704964060336351, |
| "learning_rate": 1.0092102457012717e-07, |
| "loss": -0.0022, |
| "reward": 1.6570582151412965, |
| "reward_std": 0.21210518777370452, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5810291051864624, |
| "step": 5100, |
| "zero_std_ratio": 0.5 |
| }, |
| { |
| "clip_ratio/high_max": 0.011018617497757077, |
| "clip_ratio/high_mean": 0.0013860319217201323, |
| "clip_ratio/low_mean": 3.4722223062999547e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0014207541418727488, |
| "completion_length": 93.61250305175781, |
| "epoch": 0.981654019786764, |
| "grad_norm": 16.08737564086914, |
| "kl": 0.26382347345352175, |
| "learning_rate": 1.0075548978393277e-07, |
| "loss": -0.0002, |
| "reward": 1.8070130348205566, |
| "reward_std": 0.1673865035176277, |
| "rewards/code_format_reward": 0.9912500023841858, |
| "rewards/code_reward": 0.6556940078735352, |
| "step": 5110, |
| "zero_std_ratio": 0.625 |
| }, |
| { |
| "clip_ratio/high_max": 0.010004310857038946, |
| "clip_ratio/high_mean": 0.0012777127660228871, |
| "clip_ratio/low_mean": 0.0003342236072057858, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0016119363936013542, |
| "completion_length": 89.3, |
| "epoch": 0.9835750648352704, |
| "grad_norm": 0.4556010961532593, |
| "kl": 0.4934497371315956, |
| "learning_rate": 1.0060632718221066e-07, |
| "loss": 0.0026, |
| "reward": 1.3408710062503815, |
| "reward_std": 0.16168890111148357, |
| "rewards/code_format_reward": 0.9875, |
| "rewards/code_reward": 0.42356050610542295, |
| "step": 5120, |
| "zero_std_ratio": 0.7 |
| }, |
| { |
| "clip_ratio/high_max": 0.05311971204355359, |
| "clip_ratio/high_mean": 0.0075716287479735914, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0075716287479735914, |
| "completion_length": 102.92249908447266, |
| "epoch": 0.9854961098837768, |
| "grad_norm": 3.9305222034454346, |
| "kl": 0.27781638093292715, |
| "learning_rate": 1.0047354220101518e-07, |
| "loss": -0.0011, |
| "reward": 1.630450439453125, |
| "reward_std": 0.18297318816185, |
| "rewards/code_format_reward": 0.9887499928474426, |
| "rewards/code_reward": 0.5680376827716828, |
| "step": 5130, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.004692732833791524, |
| "clip_ratio/high_mean": 0.0006299943852354772, |
| "clip_ratio/low_mean": 0.00031122941145440565, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0009412237734068186, |
| "completion_length": 88.23250122070313, |
| "epoch": 0.9874171549322832, |
| "grad_norm": 4.31157112121582, |
| "kl": 0.2751577727496624, |
| "learning_rate": 1.0035713967953797e-07, |
| "loss": -0.0038, |
| "reward": 1.635274839401245, |
| "reward_std": 0.29494107216596605, |
| "rewards/code_format_reward": 0.9849999904632568, |
| "rewards/code_reward": 0.5713874340057373, |
| "step": 5140, |
| "zero_std_ratio": 0.45 |
| }, |
| { |
| "clip_ratio/high_max": 0.012987824180163443, |
| "clip_ratio/high_mean": 0.0019960356265073644, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0019960356265073644, |
| "completion_length": 86.49750061035157, |
| "epoch": 0.9893381999807895, |
| "grad_norm": 7.45393705368042, |
| "kl": 0.3619408316910267, |
| "learning_rate": 1.0025712385993115e-07, |
| "loss": 0.0012, |
| "reward": 1.687432312965393, |
| "reward_std": 0.2386924833059311, |
| "rewards/code_format_reward": 0.9912499904632568, |
| "rewards/code_reward": 0.5959036707878113, |
| "step": 5150, |
| "zero_std_ratio": 0.475 |
| }, |
| { |
| "clip_ratio/high_max": 0.014351918507600203, |
| "clip_ratio/high_mean": 0.002000216278975131, |
| "clip_ratio/low_mean": 7.898250914877281e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0020791988128621595, |
| "completion_length": 89.34250030517578, |
| "epoch": 0.991259245029296, |
| "grad_norm": 35.51432418823242, |
| "kl": 0.2617587223649025, |
| "learning_rate": 1.0017349838715278e-07, |
| "loss": -0.004, |
| "reward": 1.2408424496650696, |
| "reward_std": 0.21315770447254181, |
| "rewards/code_format_reward": 0.9774999976158142, |
| "rewards/code_reward": 0.3760462045669556, |
| "step": 5160, |
| "zero_std_ratio": 0.525 |
| }, |
| { |
| "clip_ratio/high_max": 0.003973034140653908, |
| "clip_ratio/high_mean": 0.0004966292675817385, |
| "clip_ratio/low_mean": 6.530825339723379e-06, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0005031600929214619, |
| "completion_length": 99.66499938964844, |
| "epoch": 0.9931802900778023, |
| "grad_norm": 2.5418131351470947, |
| "kl": 0.1747375037521124, |
| "learning_rate": 1.0010626630883432e-07, |
| "loss": 0.003, |
| "reward": 1.421428418159485, |
| "reward_std": 0.09218620862811804, |
| "rewards/code_format_reward": 0.9612499952316285, |
| "rewards/code_reward": 0.4704016923904419, |
| "step": 5170, |
| "zero_std_ratio": 0.675 |
| }, |
| { |
| "clip_ratio/high_max": 0.02674068254418671, |
| "clip_ratio/high_mean": 0.003380646219011396, |
| "clip_ratio/low_mean": 9.682812378741801e-05, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0034774743369780483, |
| "completion_length": 92.72500152587891, |
| "epoch": 0.9951013351263087, |
| "grad_norm": 6.10137414932251, |
| "kl": 0.41192906014621256, |
| "learning_rate": 1.0005543007516928e-07, |
| "loss": -0.0051, |
| "reward": 1.5263760328292846, |
| "reward_std": 0.28926219046115875, |
| "rewards/code_format_reward": 0.9899999976158143, |
| "rewards/code_reward": 0.5156879663467407, |
| "step": 5180, |
| "zero_std_ratio": 0.425 |
| }, |
| { |
| "clip_ratio/high_max": 0.1273454572306946, |
| "clip_ratio/high_mean": 0.016399703072966076, |
| "clip_ratio/low_mean": 0.0004187120386632159, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016818415274610744, |
| "completion_length": 81.02250213623047, |
| "epoch": 0.9970223801748151, |
| "grad_norm": 7.77527379989624, |
| "kl": 0.7098278045654297, |
| "learning_rate": 1.0002099153882402e-07, |
| "loss": -0.0041, |
| "reward": 1.6053562879562377, |
| "reward_std": 0.16601394787430762, |
| "rewards/code_format_reward": 0.9824999928474426, |
| "rewards/code_reward": 0.557053166627884, |
| "step": 5190, |
| "zero_std_ratio": 0.6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0028753917664289474, |
| "clip_ratio/high_mean": 0.00045008738234173504, |
| "clip_ratio/low_mean": 0.00016858125454746186, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.000618668642709963, |
| "completion_length": 95.49250183105468, |
| "epoch": 0.9989434252233215, |
| "grad_norm": 6.507387638092041, |
| "kl": 0.9340068377554417, |
| "learning_rate": 1.0000295195487024e-07, |
| "loss": -0.0018, |
| "reward": 1.4542541027069091, |
| "reward_std": 0.20283248797059059, |
| "rewards/code_format_reward": 0.981249988079071, |
| "rewards/code_reward": 0.4818145722150803, |
| "step": 5200, |
| "zero_std_ratio": 0.55 |
| }, |
| { |
| "clip_ratio/high_max": 0.010171899455599487, |
| "clip_ratio/high_mean": 0.0014317149762064219, |
| "clip_ratio/low_mean": 0.00016592920292168856, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0015976441791281104, |
| "completion_length": 90.05000305175781, |
| "epoch": 0.999711843242724, |
| "kl": 0.5218422394245863, |
| "reward": 1.0329873859882355, |
| "reward_std": 0.19616412371397018, |
| "rewards/code_format_reward": 0.934374988079071, |
| "rewards/code_reward": 0.28289994597435, |
| "step": 5204, |
| "total_flos": 0.0, |
| "train_loss": 1756184.5472393532, |
| "train_runtime": 149594.4727, |
| "train_samples_per_second": 0.139, |
| "train_steps_per_second": 0.035, |
| "zero_std_ratio": 0.5625 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 5205, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 5, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|