{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03237068965517241, "eval_steps": 500, "global_step": 751, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 193.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 103.975, "completions/mean_terminated_length": 90.39881134033203, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.1820149033330381, "epoch": 2.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.25, "learning_rate": 1.0000000000000002e-06, "loss": 0.0794088900089264, "num_tokens": 14355.0, "reward": 0.15020800828933717, "reward_std": 0.6376187483081595, "rewards/reward_fn/mean": 0.15020800828933717, "rewards/reward_fn/std": 0.6376187764341011, "step": 5, "step_time": 30.522303848797673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 151.2, "completions/max_terminated_length": 141.6, "completions/mean_length": 80.075, "completions/mean_terminated_length": 77.78928833007812, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "entropy": 0.16221200795844198, "epoch": 5.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.228515625, "learning_rate": 2.25e-06, "loss": 0.07169516086578369, "num_tokens": 27462.0, "reward": 0.4012819856405258, "reward_std": 0.4128362699819263, "rewards/reward_fn/mean": 0.4012819856405258, "rewards/reward_fn/std": 0.4128363010211615, "step": 10, "step_time": 25.01976956339822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 185.6, "completions/max_terminated_length": 171.6, "completions/mean_length": 83.65, "completions/mean_terminated_length": 79.14285736083984, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "entropy": 0.14225535104051232, "epoch": 7.5, "frac_reward_zero_std": 0.3, "grad_norm": 0.265625, "learning_rate": 3.5e-06, "loss": 0.0717179834842682, "num_tokens": 40740.0, "reward": 0.10207997858524323, "reward_std": 0.7454913818277419, "rewards/reward_fn/mean": 0.10207997858524323, "rewards/reward_fn/std": 0.7454914333298802, "step": 15, "step_time": 29.626422570000432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 202.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 98.85, "completions/mean_terminated_length": 81.79285888671875, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "entropy": 0.1534867493668571, "epoch": 10.0, "frac_reward_zero_std": 0.4, "grad_norm": 0.2431640625, "learning_rate": 4.75e-06, "loss": 0.09726614952087402, "num_tokens": 54862.0, "reward": 0.25181599259376525, "reward_std": 0.6045640033902601, "rewards/reward_fn/mean": 0.25181599259376525, "rewards/reward_fn/std": 0.6045640454394743, "step": 20, "step_time": 31.625062462999267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 233.6, "completions/max_terminated_length": 194.4, "completions/mean_length": 99.525, "completions/mean_terminated_length": 77.78690643310547, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "entropy": 0.1538564210291952, "epoch": 12.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.981481481481482e-06, "loss": 0.13929661512374877, "num_tokens": 69039.0, "reward": 0.0006859898567199707, "reward_std": 1.0108385920524596, "rewards/reward_fn/mean": 0.0006859898567199707, "rewards/reward_fn/std": 1.0108386158943177, "step": 25, "step_time": 35.65783783460065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 181.8, "completions/max_terminated_length": 153.0, "completions/mean_length": 65.05, "completions/mean_terminated_length": 56.03928680419922, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "entropy": 0.13224927680566906, "epoch": 15.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.1875, "learning_rate": 4.958333333333334e-06, "loss": -0.003383058309555054, "num_tokens": 81545.0, "reward": 0.3994179755449295, "reward_std": 0.5622525057464373, "rewards/reward_fn/mean": 0.3994179755449295, "rewards/reward_fn/std": 0.56225254482124, "step": 30, "step_time": 29.112151033400732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 131.0, "completions/max_terminated_length": 128.6, "completions/mean_length": 61.075, "completions/mean_terminated_length": 53.282144165039064, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.11974610288161784, "epoch": 17.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.220703125, "learning_rate": 4.935185185185186e-06, "loss": 0.004663025587797165, "num_tokens": 93892.0, "reward": 0.3469119846820831, "reward_std": 0.5775202971824911, "rewards/reward_fn/mean": 0.3469119846820831, "rewards/reward_fn/std": 0.5775203009106917, "step": 35, "step_time": 22.457519923200017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 179.6, "completions/max_terminated_length": 131.2, "completions/mean_length": 78.025, "completions/mean_terminated_length": 65.70357513427734, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.12550681543070824, "epoch": 20.0, "frac_reward_zero_std": 0.4, "grad_norm": 0.4609375, "learning_rate": 4.9120370370370375e-06, "loss": 0.036792796850204465, "num_tokens": 107209.0, "reward": 0.3498039901256561, "reward_std": 0.7408117946935817, "rewards/reward_fn/mean": 0.3498039901256561, "rewards/reward_fn/std": 0.7408118456369266, "step": 40, "step_time": 28.692756705999635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 189.0, "completions/max_terminated_length": 178.8, "completions/mean_length": 92.675, "completions/mean_terminated_length": 89.51071472167969, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "entropy": 0.15165529411751777, "epoch": 22.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.232421875, "learning_rate": 4.888888888888889e-06, "loss": -0.004727205634117127, "num_tokens": 120868.0, "reward": 0.30072798430919645, "reward_std": 0.7614144545921591, "rewards/reward_fn/mean": 0.30072798430919645, "rewards/reward_fn/std": 0.7614145380415721, "step": 45, "step_time": 30.053675796201425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 208.6, "completions/max_terminated_length": 189.6, "completions/mean_length": 102.625, "completions/mean_terminated_length": 90.11666870117188, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "entropy": 0.15195838457439095, "epoch": 25.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.208984375, "learning_rate": 4.865740740740741e-06, "loss": 0.04500017166137695, "num_tokens": 135121.0, "reward": 0.5009119868278503, "reward_std": 0.5213470441231038, "rewards/reward_fn/mean": 0.5009119868278503, "rewards/reward_fn/std": 0.5213470560469432, "step": 50, "step_time": 32.55890014459801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 195.4, "completions/max_terminated_length": 188.4, "completions/mean_length": 110.95, "completions/mean_terminated_length": 106.75357208251953, "completions/min_length": 51.8, "completions/min_terminated_length": 51.8, "entropy": 0.1545537636615336, "epoch": 27.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.1201171875, "learning_rate": 4.842592592592593e-06, "loss": 0.04118683934211731, "num_tokens": 149755.0, "reward": 0.40073599219322203, "reward_std": 0.6958227735885885, "rewards/reward_fn/mean": 0.40073599219322203, "rewards/reward_fn/std": 0.6958228093542858, "step": 55, "step_time": 30.86228356460051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 172.8, "completions/max_terminated_length": 147.2, "completions/mean_length": 91.875, "completions/mean_terminated_length": 81.20119171142578, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "entropy": 0.14326929268427194, "epoch": 30.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.318359375, "learning_rate": 4.819444444444445e-06, "loss": 0.06774483323097229, "num_tokens": 163334.0, "reward": 0.5998779892921448, "reward_std": 0.3705297726322897, "rewards/reward_fn/mean": 0.5998779892921448, "rewards/reward_fn/std": 0.3705297749955207, "step": 60, "step_time": 27.902300025800287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 152.4, "completions/max_terminated_length": 147.6, "completions/mean_length": 81.625, "completions/mean_terminated_length": 78.91071472167968, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.15796504009049386, "epoch": 32.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.796296296296297e-06, "loss": -0.008640621602535248, "num_tokens": 176503.0, "reward": 0.45116598904132843, "reward_std": 0.4201629768765997, "rewards/reward_fn/mean": 0.45116598904132843, "rewards/reward_fn/std": 0.42016300329414663, "step": 65, "step_time": 25.321657606401278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 197.6, "completions/max_terminated_length": 183.2, "completions/mean_length": 114.3, "completions/mean_terminated_length": 109.30833435058594, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16214836928993465, "epoch": 35.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 4.7731481481481484e-06, "loss": 0.007857038080692292, "num_tokens": 191271.0, "reward": 0.29932799339294436, "reward_std": 0.6332858696579933, "rewards/reward_fn/mean": 0.29932799339294436, "rewards/reward_fn/std": 0.6332858689129353, "step": 70, "step_time": 31.062436907000667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 179.4, "completions/max_terminated_length": 173.2, "completions/mean_length": 81.875, "completions/mean_terminated_length": 72.65, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.1165547446347773, "epoch": 37.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.4609375, "learning_rate": 4.75e-06, "loss": 0.015596307814121246, "num_tokens": 204450.0, "reward": 0.5023359954357147, "reward_std": 0.39528531452669996, "rewards/reward_fn/mean": 0.5023359954357147, "rewards/reward_fn/std": 0.3952853078444605, "step": 75, "step_time": 28.755046762000347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 204.4, "completions/max_terminated_length": 181.4, "completions/mean_length": 111.05, "completions/mean_terminated_length": 104.30357208251954, "completions/min_length": 44.6, "completions/min_terminated_length": 44.6, "entropy": 0.13618089363444597, "epoch": 40.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.140625, "learning_rate": 4.726851851851852e-06, "loss": 0.043748298287391664, "num_tokens": 219088.0, "reward": 0.3999840050935745, "reward_std": 0.5622952695033746, "rewards/reward_fn/mean": 0.3999840050935745, "rewards/reward_fn/std": 0.5622952873265603, "step": 80, "step_time": 31.989889109399883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 191.4, "completions/max_terminated_length": 186.4, "completions/mean_length": 104.1, "completions/mean_terminated_length": 100.05000152587891, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.17853650995530188, "epoch": 42.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.1875, "learning_rate": 4.703703703703704e-06, "loss": 0.02743053436279297, "num_tokens": 233420.0, "reward": 0.44984598755836486, "reward_std": 0.5406898662215098, "rewards/reward_fn/mean": 0.44984598755836486, "rewards/reward_fn/std": 0.5406898936373181, "step": 85, "step_time": 30.449153596599718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 200.6, "completions/max_terminated_length": 186.6, "completions/mean_length": 103.875, "completions/mean_terminated_length": 88.48499984741211, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "entropy": 0.12900277911685407, "epoch": 45.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.2314453125, "learning_rate": 4.680555555555556e-06, "loss": 0.05115030407905578, "num_tokens": 247507.0, "reward": 0.5499680012464523, "reward_std": 0.39219149924028895, "rewards/reward_fn/mean": 0.5499680012464523, "rewards/reward_fn/std": 0.3921915319937398, "step": 90, "step_time": 31.51559891059951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 180.8, "completions/max_terminated_length": 167.4, "completions/mean_length": 103.05, "completions/mean_terminated_length": 99.05000152587891, "completions/min_length": 44.6, "completions/min_terminated_length": 44.6, "entropy": 0.12843654905445873, "epoch": 47.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.099609375, "learning_rate": 4.6574074074074076e-06, "loss": 0.043516650795936584, "num_tokens": 261825.0, "reward": 0.4511160016059875, "reward_std": 0.6716319680213928, "rewards/reward_fn/mean": 0.4511160016059875, "rewards/reward_fn/std": 0.6716320037841796, "step": 95, "step_time": 28.928432848399826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 157.0, "completions/max_terminated_length": 151.8, "completions/mean_length": 84.425, "completions/mean_terminated_length": 80.66428680419922, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "entropy": 0.13799221392255276, "epoch": 50.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.154296875, "learning_rate": 4.634259259259259e-06, "loss": -0.0032827585935592653, "num_tokens": 275106.0, "reward": 0.5485600084066391, "reward_std": 0.39775218120921635, "rewards/reward_fn/mean": 0.5485600084066391, "rewards/reward_fn/std": 0.3977522020417382, "step": 100, "step_time": 25.882849212799556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 182.2, "completions/max_terminated_length": 174.8, "completions/mean_length": 100.925, "completions/mean_terminated_length": 96.83928680419922, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "entropy": 0.12651287836488337, "epoch": 52.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.611111111111112e-06, "loss": -0.008606845140457153, "num_tokens": 289075.0, "reward": 0.5499760001897812, "reward_std": 0.3922359466421767, "rewards/reward_fn/mean": 0.5499760001897812, "rewards/reward_fn/std": 0.39223596563824686, "step": 105, "step_time": 29.0759426169996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 194.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 104.525, "completions/mean_terminated_length": 98.08214416503907, "completions/min_length": 43.2, "completions/min_terminated_length": 43.2, "entropy": 0.16052292285021394, "epoch": 55.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.587962962962964e-06, "loss": 0.022305211424827574, "num_tokens": 303424.0, "reward": 0.5999760001897811, "reward_std": 0.34845718718279384, "rewards/reward_fn/mean": 0.5999760001897811, "rewards/reward_fn/std": 0.3484572199362447, "step": 110, "step_time": 30.576252812399254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 186.2, "completions/max_terminated_length": 170.0, "completions/mean_length": 107.275, "completions/mean_terminated_length": 97.70833435058594, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "entropy": 0.16247000750154256, "epoch": 57.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 4.564814814814815e-06, "loss": 0.036947214603424074, "num_tokens": 317863.0, "reward": 0.700000011920929, "reward_std": 0.1851640224456787, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.1851640224456787, "step": 115, "step_time": 29.630989668599433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 202.0, "completions/max_terminated_length": 183.2, "completions/mean_length": 106.1, "completions/mean_terminated_length": 99.70714416503907, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "entropy": 0.15718324964400382, "epoch": 60.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 4.541666666666667e-06, "loss": 0.04491000473499298, "num_tokens": 332059.0, "reward": 0.2999920010566711, "reward_std": 0.6278709468911984, "rewards/reward_fn/mean": 0.2999920010566711, "rewards/reward_fn/std": 0.6278709915655781, "step": 120, "step_time": 31.635853910601874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.6, "completions/max_terminated_length": 170.6, "completions/mean_length": 94.575, "completions/mean_terminated_length": 94.575, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.10850867056287825, "epoch": 62.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.5185185185185185e-06, "loss": 0.024337136745452882, "num_tokens": 345990.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 125, "step_time": 27.550983037801416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 201.8, "completions/max_terminated_length": 187.0, "completions/mean_length": 99.25, "completions/mean_terminated_length": 84.59642944335937, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.12130871089175344, "epoch": 65.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.49537037037037e-06, "loss": 0.018311865627765656, "num_tokens": 359912.0, "reward": 0.6000000089406967, "reward_std": 0.34844101667404176, "rewards/reward_fn/mean": 0.6000000089406967, "rewards/reward_fn/std": 0.34844104051589964, "step": 130, "step_time": 31.70451795959889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.6, "completions/max_terminated_length": 162.6, "completions/mean_length": 89.825, "completions/mean_terminated_length": 89.825, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.11614590694662183, "epoch": 67.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.1982421875, "learning_rate": 4.472222222222223e-06, "loss": 0.019238683581352233, "num_tokens": 373457.0, "reward": 0.5999920129776001, "reward_std": 0.3703506765436032, "rewards/reward_fn/mean": 0.5999920129776001, "rewards/reward_fn/std": 0.3703506735342671, "step": 135, "step_time": 26.545528039799684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 205.4, "completions/max_terminated_length": 165.4, "completions/mean_length": 99.4, "completions/mean_terminated_length": 87.00476379394532, "completions/min_length": 19.4, "completions/min_terminated_length": 19.4, "entropy": 0.1428308295784518, "epoch": 70.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 4.449074074074075e-06, "loss": 0.04644445776939392, "num_tokens": 387581.0, "reward": 0.5499920099973679, "reward_std": 0.39217875003814695, "rewards/reward_fn/mean": 0.5499920099973679, "rewards/reward_fn/std": 0.39217878580093385, "step": 140, "step_time": 32.013767499400274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.8, "completions/max_terminated_length": 171.8, "completions/mean_length": 97.65, "completions/mean_terminated_length": 97.65, "completions/min_length": 28.8, "completions/min_terminated_length": 28.8, "entropy": 0.13889142961706966, "epoch": 72.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.2197265625, "learning_rate": 4.425925925925927e-06, "loss": 0.021968120336532594, "num_tokens": 401439.0, "reward": 0.6477920055389405, "reward_std": 0.4305092230439186, "rewards/reward_fn/mean": 0.6477920055389405, "rewards/reward_fn/std": 0.4305092342197895, "step": 145, "step_time": 27.826696390995494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 211.0, "completions/max_terminated_length": 199.4, "completions/mean_length": 110.125, "completions/mean_terminated_length": 102.15714416503906, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "entropy": 0.17309838379733264, "epoch": 75.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.216796875, "learning_rate": 4.4027777777777784e-06, "loss": 0.0743155837059021, "num_tokens": 415992.0, "reward": 0.6007840156555175, "reward_std": 0.3688644051551819, "rewards/reward_fn/mean": 0.6007840156555175, "rewards/reward_fn/std": 0.3688644051551819, "step": 150, "step_time": 32.79056119860106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 166.6, "completions/max_terminated_length": 153.4, "completions/mean_length": 102.05, "completions/mean_terminated_length": 99.5, "completions/min_length": 56.6, "completions/min_terminated_length": 56.6, "entropy": 0.13074476819019765, "epoch": 77.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.1337890625, "learning_rate": 4.379629629629629e-06, "loss": 0.01247234344482422, "num_tokens": 430242.0, "reward": 0.5507920116186142, "reward_std": 0.3907249927520752, "rewards/reward_fn/mean": 0.5507920116186142, "rewards/reward_fn/std": 0.3907250165939331, "step": 155, "step_time": 27.061828034398786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.4, "completions/max_terminated_length": 194.4, "completions/mean_length": 92.975, "completions/mean_terminated_length": 92.975, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.11762763226870447, "epoch": 80.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.2060546875, "learning_rate": 4.356481481481482e-06, "loss": 0.0216289147734642, "num_tokens": 443893.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 160, "step_time": 30.701956771399274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 185.6, "completions/max_terminated_length": 178.8, "completions/mean_length": 104.825, "completions/mean_terminated_length": 98.86785888671875, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "entropy": 0.12376428130082787, "epoch": 82.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.19921875, "learning_rate": 4.333333333333334e-06, "loss": 0.04524213671684265, "num_tokens": 458018.0, "reward": 0.6500000119209289, "reward_std": 0.42426406145095824, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.42426406145095824, "step": 165, "step_time": 29.50906059279878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 201.0, "completions/max_terminated_length": 177.4, "completions/mean_length": 107.4, "completions/mean_terminated_length": 102.47142944335937, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1829328211490065, "epoch": 85.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 4.310185185185186e-06, "loss": 0.05608509778976441, "num_tokens": 472482.0, "reward": 0.5000000089406967, "reward_std": 0.5336050391197205, "rewards/reward_fn/mean": 0.5000000089406967, "rewards/reward_fn/std": 0.5336050629615784, "step": 170, "step_time": 31.589203411201016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 142.6, "completions/max_terminated_length": 134.4, "completions/mean_length": 90.575, "completions/mean_terminated_length": 88.62142944335938, "completions/min_length": 60.8, "completions/min_terminated_length": 60.8, "entropy": 0.12587652734946458, "epoch": 87.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 4.2870370370370376e-06, "loss": 0.006346069276332855, "num_tokens": 486009.0, "reward": 0.700000011920929, "reward_std": 0.1851640224456787, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.1851640224456787, "step": 175, "step_time": 24.074513484201454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 172.4, "completions/max_terminated_length": 156.4, "completions/mean_length": 113.225, "completions/mean_terminated_length": 107.86666870117188, "completions/min_length": 62.8, "completions/min_terminated_length": 62.8, "entropy": 0.13538812827318908, "epoch": 90.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.263888888888889e-06, "loss": 0.04661061465740204, "num_tokens": 500734.0, "reward": 0.5999920129776001, "reward_std": 0.46800349950790404, "rewards/reward_fn/mean": 0.5999920129776001, "rewards/reward_fn/std": 0.468003511428833, "step": 180, "step_time": 27.94757253280186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 196.8, "completions/max_terminated_length": 179.4, "completions/mean_length": 105.125, "completions/mean_terminated_length": 100.6, "completions/min_length": 23.2, "completions/min_terminated_length": 23.2, "entropy": 0.14581037967000157, "epoch": 92.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.1181640625, "learning_rate": 4.240740740740741e-06, "loss": 0.0037437133491039277, "num_tokens": 515087.0, "reward": 0.7999920129776001, "reward_std": 2.263165224576369e-05, "rewards/reward_fn/mean": 0.7999920129776001, "rewards/reward_fn/std": 2.2628642909694463e-05, "step": 185, "step_time": 31.02160367520264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 154.2, "completions/max_terminated_length": 139.4, "completions/mean_length": 91.275, "completions/mean_terminated_length": 88.80357360839844, "completions/min_length": 55.8, "completions/min_terminated_length": 55.8, "entropy": 0.1140737212728709, "epoch": 95.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.1728515625, "learning_rate": 4.217592592592593e-06, "loss": 0.029188913106918336, "num_tokens": 528690.0, "reward": 0.600000011920929, "reward_std": 0.4680067300796509, "rewards/reward_fn/mean": 0.600000011920929, "rewards/reward_fn/std": 0.4680067300796509, "step": 190, "step_time": 25.543457056801707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 187.0, "completions/max_terminated_length": 171.8, "completions/mean_length": 81.625, "completions/mean_terminated_length": 77.18928680419921, "completions/min_length": 19.4, "completions/min_terminated_length": 19.4, "entropy": 0.114993113768287, "epoch": 97.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.194444444444445e-06, "loss": 0.0800092101097107, "num_tokens": 541859.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 195, "step_time": 29.684030519998487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 188.2, "completions/max_terminated_length": 171.6, "completions/mean_length": 114.675, "completions/mean_terminated_length": 107.88214416503907, "completions/min_length": 63.4, "completions/min_terminated_length": 63.4, "entropy": 0.14553113598376513, "epoch": 100.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.154296875, "learning_rate": 4.171296296296297e-06, "loss": 0.05318028926849365, "num_tokens": 556642.0, "reward": 0.6450000047683716, "reward_std": 0.4384061962366104, "rewards/reward_fn/mean": 0.6450000047683716, "rewards/reward_fn/std": 0.4384061962366104, "step": 200, "step_time": 29.818907721999857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 213.6, "completions/max_terminated_length": 182.8, "completions/mean_length": 106.8, "completions/mean_terminated_length": 98.99285888671875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "entropy": 0.14924523658119143, "epoch": 102.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.1748046875, "learning_rate": 4.1481481481481485e-06, "loss": 0.0724250853061676, "num_tokens": 571082.0, "reward": 0.600000011920929, "reward_std": 0.4680067300796509, "rewards/reward_fn/mean": 0.600000011920929, "rewards/reward_fn/std": 0.4680067300796509, "step": 205, "step_time": 33.18914894179907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.6, "completions/max_terminated_length": 166.6, "completions/mean_length": 88.725, "completions/mean_terminated_length": 88.725, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.12393170983996242, "epoch": 105.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.24609375, "learning_rate": 4.125e-06, "loss": 0.022324693202972413, "num_tokens": 584563.0, "reward": 0.597000002861023, "reward_std": 0.47649201303720473, "rewards/reward_fn/mean": 0.597000002861023, "rewards/reward_fn/std": 0.47649201229214666, "step": 210, "step_time": 27.244719348003127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.6, "completions/max_terminated_length": 181.6, "completions/mean_length": 104.95, "completions/mean_terminated_length": 104.95, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.10806208297144622, "epoch": 107.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.1337890625, "learning_rate": 4.101851851851852e-06, "loss": -0.012457241863012313, "num_tokens": 598957.0, "reward": 0.6500000119209289, "reward_std": 0.42426406145095824, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.42426406145095824, "step": 215, "step_time": 29.092179798799044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 175.2, "completions/max_terminated_length": 162.8, "completions/mean_length": 89.175, "completions/mean_terminated_length": 85.25714416503907, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "entropy": 0.13876313052605838, "epoch": 110.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 4.078703703703704e-06, "loss": 0.016936567425727845, "num_tokens": 612428.0, "reward": 0.700000011920929, "reward_std": 0.1851640224456787, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.1851640224456787, "step": 220, "step_time": 28.282268692799697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 175.0, "completions/max_terminated_length": 165.2, "completions/mean_length": 100.275, "completions/mean_terminated_length": 95.95, "completions/min_length": 48.6, "completions/min_terminated_length": 48.6, "entropy": 0.14033891165163367, "epoch": 112.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.055555555555556e-06, "loss": 0.03414565920829773, "num_tokens": 626587.0, "reward": 0.6999920129776, "reward_std": 0.18518665409792448, "rewards/reward_fn/mean": 0.6999920129776, "rewards/reward_fn/std": 0.1851866510885884, "step": 225, "step_time": 28.14439602499842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 192.6, "completions/max_terminated_length": 184.4, "completions/mean_length": 96.575, "completions/mean_terminated_length": 92.39285736083984, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.13608734677545725, "epoch": 115.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.138671875, "learning_rate": 4.032407407407408e-06, "loss": 0.052947860956192014, "num_tokens": 640402.0, "reward": 0.6969920039176941, "reward_std": 0.2913506222437718, "rewards/reward_fn/mean": 0.6969920039176941, "rewards/reward_fn/std": 0.29135061848937766, "step": 230, "step_time": 30.501605717404892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 192.6, "completions/max_terminated_length": 174.4, "completions/mean_length": 96.85, "completions/mean_terminated_length": 92.18214416503906, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.12447628125082702, "epoch": 117.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 4.0092592592592594e-06, "loss": 0.04736369252204895, "num_tokens": 654424.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 235, "step_time": 30.51819963699527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.4, "completions/max_terminated_length": 173.4, "completions/mean_length": 88.6, "completions/mean_terminated_length": 88.6, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.11185428954195231, "epoch": 120.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.986111111111112e-06, "loss": 0.028244262933731078, "num_tokens": 667920.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 240, "step_time": 28.104555083598825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 156.6, "completions/max_terminated_length": 148.4, "completions/mean_length": 101.75, "completions/mean_terminated_length": 99.26428833007813, "completions/min_length": 61.8, "completions/min_terminated_length": 61.8, "entropy": 0.15239415562245995, "epoch": 122.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.962962962962963e-06, "loss": 0.016936178505420684, "num_tokens": 682138.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 245, "step_time": 25.842140119400575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 182.4, "completions/max_terminated_length": 168.6, "completions/mean_length": 96.175, "completions/mean_terminated_length": 91.94642944335938, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "entropy": 0.12800856186076998, "epoch": 125.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.939814814814815e-06, "loss": 0.02638625502586365, "num_tokens": 695937.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 250, "step_time": 29.15611189340343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.8, "completions/max_terminated_length": 153.8, "completions/mean_length": 81.275, "completions/mean_terminated_length": 81.275, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.14729445518460124, "epoch": 127.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.916666666666667e-06, "loss": 0.017367249727249144, "num_tokens": 709092.0, "reward": 0.700000011920929, "reward_std": 0.1851640224456787, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.1851640224456787, "step": 255, "step_time": 25.567990457000995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.8, "completions/max_terminated_length": 199.8, "completions/mean_length": 101.3, "completions/mean_terminated_length": 101.3, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1291541094193235, "epoch": 130.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8935185185185185e-06, "loss": 0.0, "num_tokens": 723340.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 260, "step_time": 31.396876741403684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 203.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 103.7, "completions/mean_terminated_length": 91.9607147216797, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "entropy": 0.14293926188256592, "epoch": 132.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.87037037037037e-06, "loss": 0.015573790669441223, "num_tokens": 737656.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 265, "step_time": 31.94191468279896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 102.075, "completions/mean_terminated_length": 102.075, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "entropy": 0.12520905593410134, "epoch": 135.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.103515625, "learning_rate": 3.847222222222223e-06, "loss": -0.010684289038181305, "num_tokens": 751671.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 270, "step_time": 30.233847617598077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.6, "completions/max_terminated_length": 198.6, "completions/mean_length": 101.05, "completions/mean_terminated_length": 101.05, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "entropy": 0.1348960422212258, "epoch": 137.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.824074074074075e-06, "loss": -0.008155962824821473, "num_tokens": 765881.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 275, "step_time": 31.395782154197512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 189.8, "completions/max_terminated_length": 170.8, "completions/mean_length": 92.95, "completions/mean_terminated_length": 88.30357208251954, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "entropy": 0.11601264334749431, "epoch": 140.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.8009259259259263e-06, "loss": 0.0, "num_tokens": 779531.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 280, "step_time": 30.19007007140026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 199.4, "completions/max_terminated_length": 186.0, "completions/mean_length": 111.55, "completions/mean_terminated_length": 108.88571472167969, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "entropy": 0.12823146923910828, "epoch": 142.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.1337890625, "learning_rate": 3.777777777777778e-06, "loss": 0.016540104150772096, "num_tokens": 794189.0, "reward": 0.7007920145988464, "reward_std": 0.2806025862693787, "rewards/reward_fn/mean": 0.7007920145988464, "rewards/reward_fn/std": 0.28060259819030764, "step": 285, "step_time": 31.41049292399839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.6, "completions/max_terminated_length": 131.6, "completions/mean_length": 85.425, "completions/mean_terminated_length": 85.425, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1431839597178623, "epoch": 145.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.75462962962963e-06, "loss": -0.008183705806732177, "num_tokens": 807510.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 290, "step_time": 22.681272815001286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 173.0, "completions/max_terminated_length": 157.4, "completions/mean_length": 105.45, "completions/mean_terminated_length": 103.09285888671874, "completions/min_length": 60.2, "completions/min_terminated_length": 60.2, "entropy": 0.12795310239307583, "epoch": 147.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.11328125, "learning_rate": 3.731481481481482e-06, "loss": 0.005563179403543473, "num_tokens": 821924.0, "reward": 0.6000000089406967, "reward_std": 0.34844101667404176, "rewards/reward_fn/mean": 0.6000000089406967, "rewards/reward_fn/std": 0.34844104051589964, "step": 295, "step_time": 28.041595162999876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.4, "completions/max_terminated_length": 142.4, "completions/mean_length": 85.8, "completions/mean_terminated_length": 85.8, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "entropy": 0.12552432040683925, "epoch": 150.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.708333333333334e-06, "loss": 0.025445324182510377, "num_tokens": 835260.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 300, "step_time": 24.1276477496016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 194.8, "completions/max_terminated_length": 177.4, "completions/mean_length": 100.125, "completions/mean_terminated_length": 95.59285736083984, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "entropy": 0.14587228901218624, "epoch": 152.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.1435546875, "learning_rate": 3.6851851851851854e-06, "loss": 0.045100918412208556, "num_tokens": 849413.0, "reward": 0.7499840140342713, "reward_std": 0.14146661712147762, "rewards/reward_fn/mean": 0.7499840140342713, "rewards/reward_fn/std": 0.14146661110280548, "step": 305, "step_time": 30.895152483399578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.4, "completions/max_terminated_length": 164.4, "completions/mean_length": 95.8, "completions/mean_terminated_length": 95.8, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12987288122531027, "epoch": 155.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 3.662037037037037e-06, "loss": -0.010421520471572876, "num_tokens": 863197.0, "reward": 0.7439999938011169, "reward_std": 0.14894575029611587, "rewards/reward_fn/mean": 0.7439999938011169, "rewards/reward_fn/std": 0.14894574955105783, "step": 310, "step_time": 26.97180611779986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.8, "completions/max_terminated_length": 128.8, "completions/mean_length": 88.525, "completions/mean_terminated_length": 88.525, "completions/min_length": 55.6, "completions/min_terminated_length": 55.6, "entropy": 0.12343773562461138, "epoch": 157.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.638888888888889e-06, "loss": -2.3096799850463868e-08, "num_tokens": 876690.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 315, "step_time": 22.35670812440003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.2, "completions/max_terminated_length": 162.2, "completions/mean_length": 93.075, "completions/mean_terminated_length": 93.075, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "entropy": 0.15649742879904807, "epoch": 160.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.615740740740741e-06, "loss": 0.0, "num_tokens": 890561.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 320, "step_time": 26.563862933603378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.8, "completions/max_terminated_length": 170.8, "completions/mean_length": 92.8, "completions/mean_terminated_length": 92.8, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "entropy": 0.12219774469267577, "epoch": 162.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.232421875, "learning_rate": 3.592592592592593e-06, "loss": 0.0173223078250885, "num_tokens": 904205.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 325, "step_time": 27.899399954797264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 102.475, "completions/mean_terminated_length": 102.475, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "entropy": 0.16676131393760443, "epoch": 165.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.569444444444445e-06, "loss": -1.1920928955078126e-08, "num_tokens": 918472.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 330, "step_time": 27.33713496620112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 94.2, "completions/mean_terminated_length": 94.2, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "entropy": 0.13035173853859305, "epoch": 167.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.5462962962962967e-06, "loss": 0.02846188545227051, "num_tokens": 932388.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 335, "step_time": 27.027245131201198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.2, "completions/max_terminated_length": 182.2, "completions/mean_length": 95.775, "completions/mean_terminated_length": 95.775, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "entropy": 0.13310553296469152, "epoch": 170.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.193359375, "learning_rate": 3.523148148148148e-06, "loss": 0.001617179811000824, "num_tokens": 946171.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 340, "step_time": 29.20151192679841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 192.8, "completions/max_terminated_length": 181.0, "completions/mean_length": 103.925, "completions/mean_terminated_length": 100.5750015258789, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.15284172971732915, "epoch": 172.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.1142578125, "learning_rate": 3.5e-06, "loss": -0.0006755538284778595, "num_tokens": 960496.0, "reward": 0.7999920129776001, "reward_std": 2.263165224576369e-05, "rewards/reward_fn/mean": 0.7999920129776001, "rewards/reward_fn/std": 2.2628642909694463e-05, "step": 345, "step_time": 30.46660940139991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 96.9, "completions/mean_terminated_length": 96.9, "completions/min_length": 28.4, "completions/min_terminated_length": 28.4, "entropy": 0.1217897635884583, "epoch": 175.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.201171875, "learning_rate": 3.476851851851852e-06, "loss": 0.0045785665512084964, "num_tokens": 974304.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 350, "step_time": 29.460710451801425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 189.8, "completions/max_terminated_length": 180.2, "completions/mean_length": 108.925, "completions/mean_terminated_length": 106.39642944335938, "completions/min_length": 46.2, "completions/min_terminated_length": 46.2, "entropy": 0.13436518078669907, "epoch": 177.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.453703703703704e-06, "loss": -0.00737251341342926, "num_tokens": 988857.0, "reward": 0.7511820077896119, "reward_std": 0.13807815313339233, "rewards/reward_fn/mean": 0.7511820077896119, "rewards/reward_fn/std": 0.13807815313339233, "step": 355, "step_time": 30.16567378140171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.2, "completions/max_terminated_length": 144.2, "completions/mean_length": 81.875, "completions/mean_terminated_length": 81.875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "entropy": 0.1366418529767543, "epoch": 180.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.430555555555556e-06, "loss": 0.017211392521858215, "num_tokens": 1002036.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 360, "step_time": 24.196757839000202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.8, "completions/max_terminated_length": 160.8, "completions/mean_length": 97.65, "completions/mean_terminated_length": 97.65, "completions/min_length": 49.8, "completions/min_terminated_length": 49.8, "entropy": 0.11701173500623555, "epoch": 182.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4074074074074077e-06, "loss": 0.0, "num_tokens": 1015874.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 365, "step_time": 26.509838377000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 189.8, "completions/max_terminated_length": 170.8, "completions/mean_length": 104.2, "completions/mean_terminated_length": 96.73333435058593, "completions/min_length": 46.2, "completions/min_terminated_length": 46.2, "entropy": 0.1576181524200365, "epoch": 185.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.3842592592592595e-06, "loss": 0.016841967403888703, "num_tokens": 1030210.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 370, "step_time": 30.090859549198647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 154.6, "completions/max_terminated_length": 150.6, "completions/mean_length": 78.825, "completions/mean_terminated_length": 75.07857208251953, "completions/min_length": 19.4, "completions/min_terminated_length": 19.4, "entropy": 0.1031433446565643, "epoch": 187.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.3611111111111117e-06, "loss": 0.016514861583709718, "num_tokens": 1043267.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 375, "step_time": 25.698997188800423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.6, "completions/max_terminated_length": 179.6, "completions/mean_length": 104.575, "completions/mean_terminated_length": 104.575, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "entropy": 0.13036173072177917, "epoch": 190.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.3379629629629636e-06, "loss": -0.000981520116329193, "num_tokens": 1057646.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 380, "step_time": 28.867799249802193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.8, "completions/max_terminated_length": 158.8, "completions/mean_length": 110.175, "completions/mean_terminated_length": 110.175, "completions/min_length": 72.6, "completions/min_terminated_length": 72.6, "entropy": 0.14139417342375965, "epoch": 192.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.314814814814815e-06, "loss": -0.009847503900527955, "num_tokens": 1072249.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 385, "step_time": 26.117895688798306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 162.0, "completions/max_terminated_length": 149.8, "completions/mean_length": 90.825, "completions/mean_terminated_length": 86.45, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1398956686956808, "epoch": 195.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.361328125, "learning_rate": 3.2916666666666668e-06, "loss": 0.07702720761299134, "num_tokens": 1085786.0, "reward": 0.7507840156555176, "reward_std": 0.13920386410463834, "rewards/reward_fn/mean": 0.7507840156555176, "rewards/reward_fn/std": 0.13920387301623122, "step": 390, "step_time": 26.53806324480174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.8, "completions/max_terminated_length": 184.8, "completions/mean_length": 95.3, "completions/mean_terminated_length": 95.3, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "entropy": 0.11398763118777425, "epoch": 197.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.25, "learning_rate": 3.2685185185185186e-06, "loss": 0.013139170408248902, "num_tokens": 1099550.0, "reward": 0.6969920039176941, "reward_std": 0.2819044408868649, "rewards/reward_fn/mean": 0.6969920039176941, "rewards/reward_fn/std": 0.28190446171938677, "step": 395, "step_time": 29.58767645379921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.6, "completions/max_terminated_length": 191.6, "completions/mean_length": 106.725, "completions/mean_terminated_length": 106.725, "completions/min_length": 43.2, "completions/min_terminated_length": 43.2, "entropy": 0.1625191917642951, "epoch": 200.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2453703703703704e-06, "loss": 0.0, "num_tokens": 1113967.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 400, "step_time": 30.439370583198617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 77.4, "completions/mean_terminated_length": 77.4, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.1114686549641192, "epoch": 202.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.2222222222222227e-06, "loss": 0.016538560390472412, "num_tokens": 1126967.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 405, "step_time": 24.568558594400383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 176.8, "completions/max_terminated_length": 160.0, "completions/mean_length": 105.125, "completions/mean_terminated_length": 102.4607147216797, "completions/min_length": 55.4, "completions/min_terminated_length": 55.4, "entropy": 0.14484606825280935, "epoch": 205.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 3.1990740740740745e-06, "loss": 0.04012419581413269, "num_tokens": 1141368.0, "reward": 0.6499920129776001, "reward_std": 0.424286693103204, "rewards/reward_fn/mean": 0.6499920129776001, "rewards/reward_fn/std": 0.42428669009386794, "step": 410, "step_time": 28.52794066679926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.8, "completions/max_terminated_length": 164.8, "completions/mean_length": 79.375, "completions/mean_terminated_length": 79.375, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.1323097031097859, "epoch": 207.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1759259259259263e-06, "loss": 0.0, "num_tokens": 1154447.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 415, "step_time": 26.996101147800072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 167.0, "completions/max_terminated_length": 157.6, "completions/mean_length": 103.525, "completions/mean_terminated_length": 99.01071472167969, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.11831333981826901, "epoch": 210.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.19140625, "learning_rate": 3.152777777777778e-06, "loss": 0.015304601192474366, "num_tokens": 1168784.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 420, "step_time": 27.225987496596645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 180.4, "completions/max_terminated_length": 165.4, "completions/mean_length": 98.0, "completions/mean_terminated_length": 93.45714416503907, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.13548461146419868, "epoch": 212.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.1296296296296295e-06, "loss": -0.009428337216377258, "num_tokens": 1182852.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 425, "step_time": 29.033930897598474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.6, "completions/max_terminated_length": 175.6, "completions/mean_length": 92.675, "completions/mean_terminated_length": 92.675, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "entropy": 0.10527189422864466, "epoch": 215.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 3.1064814814814818e-06, "loss": -2.16066837310791e-08, "num_tokens": 1196511.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 430, "step_time": 28.445749506798894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.8, "completions/max_terminated_length": 182.8, "completions/mean_length": 107.5, "completions/mean_terminated_length": 107.5, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "entropy": 0.13915550753008574, "epoch": 217.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.1435546875, "learning_rate": 3.0833333333333336e-06, "loss": 0.04852951169013977, "num_tokens": 1211007.0, "reward": 0.6999920129776, "reward_std": 0.28286533928621793, "rewards/reward_fn/mean": 0.6999920129776, "rewards/reward_fn/std": 0.2828653362768819, "step": 435, "step_time": 29.242199634596183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 142.4, "completions/max_terminated_length": 126.8, "completions/mean_length": 84.875, "completions/mean_terminated_length": 82.12857360839844, "completions/min_length": 50.6, "completions/min_terminated_length": 50.6, "entropy": 0.12801749436184764, "epoch": 220.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 3.0601851851851854e-06, "loss": 0.023633134365081788, "num_tokens": 1224306.0, "reward": 0.6500000089406968, "reward_std": 0.20701966285705567, "rewards/reward_fn/mean": 0.6500000089406968, "rewards/reward_fn/std": 0.20701968669891357, "step": 440, "step_time": 24.112674611999683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 171.4, "completions/max_terminated_length": 144.0, "completions/mean_length": 89.325, "completions/mean_terminated_length": 82.22142944335937, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "entropy": 0.14441313657443972, "epoch": 222.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.287109375, "learning_rate": 3.0370370370370372e-06, "loss": 0.061201339960098265, "num_tokens": 1237783.0, "reward": 0.6499920129776001, "reward_std": 0.424286693103204, "rewards/reward_fn/mean": 0.6499920129776001, "rewards/reward_fn/std": 0.42428669009386794, "step": 445, "step_time": 27.72405263900073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.6, "completions/max_terminated_length": 177.6, "completions/mean_length": 101.2, "completions/mean_terminated_length": 101.2, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "entropy": 0.1256763830780983, "epoch": 225.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.013888888888889e-06, "loss": 0.0, "num_tokens": 1252027.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 450, "step_time": 28.645315791801114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.8, "completions/max_terminated_length": 153.8, "completions/mean_length": 106.525, "completions/mean_terminated_length": 106.525, "completions/min_length": 72.4, "completions/min_terminated_length": 72.4, "entropy": 0.11539835934527218, "epoch": 227.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.134765625, "learning_rate": 2.990740740740741e-06, "loss": -0.008653049170970917, "num_tokens": 1266484.0, "reward": 0.700000011920929, "reward_std": 0.1851640224456787, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.1851640224456787, "step": 455, "step_time": 25.554188429601346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 169.6, "completions/max_terminated_length": 166.8, "completions/mean_length": 84.45, "completions/mean_terminated_length": 80.72142944335937, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.13924790399614723, "epoch": 230.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.158203125, "learning_rate": 2.967592592592593e-06, "loss": 0.03869241774082184, "num_tokens": 1279766.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 460, "step_time": 27.48372414899932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.8, "completions/max_terminated_length": 189.8, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.11793683604337275, "epoch": 232.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.2177734375, "learning_rate": 2.944444444444445e-06, "loss": 0.012316146492958069, "num_tokens": 1293448.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 465, "step_time": 30.217514709999524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 170.4, "completions/max_terminated_length": 156.6, "completions/mean_length": 93.975, "completions/mean_terminated_length": 90.22500152587891, "completions/min_length": 27.4, "completions/min_terminated_length": 27.4, "entropy": 0.1507271576556377, "epoch": 235.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.9212962962962964e-06, "loss": 0.024145886301994324, "num_tokens": 1307355.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 470, "step_time": 27.92282026600078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 174.2, "completions/max_terminated_length": 160.0, "completions/mean_length": 103.6, "completions/mean_terminated_length": 100.88214416503907, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "entropy": 0.14852485329611226, "epoch": 237.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.898148148148148e-06, "loss": 0.0, "num_tokens": 1321667.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 475, "step_time": 28.292469495798287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.6, "completions/max_terminated_length": 151.6, "completions/mean_length": 87.05, "completions/mean_terminated_length": 87.05, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "entropy": 0.1000554851605557, "epoch": 240.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.875e-06, "loss": 0.01600448936223984, "num_tokens": 1335081.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 480, "step_time": 25.202083195801244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.6, "completions/max_terminated_length": 151.6, "completions/mean_length": 82.025, "completions/mean_terminated_length": 82.025, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.1268925666459836, "epoch": 242.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.8518518518518522e-06, "loss": 0.009142126142978668, "num_tokens": 1348266.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 485, "step_time": 25.32069047859695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 214.0, "completions/max_terminated_length": 184.2, "completions/mean_length": 112.625, "completions/mean_terminated_length": 105.24642944335938, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.13998855941463262, "epoch": 245.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.828703703703704e-06, "loss": 0.06989994049072265, "num_tokens": 1362967.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 490, "step_time": 33.36453152959875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.8, "completions/max_terminated_length": 188.8, "completions/mean_length": 104.725, "completions/mean_terminated_length": 104.725, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "entropy": 0.1694211942027323, "epoch": 247.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.805555555555556e-06, "loss": 0.006063133478164673, "num_tokens": 1377324.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 495, "step_time": 29.983117254402895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.8, "completions/max_terminated_length": 153.8, "completions/mean_length": 83.875, "completions/mean_terminated_length": 83.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.12524008078034968, "epoch": 250.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.146484375, "learning_rate": 2.7824074074074077e-06, "loss": 0.009342719614505769, "num_tokens": 1390611.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 500, "step_time": 25.58551082920021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.2, "completions/max_terminated_length": 138.2, "completions/mean_length": 83.7, "completions/mean_terminated_length": 83.7, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "entropy": 0.12294827501755208, "epoch": 252.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.279296875, "learning_rate": 2.759259259259259e-06, "loss": 0.0008182898163795471, "num_tokens": 1403863.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 505, "step_time": 23.533826856199447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 170.6, "completions/max_terminated_length": 167.4, "completions/mean_length": 101.25, "completions/mean_terminated_length": 98.58928833007812, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "entropy": 0.11002964545041323, "epoch": 255.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.7361111111111118e-06, "loss": 0.01304551213979721, "num_tokens": 1418109.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 510, "step_time": 27.820240558599473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 94.05, "completions/mean_terminated_length": 94.05, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "entropy": 0.1244091754895635, "epoch": 257.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.712962962962963e-06, "loss": 0.006943752616643905, "num_tokens": 1432019.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 515, "step_time": 27.71690519019976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.6, "completions/max_terminated_length": 164.6, "completions/mean_length": 93.45, "completions/mean_terminated_length": 93.45, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "entropy": 0.11850596296135336, "epoch": 260.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.689814814814815e-06, "loss": 0.03078709840774536, "num_tokens": 1445709.0, "reward": 0.747000002861023, "reward_std": 0.14046046733856202, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.14046047925949096, "step": 520, "step_time": 26.998205948202667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 75.9, "completions/mean_terminated_length": 75.9, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.10163291104836389, "epoch": 262.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.42578125, "learning_rate": 2.666666666666667e-06, "loss": 0.02896396517753601, "num_tokens": 1458649.0, "reward": 0.7999920129776001, "reward_std": 2.263165224576369e-05, "rewards/reward_fn/mean": 0.7999920129776001, "rewards/reward_fn/std": 2.2628642909694463e-05, "step": 525, "step_time": 25.67267432579829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.4, "completions/max_terminated_length": 167.4, "completions/mean_length": 106.8, "completions/mean_terminated_length": 106.8, "completions/min_length": 60.2, "completions/min_terminated_length": 60.2, "entropy": 0.12018695392180234, "epoch": 265.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.10986328125, "learning_rate": 2.6435185185185187e-06, "loss": -9.685754776000977e-09, "num_tokens": 1473117.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 530, "step_time": 27.306792101602333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.2, "completions/max_terminated_length": 165.2, "completions/mean_length": 90.35, "completions/mean_terminated_length": 90.35, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.12258970123948529, "epoch": 267.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.6203703703703705e-06, "loss": 0.024295052886009215, "num_tokens": 1486879.0, "reward": 0.7507920145988465, "reward_std": 0.13918123245239258, "rewards/reward_fn/mean": 0.7507920145988465, "rewards/reward_fn/std": 0.13918124437332152, "step": 535, "step_time": 27.07393169499992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 202.0, "completions/max_terminated_length": 175.4, "completions/mean_length": 99.4, "completions/mean_terminated_length": 90.5750015258789, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "entropy": 0.12210832491982729, "epoch": 270.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.5972222222222227e-06, "loss": 0.04415736496448517, "num_tokens": 1500807.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 540, "step_time": 31.728731169800447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 88.475, "completions/mean_terminated_length": 88.475, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1007203730638139, "epoch": 272.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.2734375, "learning_rate": 2.5740740740740745e-06, "loss": 0.01148533821105957, "num_tokens": 1514278.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 545, "step_time": 23.616023338599916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 181.6, "completions/max_terminated_length": 166.0, "completions/mean_length": 99.95, "completions/mean_terminated_length": 95.3357147216797, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "entropy": 0.14508932372555136, "epoch": 275.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.134765625, "learning_rate": 2.550925925925926e-06, "loss": 0.058039349317550656, "num_tokens": 1528444.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 550, "step_time": 29.27236720959845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.6, "completions/max_terminated_length": 172.6, "completions/mean_length": 94.725, "completions/mean_terminated_length": 94.725, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "entropy": 0.12917686544824392, "epoch": 277.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.5277777777777778e-06, "loss": 0.008541859686374664, "num_tokens": 1542381.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 555, "step_time": 28.04494361659672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 89.575, "completions/mean_terminated_length": 89.575, "completions/min_length": 43.2, "completions/min_terminated_length": 43.2, "entropy": 0.10766231055604294, "epoch": 280.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5046296296296296e-06, "loss": 0.0, "num_tokens": 1555916.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 560, "step_time": 27.3150207976003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.4, "completions/max_terminated_length": 164.4, "completions/mean_length": 81.35, "completions/mean_terminated_length": 81.35, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "entropy": 0.12244645584141836, "epoch": 282.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.481481481481482e-06, "loss": 0.0, "num_tokens": 1569074.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 565, "step_time": 26.9950363395983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.8, "completions/max_terminated_length": 207.8, "completions/mean_length": 115.3, "completions/mean_terminated_length": 115.3, "completions/min_length": 50.2, "completions/min_terminated_length": 50.2, "entropy": 0.1410753179807216, "epoch": 285.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.1884765625, "learning_rate": 2.4583333333333332e-06, "loss": -0.0042663484811782835, "num_tokens": 1583882.0, "reward": 0.6040000081062317, "reward_std": 0.46080578565597535, "rewards/reward_fn/mean": 0.6040000081062317, "rewards/reward_fn/std": 0.46080578565597535, "step": 570, "step_time": 32.51511334399911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.6, "completions/max_terminated_length": 157.6, "completions/mean_length": 95.85, "completions/mean_terminated_length": 95.85, "completions/min_length": 50.6, "completions/min_terminated_length": 50.6, "entropy": 0.1270086049917154, "epoch": 287.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.4351851851851855e-06, "loss": 0.0031342685222625734, "num_tokens": 1597648.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 575, "step_time": 26.03918815540019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 178.6, "completions/max_terminated_length": 170.8, "completions/mean_length": 103.725, "completions/mean_terminated_length": 101.60357360839843, "completions/min_length": 52.2, "completions/min_terminated_length": 52.2, "entropy": 0.14261129099177197, "epoch": 290.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.10693359375, "learning_rate": 2.4120370370370373e-06, "loss": 0.010924074053764343, "num_tokens": 1611965.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 580, "step_time": 28.818242219599778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.8, "completions/max_terminated_length": 181.8, "completions/mean_length": 99.45, "completions/mean_terminated_length": 99.45, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "entropy": 0.14679535252507775, "epoch": 292.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.193359375, "learning_rate": 2.388888888888889e-06, "loss": 0.04403604865074158, "num_tokens": 1626111.0, "reward": 0.6469920158386231, "reward_std": 0.42329972982406616, "rewards/reward_fn/mean": 0.6469920158386231, "rewards/reward_fn/std": 0.42329972982406616, "step": 585, "step_time": 29.356267316001322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.6, "completions/max_terminated_length": 175.6, "completions/mean_length": 90.075, "completions/mean_terminated_length": 90.075, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.11895611474756151, "epoch": 295.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.365740740740741e-06, "loss": 0.0307393878698349, "num_tokens": 1639646.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 590, "step_time": 28.44031162360261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.6, "completions/max_terminated_length": 163.6, "completions/mean_length": 83.8, "completions/mean_terminated_length": 83.8, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.11074991283239796, "epoch": 297.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.12060546875, "learning_rate": 2.3425925925925928e-06, "loss": -0.0017749778926372528, "num_tokens": 1652930.0, "reward": 0.747000002861023, "reward_std": 0.14990663677453994, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.1499066360294819, "step": 595, "step_time": 26.963711300198337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "entropy": 0.15330470782937483, "epoch": 300.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.3194444444444446e-06, "loss": 0.0064928531646728516, "num_tokens": 1667103.0, "reward": 0.747000002861023, "reward_std": 0.14046045541763305, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.14046047925949096, "step": 600, "step_time": 32.507627704396874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.2, "completions/max_terminated_length": 190.2, "completions/mean_length": 96.925, "completions/mean_terminated_length": 96.925, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "entropy": 0.1422055270173587, "epoch": 302.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.125, "learning_rate": 2.2962962962962964e-06, "loss": 0.008351585268974305, "num_tokens": 1681128.0, "reward": 0.6999920129776, "reward_std": 0.28286533928621793, "rewards/reward_fn/mean": 0.6999920129776, "rewards/reward_fn/std": 0.2828653362768819, "step": 605, "step_time": 30.293829828397428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.6, "completions/max_terminated_length": 183.6, "completions/mean_length": 94.025, "completions/mean_terminated_length": 94.025, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.10572184595512227, "epoch": 305.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.203125, "learning_rate": 2.2731481481481482e-06, "loss": -0.007780712842941284, "num_tokens": 1694841.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 610, "step_time": 29.523135821998586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.4, "completions/max_terminated_length": 163.4, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "entropy": 0.11542486005928368, "epoch": 307.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.1025390625, "learning_rate": 2.25e-06, "loss": 0.0066208459436893465, "num_tokens": 1708473.0, "reward": 0.747000002861023, "reward_std": 0.14990663677453994, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.1499066360294819, "step": 615, "step_time": 26.86916997299777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.6, "completions/max_terminated_length": 157.6, "completions/mean_length": 100.575, "completions/mean_terminated_length": 100.575, "completions/min_length": 59.4, "completions/min_terminated_length": 59.4, "entropy": 0.14778990496415645, "epoch": 310.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.226851851851852e-06, "loss": 0.0, "num_tokens": 1722644.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 620, "step_time": 26.04490917779767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.8, "completions/max_terminated_length": 143.8, "completions/mean_length": 80.75, "completions/mean_terminated_length": 80.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1346577436546795, "epoch": 312.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.203703703703704e-06, "loss": 0.0028170838952064512, "num_tokens": 1735778.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 625, "step_time": 24.271923562001756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 174.4, "completions/max_terminated_length": 162.2, "completions/mean_length": 112.3, "completions/mean_terminated_length": 109.96428833007812, "completions/min_length": 59.4, "completions/min_terminated_length": 59.4, "entropy": 0.12042991645867004, "epoch": 315.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.2255859375, "learning_rate": 2.180555555555556e-06, "loss": 0.021805547177791595, "num_tokens": 1750466.0, "reward": 0.747000002861023, "reward_std": 0.14990663677453994, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.1499066360294819, "step": 630, "step_time": 28.148792170204253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 202.0, "completions/max_terminated_length": 189.4, "completions/mean_length": 106.8, "completions/mean_terminated_length": 103.18214416503906, "completions/min_length": 27.2, "completions/min_terminated_length": 27.2, "entropy": 0.1451239718357101, "epoch": 317.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1574074074074073e-06, "loss": 0.0, "num_tokens": 1764906.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 635, "step_time": 31.783189745399433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.2, "completions/max_terminated_length": 170.2, "completions/mean_length": 89.025, "completions/mean_terminated_length": 89.025, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "entropy": 0.13819038349902257, "epoch": 320.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.201171875, "learning_rate": 2.1342592592592596e-06, "loss": 0.02045248746871948, "num_tokens": 1778399.0, "reward": 0.747000002861023, "reward_std": 0.14990663677453994, "rewards/reward_fn/mean": 0.747000002861023, "rewards/reward_fn/std": 0.1499066360294819, "step": 640, "step_time": 27.795567151399155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 91.0, "completions/mean_terminated_length": 91.0, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "entropy": 0.1383639785577543, "epoch": 322.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 2.1111111111111114e-06, "loss": -2.16066837310791e-08, "num_tokens": 1792187.0, "reward": 0.793999993801117, "reward_std": 0.016970565915107726, "rewards/reward_fn/mean": 0.793999993801117, "rewards/reward_fn/std": 0.01697056442499161, "step": 645, "step_time": 27.67300096500112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 196.4, "completions/max_terminated_length": 176.2, "completions/mean_length": 94.375, "completions/mean_terminated_length": 86.31785888671875, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "entropy": 0.13860561683541164, "epoch": 325.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.173828125, "learning_rate": 2.087962962962963e-06, "loss": 0.06564760804176331, "num_tokens": 1805914.0, "reward": 0.6539920151233674, "reward_std": 0.31940706349123504, "rewards/reward_fn/mean": 0.6539920151233674, "rewards/reward_fn/std": 0.3194070724028279, "step": 650, "step_time": 31.155566099599675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.2, "completions/max_terminated_length": 188.2, "completions/mean_length": 96.375, "completions/mean_terminated_length": 96.375, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "entropy": 0.12026678196853027, "epoch": 327.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.064814814814815e-06, "loss": 0.0, "num_tokens": 1819721.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 655, "step_time": 29.93114675219913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 92.75, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.12484765433473513, "epoch": 330.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.041666666666667e-06, "loss": 0.0, "num_tokens": 1833579.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 660, "step_time": 29.045186105799804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.6, "completions/max_terminated_length": 176.6, "completions/mean_length": 88.8, "completions/mean_terminated_length": 88.8, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "entropy": 0.11349608366144821, "epoch": 332.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 2.0185185185185187e-06, "loss": 0.023330992460250853, "num_tokens": 1847063.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 665, "step_time": 28.709340109200276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.2, "completions/max_terminated_length": 173.2, "completions/mean_length": 94.525, "completions/mean_terminated_length": 94.525, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "entropy": 0.13530309102497995, "epoch": 335.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.1513671875, "learning_rate": 1.9953703703703705e-06, "loss": 0.005181397497653961, "num_tokens": 1861012.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 670, "step_time": 28.0701173802001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 98.75, "completions/mean_terminated_length": 98.75, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.13825307501247153, "epoch": 337.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.9722222222222224e-06, "loss": 0.007883030176162719, "num_tokens": 1875110.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 675, "step_time": 26.424618140001257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.8, "completions/max_terminated_length": 180.8, "completions/mean_length": 97.3, "completions/mean_terminated_length": 97.3, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "entropy": 0.12926889541558922, "epoch": 340.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.949074074074074e-06, "loss": -1.1175870895385742e-08, "num_tokens": 1888954.0, "reward": 0.7970000028610229, "reward_std": 0.008485282957553863, "rewards/reward_fn/mean": 0.7970000028610229, "rewards/reward_fn/std": 0.008485282212495804, "step": 680, "step_time": 29.160137929796473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.4, "completions/max_terminated_length": 177.4, "completions/mean_length": 98.375, "completions/mean_terminated_length": 98.375, "completions/min_length": 21.2, "completions/min_terminated_length": 21.2, "entropy": 0.12973124553682283, "epoch": 342.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.925925925925926e-06, "loss": 0.01764121949672699, "num_tokens": 1903037.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 685, "step_time": 28.66847429559639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.4, "completions/max_terminated_length": 189.4, "completions/mean_length": 95.675, "completions/mean_terminated_length": 95.675, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "entropy": 0.13125102190533652, "epoch": 345.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9027777777777778e-06, "loss": 0.0, "num_tokens": 1916816.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 690, "step_time": 30.259811803402407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 179.2, "completions/max_terminated_length": 178.4, "completions/mean_length": 103.3, "completions/mean_terminated_length": 101.18571472167969, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "entropy": 0.13168567434186115, "epoch": 347.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.8796296296296299e-06, "loss": 0.038299480080604555, "num_tokens": 1931116.0, "reward": 0.7499920129776001, "reward_std": 0.14144398546923184, "rewards/reward_fn/mean": 0.7499920129776001, "rewards/reward_fn/std": 0.14144398245989578, "step": 695, "step_time": 28.8232333001979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.6, "completions/max_terminated_length": 168.6, "completions/mean_length": 95.725, "completions/mean_terminated_length": 95.725, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.11110330240335316, "epoch": 350.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.8564814814814817e-06, "loss": 0.019394376873970033, "num_tokens": 1944877.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 700, "step_time": 27.542852006795876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.4, "completions/max_terminated_length": 186.4, "completions/mean_length": 100.875, "completions/mean_terminated_length": 100.875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "entropy": 0.13480499390279874, "epoch": 352.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.193359375, "learning_rate": 1.8333333333333333e-06, "loss": 0.006207095086574554, "num_tokens": 1958864.0, "reward": 0.6499920099973678, "reward_std": 0.20704229450930142, "rewards/reward_fn/mean": 0.6499920099973678, "rewards/reward_fn/std": 0.20704231534182327, "step": 705, "step_time": 29.79549445579905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 184.6, "completions/max_terminated_length": 181.4, "completions/mean_length": 100.15, "completions/mean_terminated_length": 96.58928680419922, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "entropy": 0.14106163564138113, "epoch": 355.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8101851851851853e-06, "loss": 0.0, "num_tokens": 1973018.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/reward_fn/mean": 0.800000011920929, "rewards/reward_fn/std": 0.0, "step": 710, "step_time": 29.740049313199414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 80.225, "completions/mean_terminated_length": 80.225, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "entropy": 0.13479442811803893, "epoch": 357.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.7870370370370371e-06, "loss": -0.0044358067214488985, "num_tokens": 1986131.0, "reward": 0.7999920129776001, "reward_std": 2.263165224576369e-05, "rewards/reward_fn/mean": 0.7999920129776001, "rewards/reward_fn/std": 2.2628642909694463e-05, "step": 715, "step_time": 23.464931381204224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 210.4, "completions/max_terminated_length": 195.4, "completions/mean_length": 108.975, "completions/mean_terminated_length": 104.57142944335938, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "entropy": 0.12921297890134156, "epoch": 360.0, "frac_reward_zero_std": 0.8, "grad_norm": 0.2138671875, "learning_rate": 1.7638888888888892e-06, "loss": 0.015706980228424074, "num_tokens": 2000686.0, "reward": 0.6500000119209289, "reward_std": 0.3265853762626648, "rewards/reward_fn/mean": 0.6500000119209289, "rewards/reward_fn/std": 0.3265853762626648, "step": 720, "step_time": 33.07905763199815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 213.2, "completions/max_terminated_length": 197.6, "completions/mean_length": 109.575, "completions/mean_terminated_length": 105.03928680419922, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "entropy": 0.11884576582815498, "epoch": 362.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.740740740740741e-06, "loss": 0.04035586714744568, "num_tokens": 2015265.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 725, "step_time": 33.32936851200211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 144.8, "completions/max_terminated_length": 128.4, "completions/mean_length": 87.825, "completions/mean_terminated_length": 85.10714416503906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.12527481613215058, "epoch": 365.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.7175925925925926e-06, "loss": 0.00535660944879055, "num_tokens": 2028682.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 730, "step_time": 24.470471370794986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 194.6, "completions/max_terminated_length": 172.8, "completions/mean_length": 94.275, "completions/mean_terminated_length": 89.62142944335938, "completions/min_length": 21.2, "completions/min_terminated_length": 21.2, "entropy": 0.1287142861634493, "epoch": 367.5, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.6944444444444446e-06, "loss": 0.04657252728939056, "num_tokens": 2042601.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 735, "step_time": 31.15084702780441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 188.2, "completions/max_terminated_length": 179.4, "completions/mean_length": 93.5, "completions/mean_terminated_length": 89.78214416503906, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.12946587130427362, "epoch": 370.0, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.6712962962962965e-06, "loss": 0.036357957124710086, "num_tokens": 2056293.0, "reward": 0.6999920129776, "reward_std": 0.28286533928621793, "rewards/reward_fn/mean": 0.6999920129776, "rewards/reward_fn/std": 0.2828653362768819, "step": 740, "step_time": 30.288723102195945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 96.55, "completions/mean_terminated_length": 96.55, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "entropy": 0.14114207804668694, "epoch": 372.5, "frac_reward_zero_std": 0.8, "grad_norm": 0.318359375, "learning_rate": 1.648148148148148e-06, "loss": 0.04727624654769898, "num_tokens": 2070107.0, "reward": 0.700000011920929, "reward_std": 0.2828427076339722, "rewards/reward_fn/mean": 0.700000011920929, "rewards/reward_fn/std": 0.2828427076339722, "step": 745, "step_time": 29.68092410319514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 196.2, "completions/max_terminated_length": 186.8, "completions/mean_length": 102.175, "completions/mean_terminated_length": 97.74285736083985, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "entropy": 0.14779772673500702, "epoch": 375.0, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 1.6250000000000001e-06, "loss": 0.03531245589256286, "num_tokens": 2084342.0, "reward": 0.7500000119209289, "reward_std": 0.1414213538169861, "rewards/reward_fn/mean": 0.7500000119209289, "rewards/reward_fn/std": 0.1414213538169861, "step": 750, "step_time": 31.207076488804887 } ], "logging_steps": 5, "max_steps": 600, "num_input_tokens_seen": 2086592, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }