| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.03237068965517241, |
| "eval_steps": 500, |
| "global_step": 751, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1, |
| "completions/max_length": 193.0, |
| "completions/max_terminated_length": 163.0, |
| "completions/mean_length": 103.975, |
| "completions/mean_terminated_length": 90.39881134033203, |
| "completions/min_length": 17.6, |
| "completions/min_terminated_length": 17.6, |
| "entropy": 0.1820149033330381, |
| "epoch": 2.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.25, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0794088900089264, |
| "num_tokens": 14355.0, |
| "reward": 0.15020800828933717, |
| "reward_std": 0.6376187483081595, |
| "rewards/reward_fn/mean": 0.15020800828933717, |
| "rewards/reward_fn/std": 0.6376187764341011, |
| "step": 5, |
| "step_time": 30.522303848797673 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 151.2, |
| "completions/max_terminated_length": 141.6, |
| "completions/mean_length": 80.075, |
| "completions/mean_terminated_length": 77.78928833007812, |
| "completions/min_length": 17.8, |
| "completions/min_terminated_length": 17.8, |
| "entropy": 0.16221200795844198, |
| "epoch": 5.0, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.228515625, |
| "learning_rate": 2.25e-06, |
| "loss": 0.07169516086578369, |
| "num_tokens": 27462.0, |
| "reward": 0.4012819856405258, |
| "reward_std": 0.4128362699819263, |
| "rewards/reward_fn/mean": 0.4012819856405258, |
| "rewards/reward_fn/std": 0.4128363010211615, |
| "step": 10, |
| "step_time": 25.01976956339822 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 185.6, |
| "completions/max_terminated_length": 171.6, |
| "completions/mean_length": 83.65, |
| "completions/mean_terminated_length": 79.14285736083984, |
| "completions/min_length": 16.6, |
| "completions/min_terminated_length": 16.6, |
| "entropy": 0.14225535104051232, |
| "epoch": 7.5, |
| "frac_reward_zero_std": 0.3, |
| "grad_norm": 0.265625, |
| "learning_rate": 3.5e-06, |
| "loss": 0.0717179834842682, |
| "num_tokens": 40740.0, |
| "reward": 0.10207997858524323, |
| "reward_std": 0.7454913818277419, |
| "rewards/reward_fn/mean": 0.10207997858524323, |
| "rewards/reward_fn/std": 0.7454914333298802, |
| "step": 15, |
| "step_time": 29.626422570000432 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1, |
| "completions/max_length": 202.0, |
| "completions/max_terminated_length": 163.0, |
| "completions/mean_length": 98.85, |
| "completions/mean_terminated_length": 81.79285888671875, |
| "completions/min_length": 19.2, |
| "completions/min_terminated_length": 19.2, |
| "entropy": 0.1534867493668571, |
| "epoch": 10.0, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.2431640625, |
| "learning_rate": 4.75e-06, |
| "loss": 0.09726614952087402, |
| "num_tokens": 54862.0, |
| "reward": 0.25181599259376525, |
| "reward_std": 0.6045640033902601, |
| "rewards/reward_fn/mean": 0.25181599259376525, |
| "rewards/reward_fn/std": 0.6045640454394743, |
| "step": 20, |
| "step_time": 31.625062462999267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 233.6, |
| "completions/max_terminated_length": 194.4, |
| "completions/mean_length": 99.525, |
| "completions/mean_terminated_length": 77.78690643310547, |
| "completions/min_length": 18.8, |
| "completions/min_terminated_length": 18.8, |
| "entropy": 0.1538564210291952, |
| "epoch": 12.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 4.981481481481482e-06, |
| "loss": 0.13929661512374877, |
| "num_tokens": 69039.0, |
| "reward": 0.0006859898567199707, |
| "reward_std": 1.0108385920524596, |
| "rewards/reward_fn/mean": 0.0006859898567199707, |
| "rewards/reward_fn/std": 1.0108386158943177, |
| "step": 25, |
| "step_time": 35.65783783460065 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 181.8, |
| "completions/max_terminated_length": 153.0, |
| "completions/mean_length": 65.05, |
| "completions/mean_terminated_length": 56.03928680419922, |
| "completions/min_length": 16.6, |
| "completions/min_terminated_length": 16.6, |
| "entropy": 0.13224927680566906, |
| "epoch": 15.0, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.1875, |
| "learning_rate": 4.958333333333334e-06, |
| "loss": -0.003383058309555054, |
| "num_tokens": 81545.0, |
| "reward": 0.3994179755449295, |
| "reward_std": 0.5622525057464373, |
| "rewards/reward_fn/mean": 0.3994179755449295, |
| "rewards/reward_fn/std": 0.56225254482124, |
| "step": 30, |
| "step_time": 29.112151033400732 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 131.0, |
| "completions/max_terminated_length": 128.6, |
| "completions/mean_length": 61.075, |
| "completions/mean_terminated_length": 53.282144165039064, |
| "completions/min_length": 17.6, |
| "completions/min_terminated_length": 17.6, |
| "entropy": 0.11974610288161784, |
| "epoch": 17.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.220703125, |
| "learning_rate": 4.935185185185186e-06, |
| "loss": 0.004663025587797165, |
| "num_tokens": 93892.0, |
| "reward": 0.3469119846820831, |
| "reward_std": 0.5775202971824911, |
| "rewards/reward_fn/mean": 0.3469119846820831, |
| "rewards/reward_fn/std": 0.5775203009106917, |
| "step": 35, |
| "step_time": 22.457519923200017 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 179.6, |
| "completions/max_terminated_length": 131.2, |
| "completions/mean_length": 78.025, |
| "completions/mean_terminated_length": 65.70357513427734, |
| "completions/min_length": 19.0, |
| "completions/min_terminated_length": 19.0, |
| "entropy": 0.12550681543070824, |
| "epoch": 20.0, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.4609375, |
| "learning_rate": 4.9120370370370375e-06, |
| "loss": 0.036792796850204465, |
| "num_tokens": 107209.0, |
| "reward": 0.3498039901256561, |
| "reward_std": 0.7408117946935817, |
| "rewards/reward_fn/mean": 0.3498039901256561, |
| "rewards/reward_fn/std": 0.7408118456369266, |
| "step": 40, |
| "step_time": 28.692756705999635 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 189.0, |
| "completions/max_terminated_length": 178.8, |
| "completions/mean_length": 92.675, |
| "completions/mean_terminated_length": 89.51071472167969, |
| "completions/min_length": 17.8, |
| "completions/min_terminated_length": 17.8, |
| "entropy": 0.15165529411751777, |
| "epoch": 22.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.232421875, |
| "learning_rate": 4.888888888888889e-06, |
| "loss": -0.004727205634117127, |
| "num_tokens": 120868.0, |
| "reward": 0.30072798430919645, |
| "reward_std": 0.7614144545921591, |
| "rewards/reward_fn/mean": 0.30072798430919645, |
| "rewards/reward_fn/std": 0.7614145380415721, |
| "step": 45, |
| "step_time": 30.053675796201425 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 208.6, |
| "completions/max_terminated_length": 189.6, |
| "completions/mean_length": 102.625, |
| "completions/mean_terminated_length": 90.11666870117188, |
| "completions/min_length": 18.8, |
| "completions/min_terminated_length": 18.8, |
| "entropy": 0.15195838457439095, |
| "epoch": 25.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.208984375, |
| "learning_rate": 4.865740740740741e-06, |
| "loss": 0.04500017166137695, |
| "num_tokens": 135121.0, |
| "reward": 0.5009119868278503, |
| "reward_std": 0.5213470441231038, |
| "rewards/reward_fn/mean": 0.5009119868278503, |
| "rewards/reward_fn/std": 0.5213470560469432, |
| "step": 50, |
| "step_time": 32.55890014459801 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 195.4, |
| "completions/max_terminated_length": 188.4, |
| "completions/mean_length": 110.95, |
| "completions/mean_terminated_length": 106.75357208251953, |
| "completions/min_length": 51.8, |
| "completions/min_terminated_length": 51.8, |
| "entropy": 0.1545537636615336, |
| "epoch": 27.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.1201171875, |
| "learning_rate": 4.842592592592593e-06, |
| "loss": 0.04118683934211731, |
| "num_tokens": 149755.0, |
| "reward": 0.40073599219322203, |
| "reward_std": 0.6958227735885885, |
| "rewards/reward_fn/mean": 0.40073599219322203, |
| "rewards/reward_fn/std": 0.6958228093542858, |
| "step": 55, |
| "step_time": 30.86228356460051 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 172.8, |
| "completions/max_terminated_length": 147.2, |
| "completions/mean_length": 91.875, |
| "completions/mean_terminated_length": 81.20119171142578, |
| "completions/min_length": 32.8, |
| "completions/min_terminated_length": 32.8, |
| "entropy": 0.14326929268427194, |
| "epoch": 30.0, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.318359375, |
| "learning_rate": 4.819444444444445e-06, |
| "loss": 0.06774483323097229, |
| "num_tokens": 163334.0, |
| "reward": 0.5998779892921448, |
| "reward_std": 0.3705297726322897, |
| "rewards/reward_fn/mean": 0.5998779892921448, |
| "rewards/reward_fn/std": 0.3705297749955207, |
| "step": 60, |
| "step_time": 27.902300025800287 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 152.4, |
| "completions/max_terminated_length": 147.6, |
| "completions/mean_length": 81.625, |
| "completions/mean_terminated_length": 78.91071472167968, |
| "completions/min_length": 33.0, |
| "completions/min_terminated_length": 33.0, |
| "entropy": 0.15796504009049386, |
| "epoch": 32.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 4.796296296296297e-06, |
| "loss": -0.008640621602535248, |
| "num_tokens": 176503.0, |
| "reward": 0.45116598904132843, |
| "reward_std": 0.4201629768765997, |
| "rewards/reward_fn/mean": 0.45116598904132843, |
| "rewards/reward_fn/std": 0.42016300329414663, |
| "step": 65, |
| "step_time": 25.321657606401278 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 197.6, |
| "completions/max_terminated_length": 183.2, |
| "completions/mean_length": 114.3, |
| "completions/mean_terminated_length": 109.30833435058594, |
| "completions/min_length": 50.0, |
| "completions/min_terminated_length": 50.0, |
| "entropy": 0.16214836928993465, |
| "epoch": 35.0, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.0, |
| "learning_rate": 4.7731481481481484e-06, |
| "loss": 0.007857038080692292, |
| "num_tokens": 191271.0, |
| "reward": 0.29932799339294436, |
| "reward_std": 0.6332858696579933, |
| "rewards/reward_fn/mean": 0.29932799339294436, |
| "rewards/reward_fn/std": 0.6332858689129353, |
| "step": 70, |
| "step_time": 31.062436907000667 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 179.4, |
| "completions/max_terminated_length": 173.2, |
| "completions/mean_length": 81.875, |
| "completions/mean_terminated_length": 72.65, |
| "completions/min_length": 16.0, |
| "completions/min_terminated_length": 16.0, |
| "entropy": 0.1165547446347773, |
| "epoch": 37.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.4609375, |
| "learning_rate": 4.75e-06, |
| "loss": 0.015596307814121246, |
| "num_tokens": 204450.0, |
| "reward": 0.5023359954357147, |
| "reward_std": 0.39528531452669996, |
| "rewards/reward_fn/mean": 0.5023359954357147, |
| "rewards/reward_fn/std": 0.3952853078444605, |
| "step": 75, |
| "step_time": 28.755046762000347 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 204.4, |
| "completions/max_terminated_length": 181.4, |
| "completions/mean_length": 111.05, |
| "completions/mean_terminated_length": 104.30357208251954, |
| "completions/min_length": 44.6, |
| "completions/min_terminated_length": 44.6, |
| "entropy": 0.13618089363444597, |
| "epoch": 40.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.140625, |
| "learning_rate": 4.726851851851852e-06, |
| "loss": 0.043748298287391664, |
| "num_tokens": 219088.0, |
| "reward": 0.3999840050935745, |
| "reward_std": 0.5622952695033746, |
| "rewards/reward_fn/mean": 0.3999840050935745, |
| "rewards/reward_fn/std": 0.5622952873265603, |
| "step": 80, |
| "step_time": 31.989889109399883 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 191.4, |
| "completions/max_terminated_length": 186.4, |
| "completions/mean_length": 104.1, |
| "completions/mean_terminated_length": 100.05000152587891, |
| "completions/min_length": 39.0, |
| "completions/min_terminated_length": 39.0, |
| "entropy": 0.17853650995530188, |
| "epoch": 42.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.1875, |
| "learning_rate": 4.703703703703704e-06, |
| "loss": 0.02743053436279297, |
| "num_tokens": 233420.0, |
| "reward": 0.44984598755836486, |
| "reward_std": 0.5406898662215098, |
| "rewards/reward_fn/mean": 0.44984598755836486, |
| "rewards/reward_fn/std": 0.5406898936373181, |
| "step": 85, |
| "step_time": 30.449153596599718 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 200.6, |
| "completions/max_terminated_length": 186.6, |
| "completions/mean_length": 103.875, |
| "completions/mean_terminated_length": 88.48499984741211, |
| "completions/min_length": 31.2, |
| "completions/min_terminated_length": 31.2, |
| "entropy": 0.12900277911685407, |
| "epoch": 45.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 4.680555555555556e-06, |
| "loss": 0.05115030407905578, |
| "num_tokens": 247507.0, |
| "reward": 0.5499680012464523, |
| "reward_std": 0.39219149924028895, |
| "rewards/reward_fn/mean": 0.5499680012464523, |
| "rewards/reward_fn/std": 0.3921915319937398, |
| "step": 90, |
| "step_time": 31.51559891059951 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 180.8, |
| "completions/max_terminated_length": 167.4, |
| "completions/mean_length": 103.05, |
| "completions/mean_terminated_length": 99.05000152587891, |
| "completions/min_length": 44.6, |
| "completions/min_terminated_length": 44.6, |
| "entropy": 0.12843654905445873, |
| "epoch": 47.5, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.099609375, |
| "learning_rate": 4.6574074074074076e-06, |
| "loss": 0.043516650795936584, |
| "num_tokens": 261825.0, |
| "reward": 0.4511160016059875, |
| "reward_std": 0.6716319680213928, |
| "rewards/reward_fn/mean": 0.4511160016059875, |
| "rewards/reward_fn/std": 0.6716320037841796, |
| "step": 95, |
| "step_time": 28.928432848399826 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 157.0, |
| "completions/max_terminated_length": 151.8, |
| "completions/mean_length": 84.425, |
| "completions/mean_terminated_length": 80.66428680419922, |
| "completions/min_length": 35.8, |
| "completions/min_terminated_length": 35.8, |
| "entropy": 0.13799221392255276, |
| "epoch": 50.0, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.154296875, |
| "learning_rate": 4.634259259259259e-06, |
| "loss": -0.0032827585935592653, |
| "num_tokens": 275106.0, |
| "reward": 0.5485600084066391, |
| "reward_std": 0.39775218120921635, |
| "rewards/reward_fn/mean": 0.5485600084066391, |
| "rewards/reward_fn/std": 0.3977522020417382, |
| "step": 100, |
| "step_time": 25.882849212799556 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 182.2, |
| "completions/max_terminated_length": 174.8, |
| "completions/mean_length": 100.925, |
| "completions/mean_terminated_length": 96.83928680419922, |
| "completions/min_length": 40.4, |
| "completions/min_terminated_length": 40.4, |
| "entropy": 0.12651287836488337, |
| "epoch": 52.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 4.611111111111112e-06, |
| "loss": -0.008606845140457153, |
| "num_tokens": 289075.0, |
| "reward": 0.5499760001897812, |
| "reward_std": 0.3922359466421767, |
| "rewards/reward_fn/mean": 0.5499760001897812, |
| "rewards/reward_fn/std": 0.39223596563824686, |
| "step": 105, |
| "step_time": 29.0759426169996 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 194.0, |
| "completions/max_terminated_length": 169.0, |
| "completions/mean_length": 104.525, |
| "completions/mean_terminated_length": 98.08214416503907, |
| "completions/min_length": 43.2, |
| "completions/min_terminated_length": 43.2, |
| "entropy": 0.16052292285021394, |
| "epoch": 55.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 4.587962962962964e-06, |
| "loss": 0.022305211424827574, |
| "num_tokens": 303424.0, |
| "reward": 0.5999760001897811, |
| "reward_std": 0.34845718718279384, |
| "rewards/reward_fn/mean": 0.5999760001897811, |
| "rewards/reward_fn/std": 0.3484572199362447, |
| "step": 110, |
| "step_time": 30.576252812399254 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 186.2, |
| "completions/max_terminated_length": 170.0, |
| "completions/mean_length": 107.275, |
| "completions/mean_terminated_length": 97.70833435058594, |
| "completions/min_length": 42.2, |
| "completions/min_terminated_length": 42.2, |
| "entropy": 0.16247000750154256, |
| "epoch": 57.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 4.564814814814815e-06, |
| "loss": 0.036947214603424074, |
| "num_tokens": 317863.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.1851640224456787, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.1851640224456787, |
| "step": 115, |
| "step_time": 29.630989668599433 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 202.0, |
| "completions/max_terminated_length": 183.2, |
| "completions/mean_length": 106.1, |
| "completions/mean_terminated_length": 99.70714416503907, |
| "completions/min_length": 40.8, |
| "completions/min_terminated_length": 40.8, |
| "entropy": 0.15718324964400382, |
| "epoch": 60.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.0, |
| "learning_rate": 4.541666666666667e-06, |
| "loss": 0.04491000473499298, |
| "num_tokens": 332059.0, |
| "reward": 0.2999920010566711, |
| "reward_std": 0.6278709468911984, |
| "rewards/reward_fn/mean": 0.2999920010566711, |
| "rewards/reward_fn/std": 0.6278709915655781, |
| "step": 120, |
| "step_time": 31.635853910601874 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 170.6, |
| "completions/max_terminated_length": 170.6, |
| "completions/mean_length": 94.575, |
| "completions/mean_terminated_length": 94.575, |
| "completions/min_length": 42.0, |
| "completions/min_terminated_length": 42.0, |
| "entropy": 0.10850867056287825, |
| "epoch": 62.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 4.5185185185185185e-06, |
| "loss": 0.024337136745452882, |
| "num_tokens": 345990.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 125, |
| "step_time": 27.550983037801416 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 201.8, |
| "completions/max_terminated_length": 187.0, |
| "completions/mean_length": 99.25, |
| "completions/mean_terminated_length": 84.59642944335937, |
| "completions/min_length": 18.0, |
| "completions/min_terminated_length": 18.0, |
| "entropy": 0.12130871089175344, |
| "epoch": 65.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 4.49537037037037e-06, |
| "loss": 0.018311865627765656, |
| "num_tokens": 359912.0, |
| "reward": 0.6000000089406967, |
| "reward_std": 0.34844101667404176, |
| "rewards/reward_fn/mean": 0.6000000089406967, |
| "rewards/reward_fn/std": 0.34844104051589964, |
| "step": 130, |
| "step_time": 31.70451795959889 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 162.6, |
| "completions/max_terminated_length": 162.6, |
| "completions/mean_length": 89.825, |
| "completions/mean_terminated_length": 89.825, |
| "completions/min_length": 43.0, |
| "completions/min_terminated_length": 43.0, |
| "entropy": 0.11614590694662183, |
| "epoch": 67.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 4.472222222222223e-06, |
| "loss": 0.019238683581352233, |
| "num_tokens": 373457.0, |
| "reward": 0.5999920129776001, |
| "reward_std": 0.3703506765436032, |
| "rewards/reward_fn/mean": 0.5999920129776001, |
| "rewards/reward_fn/std": 0.3703506735342671, |
| "step": 135, |
| "step_time": 26.545528039799684 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 205.4, |
| "completions/max_terminated_length": 165.4, |
| "completions/mean_length": 99.4, |
| "completions/mean_terminated_length": 87.00476379394532, |
| "completions/min_length": 19.4, |
| "completions/min_terminated_length": 19.4, |
| "entropy": 0.1428308295784518, |
| "epoch": 70.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.0, |
| "learning_rate": 4.449074074074075e-06, |
| "loss": 0.04644445776939392, |
| "num_tokens": 387581.0, |
| "reward": 0.5499920099973679, |
| "reward_std": 0.39217875003814695, |
| "rewards/reward_fn/mean": 0.5499920099973679, |
| "rewards/reward_fn/std": 0.39217878580093385, |
| "step": 140, |
| "step_time": 32.013767499400274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 171.8, |
| "completions/max_terminated_length": 171.8, |
| "completions/mean_length": 97.65, |
| "completions/mean_terminated_length": 97.65, |
| "completions/min_length": 28.8, |
| "completions/min_terminated_length": 28.8, |
| "entropy": 0.13889142961706966, |
| "epoch": 72.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 4.425925925925927e-06, |
| "loss": 0.021968120336532594, |
| "num_tokens": 401439.0, |
| "reward": 0.6477920055389405, |
| "reward_std": 0.4305092230439186, |
| "rewards/reward_fn/mean": 0.6477920055389405, |
| "rewards/reward_fn/std": 0.4305092342197895, |
| "step": 145, |
| "step_time": 27.826696390995494 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 211.0, |
| "completions/max_terminated_length": 199.4, |
| "completions/mean_length": 110.125, |
| "completions/mean_terminated_length": 102.15714416503906, |
| "completions/min_length": 36.6, |
| "completions/min_terminated_length": 36.6, |
| "entropy": 0.17309838379733264, |
| "epoch": 75.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.216796875, |
| "learning_rate": 4.4027777777777784e-06, |
| "loss": 0.0743155837059021, |
| "num_tokens": 415992.0, |
| "reward": 0.6007840156555175, |
| "reward_std": 0.3688644051551819, |
| "rewards/reward_fn/mean": 0.6007840156555175, |
| "rewards/reward_fn/std": 0.3688644051551819, |
| "step": 150, |
| "step_time": 32.79056119860106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 166.6, |
| "completions/max_terminated_length": 153.4, |
| "completions/mean_length": 102.05, |
| "completions/mean_terminated_length": 99.5, |
| "completions/min_length": 56.6, |
| "completions/min_terminated_length": 56.6, |
| "entropy": 0.13074476819019765, |
| "epoch": 77.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 4.379629629629629e-06, |
| "loss": 0.01247234344482422, |
| "num_tokens": 430242.0, |
| "reward": 0.5507920116186142, |
| "reward_std": 0.3907249927520752, |
| "rewards/reward_fn/mean": 0.5507920116186142, |
| "rewards/reward_fn/std": 0.3907250165939331, |
| "step": 155, |
| "step_time": 27.061828034398786 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.4, |
| "completions/max_terminated_length": 194.4, |
| "completions/mean_length": 92.975, |
| "completions/mean_terminated_length": 92.975, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.11762763226870447, |
| "epoch": 80.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 4.356481481481482e-06, |
| "loss": 0.0216289147734642, |
| "num_tokens": 443893.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 160, |
| "step_time": 30.701956771399274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 185.6, |
| "completions/max_terminated_length": 178.8, |
| "completions/mean_length": 104.825, |
| "completions/mean_terminated_length": 98.86785888671875, |
| "completions/min_length": 41.4, |
| "completions/min_terminated_length": 41.4, |
| "entropy": 0.12376428130082787, |
| "epoch": 82.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.19921875, |
| "learning_rate": 4.333333333333334e-06, |
| "loss": 0.04524213671684265, |
| "num_tokens": 458018.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.42426406145095824, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.42426406145095824, |
| "step": 165, |
| "step_time": 29.50906059279878 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 201.0, |
| "completions/max_terminated_length": 177.4, |
| "completions/mean_length": 107.4, |
| "completions/mean_terminated_length": 102.47142944335937, |
| "completions/min_length": 53.0, |
| "completions/min_terminated_length": 53.0, |
| "entropy": 0.1829328211490065, |
| "epoch": 85.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.0, |
| "learning_rate": 4.310185185185186e-06, |
| "loss": 0.05608509778976441, |
| "num_tokens": 472482.0, |
| "reward": 0.5000000089406967, |
| "reward_std": 0.5336050391197205, |
| "rewards/reward_fn/mean": 0.5000000089406967, |
| "rewards/reward_fn/std": 0.5336050629615784, |
| "step": 170, |
| "step_time": 31.589203411201016 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 142.6, |
| "completions/max_terminated_length": 134.4, |
| "completions/mean_length": 90.575, |
| "completions/mean_terminated_length": 88.62142944335938, |
| "completions/min_length": 60.8, |
| "completions/min_terminated_length": 60.8, |
| "entropy": 0.12587652734946458, |
| "epoch": 87.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 4.2870370370370376e-06, |
| "loss": 0.006346069276332855, |
| "num_tokens": 486009.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.1851640224456787, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.1851640224456787, |
| "step": 175, |
| "step_time": 24.074513484201454 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 172.4, |
| "completions/max_terminated_length": 156.4, |
| "completions/mean_length": 113.225, |
| "completions/mean_terminated_length": 107.86666870117188, |
| "completions/min_length": 62.8, |
| "completions/min_terminated_length": 62.8, |
| "entropy": 0.13538812827318908, |
| "epoch": 90.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 4.263888888888889e-06, |
| "loss": 0.04661061465740204, |
| "num_tokens": 500734.0, |
| "reward": 0.5999920129776001, |
| "reward_std": 0.46800349950790404, |
| "rewards/reward_fn/mean": 0.5999920129776001, |
| "rewards/reward_fn/std": 0.468003511428833, |
| "step": 180, |
| "step_time": 27.94757253280186 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 196.8, |
| "completions/max_terminated_length": 179.4, |
| "completions/mean_length": 105.125, |
| "completions/mean_terminated_length": 100.6, |
| "completions/min_length": 23.2, |
| "completions/min_terminated_length": 23.2, |
| "entropy": 0.14581037967000157, |
| "epoch": 92.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 4.240740740740741e-06, |
| "loss": 0.0037437133491039277, |
| "num_tokens": 515087.0, |
| "reward": 0.7999920129776001, |
| "reward_std": 2.263165224576369e-05, |
| "rewards/reward_fn/mean": 0.7999920129776001, |
| "rewards/reward_fn/std": 2.2628642909694463e-05, |
| "step": 185, |
| "step_time": 31.02160367520264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 154.2, |
| "completions/max_terminated_length": 139.4, |
| "completions/mean_length": 91.275, |
| "completions/mean_terminated_length": 88.80357360839844, |
| "completions/min_length": 55.8, |
| "completions/min_terminated_length": 55.8, |
| "entropy": 0.1140737212728709, |
| "epoch": 95.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1728515625, |
| "learning_rate": 4.217592592592593e-06, |
| "loss": 0.029188913106918336, |
| "num_tokens": 528690.0, |
| "reward": 0.600000011920929, |
| "reward_std": 0.4680067300796509, |
| "rewards/reward_fn/mean": 0.600000011920929, |
| "rewards/reward_fn/std": 0.4680067300796509, |
| "step": 190, |
| "step_time": 25.543457056801707 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 187.0, |
| "completions/max_terminated_length": 171.8, |
| "completions/mean_length": 81.625, |
| "completions/mean_terminated_length": 77.18928680419921, |
| "completions/min_length": 19.4, |
| "completions/min_terminated_length": 19.4, |
| "entropy": 0.114993113768287, |
| "epoch": 97.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 4.194444444444445e-06, |
| "loss": 0.0800092101097107, |
| "num_tokens": 541859.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 195, |
| "step_time": 29.684030519998487 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 188.2, |
| "completions/max_terminated_length": 171.6, |
| "completions/mean_length": 114.675, |
| "completions/mean_terminated_length": 107.88214416503907, |
| "completions/min_length": 63.4, |
| "completions/min_terminated_length": 63.4, |
| "entropy": 0.14553113598376513, |
| "epoch": 100.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.154296875, |
| "learning_rate": 4.171296296296297e-06, |
| "loss": 0.05318028926849365, |
| "num_tokens": 556642.0, |
| "reward": 0.6450000047683716, |
| "reward_std": 0.4384061962366104, |
| "rewards/reward_fn/mean": 0.6450000047683716, |
| "rewards/reward_fn/std": 0.4384061962366104, |
| "step": 200, |
| "step_time": 29.818907721999857 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 213.6, |
| "completions/max_terminated_length": 182.8, |
| "completions/mean_length": 106.8, |
| "completions/mean_terminated_length": 98.99285888671875, |
| "completions/min_length": 31.6, |
| "completions/min_terminated_length": 31.6, |
| "entropy": 0.14924523658119143, |
| "epoch": 102.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1748046875, |
| "learning_rate": 4.1481481481481485e-06, |
| "loss": 0.0724250853061676, |
| "num_tokens": 571082.0, |
| "reward": 0.600000011920929, |
| "reward_std": 0.4680067300796509, |
| "rewards/reward_fn/mean": 0.600000011920929, |
| "rewards/reward_fn/std": 0.4680067300796509, |
| "step": 205, |
| "step_time": 33.18914894179907 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 166.6, |
| "completions/max_terminated_length": 166.6, |
| "completions/mean_length": 88.725, |
| "completions/mean_terminated_length": 88.725, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.12393170983996242, |
| "epoch": 105.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.24609375, |
| "learning_rate": 4.125e-06, |
| "loss": 0.022324693202972413, |
| "num_tokens": 584563.0, |
| "reward": 0.597000002861023, |
| "reward_std": 0.47649201303720473, |
| "rewards/reward_fn/mean": 0.597000002861023, |
| "rewards/reward_fn/std": 0.47649201229214666, |
| "step": 210, |
| "step_time": 27.244719348003127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.6, |
| "completions/max_terminated_length": 181.6, |
| "completions/mean_length": 104.95, |
| "completions/mean_terminated_length": 104.95, |
| "completions/min_length": 40.0, |
| "completions/min_terminated_length": 40.0, |
| "entropy": 0.10806208297144622, |
| "epoch": 107.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 4.101851851851852e-06, |
| "loss": -0.012457241863012313, |
| "num_tokens": 598957.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.42426406145095824, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.42426406145095824, |
| "step": 215, |
| "step_time": 29.092179798799044 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 175.2, |
| "completions/max_terminated_length": 162.8, |
| "completions/mean_length": 89.175, |
| "completions/mean_terminated_length": 85.25714416503907, |
| "completions/min_length": 27.6, |
| "completions/min_terminated_length": 27.6, |
| "entropy": 0.13876313052605838, |
| "epoch": 110.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 4.078703703703704e-06, |
| "loss": 0.016936567425727845, |
| "num_tokens": 612428.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.1851640224456787, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.1851640224456787, |
| "step": 220, |
| "step_time": 28.282268692799697 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 175.0, |
| "completions/max_terminated_length": 165.2, |
| "completions/mean_length": 100.275, |
| "completions/mean_terminated_length": 95.95, |
| "completions/min_length": 48.6, |
| "completions/min_terminated_length": 48.6, |
| "entropy": 0.14033891165163367, |
| "epoch": 112.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 4.055555555555556e-06, |
| "loss": 0.03414565920829773, |
| "num_tokens": 626587.0, |
| "reward": 0.6999920129776, |
| "reward_std": 0.18518665409792448, |
| "rewards/reward_fn/mean": 0.6999920129776, |
| "rewards/reward_fn/std": 0.1851866510885884, |
| "step": 225, |
| "step_time": 28.14439602499842 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 192.6, |
| "completions/max_terminated_length": 184.4, |
| "completions/mean_length": 96.575, |
| "completions/mean_terminated_length": 92.39285736083984, |
| "completions/min_length": 43.0, |
| "completions/min_terminated_length": 43.0, |
| "entropy": 0.13608734677545725, |
| "epoch": 115.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.138671875, |
| "learning_rate": 4.032407407407408e-06, |
| "loss": 0.052947860956192014, |
| "num_tokens": 640402.0, |
| "reward": 0.6969920039176941, |
| "reward_std": 0.2913506222437718, |
| "rewards/reward_fn/mean": 0.6969920039176941, |
| "rewards/reward_fn/std": 0.29135061848937766, |
| "step": 230, |
| "step_time": 30.501605717404892 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 192.6, |
| "completions/max_terminated_length": 174.4, |
| "completions/mean_length": 96.85, |
| "completions/mean_terminated_length": 92.18214416503906, |
| "completions/min_length": 17.6, |
| "completions/min_terminated_length": 17.6, |
| "entropy": 0.12447628125082702, |
| "epoch": 117.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 4.0092592592592594e-06, |
| "loss": 0.04736369252204895, |
| "num_tokens": 654424.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 235, |
| "step_time": 30.51819963699527 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.4, |
| "completions/max_terminated_length": 173.4, |
| "completions/mean_length": 88.6, |
| "completions/mean_terminated_length": 88.6, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.11185428954195231, |
| "epoch": 120.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.986111111111112e-06, |
| "loss": 0.028244262933731078, |
| "num_tokens": 667920.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 240, |
| "step_time": 28.104555083598825 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 156.6, |
| "completions/max_terminated_length": 148.4, |
| "completions/mean_length": 101.75, |
| "completions/mean_terminated_length": 99.26428833007813, |
| "completions/min_length": 61.8, |
| "completions/min_terminated_length": 61.8, |
| "entropy": 0.15239415562245995, |
| "epoch": 122.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.962962962962963e-06, |
| "loss": 0.016936178505420684, |
| "num_tokens": 682138.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 245, |
| "step_time": 25.842140119400575 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 182.4, |
| "completions/max_terminated_length": 168.6, |
| "completions/mean_length": 96.175, |
| "completions/mean_terminated_length": 91.94642944335938, |
| "completions/min_length": 35.4, |
| "completions/min_terminated_length": 35.4, |
| "entropy": 0.12800856186076998, |
| "epoch": 125.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.939814814814815e-06, |
| "loss": 0.02638625502586365, |
| "num_tokens": 695937.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 250, |
| "step_time": 29.15611189340343 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 153.8, |
| "completions/max_terminated_length": 153.8, |
| "completions/mean_length": 81.275, |
| "completions/mean_terminated_length": 81.275, |
| "completions/min_length": 27.0, |
| "completions/min_terminated_length": 27.0, |
| "entropy": 0.14729445518460124, |
| "epoch": 127.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.916666666666667e-06, |
| "loss": 0.017367249727249144, |
| "num_tokens": 709092.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.1851640224456787, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.1851640224456787, |
| "step": 255, |
| "step_time": 25.567990457000995 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.8, |
| "completions/max_terminated_length": 199.8, |
| "completions/mean_length": 101.3, |
| "completions/mean_terminated_length": 101.3, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 0.1291541094193235, |
| "epoch": 130.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.8935185185185185e-06, |
| "loss": 0.0, |
| "num_tokens": 723340.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 260, |
| "step_time": 31.396876741403684 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 203.0, |
| "completions/max_terminated_length": 183.0, |
| "completions/mean_length": 103.7, |
| "completions/mean_terminated_length": 91.9607147216797, |
| "completions/min_length": 25.8, |
| "completions/min_terminated_length": 25.8, |
| "entropy": 0.14293926188256592, |
| "epoch": 132.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.87037037037037e-06, |
| "loss": 0.015573790669441223, |
| "num_tokens": 737656.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 265, |
| "step_time": 31.94191468279896 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 190.0, |
| "completions/max_terminated_length": 190.0, |
| "completions/mean_length": 102.075, |
| "completions/mean_terminated_length": 102.075, |
| "completions/min_length": 29.4, |
| "completions/min_terminated_length": 29.4, |
| "entropy": 0.12520905593410134, |
| "epoch": 135.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.103515625, |
| "learning_rate": 3.847222222222223e-06, |
| "loss": -0.010684289038181305, |
| "num_tokens": 751671.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 270, |
| "step_time": 30.233847617598077 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.6, |
| "completions/max_terminated_length": 198.6, |
| "completions/mean_length": 101.05, |
| "completions/mean_terminated_length": 101.05, |
| "completions/min_length": 29.2, |
| "completions/min_terminated_length": 29.2, |
| "entropy": 0.1348960422212258, |
| "epoch": 137.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.824074074074075e-06, |
| "loss": -0.008155962824821473, |
| "num_tokens": 765881.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 275, |
| "step_time": 31.395782154197512 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 189.8, |
| "completions/max_terminated_length": 170.8, |
| "completions/mean_length": 92.95, |
| "completions/mean_terminated_length": 88.30357208251954, |
| "completions/min_length": 27.6, |
| "completions/min_terminated_length": 27.6, |
| "entropy": 0.11601264334749431, |
| "epoch": 140.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.8009259259259263e-06, |
| "loss": 0.0, |
| "num_tokens": 779531.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 280, |
| "step_time": 30.19007007140026 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 199.4, |
| "completions/max_terminated_length": 186.0, |
| "completions/mean_length": 111.55, |
| "completions/mean_terminated_length": 108.88571472167969, |
| "completions/min_length": 40.6, |
| "completions/min_terminated_length": 40.6, |
| "entropy": 0.12823146923910828, |
| "epoch": 142.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 3.777777777777778e-06, |
| "loss": 0.016540104150772096, |
| "num_tokens": 794189.0, |
| "reward": 0.7007920145988464, |
| "reward_std": 0.2806025862693787, |
| "rewards/reward_fn/mean": 0.7007920145988464, |
| "rewards/reward_fn/std": 0.28060259819030764, |
| "step": 285, |
| "step_time": 31.41049292399839 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 131.6, |
| "completions/max_terminated_length": 131.6, |
| "completions/mean_length": 85.425, |
| "completions/mean_terminated_length": 85.425, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 0.1431839597178623, |
| "epoch": 145.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.75462962962963e-06, |
| "loss": -0.008183705806732177, |
| "num_tokens": 807510.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 290, |
| "step_time": 22.681272815001286 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 173.0, |
| "completions/max_terminated_length": 157.4, |
| "completions/mean_length": 105.45, |
| "completions/mean_terminated_length": 103.09285888671874, |
| "completions/min_length": 60.2, |
| "completions/min_terminated_length": 60.2, |
| "entropy": 0.12795310239307583, |
| "epoch": 147.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.11328125, |
| "learning_rate": 3.731481481481482e-06, |
| "loss": 0.005563179403543473, |
| "num_tokens": 821924.0, |
| "reward": 0.6000000089406967, |
| "reward_std": 0.34844101667404176, |
| "rewards/reward_fn/mean": 0.6000000089406967, |
| "rewards/reward_fn/std": 0.34844104051589964, |
| "step": 295, |
| "step_time": 28.041595162999876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 142.4, |
| "completions/max_terminated_length": 142.4, |
| "completions/mean_length": 85.8, |
| "completions/mean_terminated_length": 85.8, |
| "completions/min_length": 38.4, |
| "completions/min_terminated_length": 38.4, |
| "entropy": 0.12552432040683925, |
| "epoch": 150.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.708333333333334e-06, |
| "loss": 0.025445324182510377, |
| "num_tokens": 835260.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 300, |
| "step_time": 24.1276477496016 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 194.8, |
| "completions/max_terminated_length": 177.4, |
| "completions/mean_length": 100.125, |
| "completions/mean_terminated_length": 95.59285736083984, |
| "completions/min_length": 36.6, |
| "completions/min_terminated_length": 36.6, |
| "entropy": 0.14587228901218624, |
| "epoch": 152.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1435546875, |
| "learning_rate": 3.6851851851851854e-06, |
| "loss": 0.045100918412208556, |
| "num_tokens": 849413.0, |
| "reward": 0.7499840140342713, |
| "reward_std": 0.14146661712147762, |
| "rewards/reward_fn/mean": 0.7499840140342713, |
| "rewards/reward_fn/std": 0.14146661110280548, |
| "step": 305, |
| "step_time": 30.895152483399578 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 164.4, |
| "completions/max_terminated_length": 164.4, |
| "completions/mean_length": 95.8, |
| "completions/mean_terminated_length": 95.8, |
| "completions/min_length": 45.0, |
| "completions/min_terminated_length": 45.0, |
| "entropy": 0.12987288122531027, |
| "epoch": 155.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.0, |
| "learning_rate": 3.662037037037037e-06, |
| "loss": -0.010421520471572876, |
| "num_tokens": 863197.0, |
| "reward": 0.7439999938011169, |
| "reward_std": 0.14894575029611587, |
| "rewards/reward_fn/mean": 0.7439999938011169, |
| "rewards/reward_fn/std": 0.14894574955105783, |
| "step": 310, |
| "step_time": 26.97180611779986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 128.8, |
| "completions/max_terminated_length": 128.8, |
| "completions/mean_length": 88.525, |
| "completions/mean_terminated_length": 88.525, |
| "completions/min_length": 55.6, |
| "completions/min_terminated_length": 55.6, |
| "entropy": 0.12343773562461138, |
| "epoch": 157.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.638888888888889e-06, |
| "loss": -2.3096799850463868e-08, |
| "num_tokens": 876690.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 315, |
| "step_time": 22.35670812440003 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 162.2, |
| "completions/max_terminated_length": 162.2, |
| "completions/mean_length": 93.075, |
| "completions/mean_terminated_length": 93.075, |
| "completions/min_length": 40.6, |
| "completions/min_terminated_length": 40.6, |
| "entropy": 0.15649742879904807, |
| "epoch": 160.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.615740740740741e-06, |
| "loss": 0.0, |
| "num_tokens": 890561.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 320, |
| "step_time": 26.563862933603378 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 170.8, |
| "completions/max_terminated_length": 170.8, |
| "completions/mean_length": 92.8, |
| "completions/mean_terminated_length": 92.8, |
| "completions/min_length": 31.4, |
| "completions/min_terminated_length": 31.4, |
| "entropy": 0.12219774469267577, |
| "epoch": 162.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.232421875, |
| "learning_rate": 3.592592592592593e-06, |
| "loss": 0.0173223078250885, |
| "num_tokens": 904205.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 325, |
| "step_time": 27.899399954797264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 168.0, |
| "completions/max_terminated_length": 168.0, |
| "completions/mean_length": 102.475, |
| "completions/mean_terminated_length": 102.475, |
| "completions/min_length": 37.8, |
| "completions/min_terminated_length": 37.8, |
| "entropy": 0.16676131393760443, |
| "epoch": 165.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.569444444444445e-06, |
| "loss": -1.1920928955078126e-08, |
| "num_tokens": 918472.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 330, |
| "step_time": 27.33713496620112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 165.0, |
| "completions/max_terminated_length": 165.0, |
| "completions/mean_length": 94.2, |
| "completions/mean_terminated_length": 94.2, |
| "completions/min_length": 36.4, |
| "completions/min_terminated_length": 36.4, |
| "entropy": 0.13035173853859305, |
| "epoch": 167.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.5462962962962967e-06, |
| "loss": 0.02846188545227051, |
| "num_tokens": 932388.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 335, |
| "step_time": 27.027245131201198 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 182.2, |
| "completions/max_terminated_length": 182.2, |
| "completions/mean_length": 95.775, |
| "completions/mean_terminated_length": 95.775, |
| "completions/min_length": 33.6, |
| "completions/min_terminated_length": 33.6, |
| "entropy": 0.13310553296469152, |
| "epoch": 170.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.193359375, |
| "learning_rate": 3.523148148148148e-06, |
| "loss": 0.001617179811000824, |
| "num_tokens": 946171.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 340, |
| "step_time": 29.20151192679841 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 192.8, |
| "completions/max_terminated_length": 181.0, |
| "completions/mean_length": 103.925, |
| "completions/mean_terminated_length": 100.5750015258789, |
| "completions/min_length": 30.0, |
| "completions/min_terminated_length": 30.0, |
| "entropy": 0.15284172971732915, |
| "epoch": 172.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 3.5e-06, |
| "loss": -0.0006755538284778595, |
| "num_tokens": 960496.0, |
| "reward": 0.7999920129776001, |
| "reward_std": 2.263165224576369e-05, |
| "rewards/reward_fn/mean": 0.7999920129776001, |
| "rewards/reward_fn/std": 2.2628642909694463e-05, |
| "step": 345, |
| "step_time": 30.46660940139991 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.0, |
| "completions/max_terminated_length": 183.0, |
| "completions/mean_length": 96.9, |
| "completions/mean_terminated_length": 96.9, |
| "completions/min_length": 28.4, |
| "completions/min_terminated_length": 28.4, |
| "entropy": 0.1217897635884583, |
| "epoch": 175.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.201171875, |
| "learning_rate": 3.476851851851852e-06, |
| "loss": 0.0045785665512084964, |
| "num_tokens": 974304.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 350, |
| "step_time": 29.460710451801425 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 189.8, |
| "completions/max_terminated_length": 180.2, |
| "completions/mean_length": 108.925, |
| "completions/mean_terminated_length": 106.39642944335938, |
| "completions/min_length": 46.2, |
| "completions/min_terminated_length": 46.2, |
| "entropy": 0.13436518078669907, |
| "epoch": 177.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.453703703703704e-06, |
| "loss": -0.00737251341342926, |
| "num_tokens": 988857.0, |
| "reward": 0.7511820077896119, |
| "reward_std": 0.13807815313339233, |
| "rewards/reward_fn/mean": 0.7511820077896119, |
| "rewards/reward_fn/std": 0.13807815313339233, |
| "step": 355, |
| "step_time": 30.16567378140171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 144.2, |
| "completions/max_terminated_length": 144.2, |
| "completions/mean_length": 81.875, |
| "completions/mean_terminated_length": 81.875, |
| "completions/min_length": 30.8, |
| "completions/min_terminated_length": 30.8, |
| "entropy": 0.1366418529767543, |
| "epoch": 180.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.430555555555556e-06, |
| "loss": 0.017211392521858215, |
| "num_tokens": 1002036.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 360, |
| "step_time": 24.196757839000202 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 160.8, |
| "completions/max_terminated_length": 160.8, |
| "completions/mean_length": 97.65, |
| "completions/mean_terminated_length": 97.65, |
| "completions/min_length": 49.8, |
| "completions/min_terminated_length": 49.8, |
| "entropy": 0.11701173500623555, |
| "epoch": 182.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.4074074074074077e-06, |
| "loss": 0.0, |
| "num_tokens": 1015874.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 365, |
| "step_time": 26.509838377000413 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 189.8, |
| "completions/max_terminated_length": 170.8, |
| "completions/mean_length": 104.2, |
| "completions/mean_terminated_length": 96.73333435058593, |
| "completions/min_length": 46.2, |
| "completions/min_terminated_length": 46.2, |
| "entropy": 0.1576181524200365, |
| "epoch": 185.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.3842592592592595e-06, |
| "loss": 0.016841967403888703, |
| "num_tokens": 1030210.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 370, |
| "step_time": 30.090859549198647 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 154.6, |
| "completions/max_terminated_length": 150.6, |
| "completions/mean_length": 78.825, |
| "completions/mean_terminated_length": 75.07857208251953, |
| "completions/min_length": 19.4, |
| "completions/min_terminated_length": 19.4, |
| "entropy": 0.1031433446565643, |
| "epoch": 187.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.3611111111111117e-06, |
| "loss": 0.016514861583709718, |
| "num_tokens": 1043267.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 375, |
| "step_time": 25.698997188800423 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 179.6, |
| "completions/max_terminated_length": 179.6, |
| "completions/mean_length": 104.575, |
| "completions/mean_terminated_length": 104.575, |
| "completions/min_length": 40.4, |
| "completions/min_terminated_length": 40.4, |
| "entropy": 0.13036173072177917, |
| "epoch": 190.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.3379629629629636e-06, |
| "loss": -0.000981520116329193, |
| "num_tokens": 1057646.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 380, |
| "step_time": 28.867799249802193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 158.8, |
| "completions/max_terminated_length": 158.8, |
| "completions/mean_length": 110.175, |
| "completions/mean_terminated_length": 110.175, |
| "completions/min_length": 72.6, |
| "completions/min_terminated_length": 72.6, |
| "entropy": 0.14139417342375965, |
| "epoch": 192.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.314814814814815e-06, |
| "loss": -0.009847503900527955, |
| "num_tokens": 1072249.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 385, |
| "step_time": 26.117895688798306 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 162.0, |
| "completions/max_terminated_length": 149.8, |
| "completions/mean_length": 90.825, |
| "completions/mean_terminated_length": 86.45, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 0.1398956686956808, |
| "epoch": 195.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.361328125, |
| "learning_rate": 3.2916666666666668e-06, |
| "loss": 0.07702720761299134, |
| "num_tokens": 1085786.0, |
| "reward": 0.7507840156555176, |
| "reward_std": 0.13920386410463834, |
| "rewards/reward_fn/mean": 0.7507840156555176, |
| "rewards/reward_fn/std": 0.13920387301623122, |
| "step": 390, |
| "step_time": 26.53806324480174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 184.8, |
| "completions/max_terminated_length": 184.8, |
| "completions/mean_length": 95.3, |
| "completions/mean_terminated_length": 95.3, |
| "completions/min_length": 25.8, |
| "completions/min_terminated_length": 25.8, |
| "entropy": 0.11398763118777425, |
| "epoch": 197.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.25, |
| "learning_rate": 3.2685185185185186e-06, |
| "loss": 0.013139170408248902, |
| "num_tokens": 1099550.0, |
| "reward": 0.6969920039176941, |
| "reward_std": 0.2819044408868649, |
| "rewards/reward_fn/mean": 0.6969920039176941, |
| "rewards/reward_fn/std": 0.28190446171938677, |
| "step": 395, |
| "step_time": 29.58767645379921 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 191.6, |
| "completions/max_terminated_length": 191.6, |
| "completions/mean_length": 106.725, |
| "completions/mean_terminated_length": 106.725, |
| "completions/min_length": 43.2, |
| "completions/min_terminated_length": 43.2, |
| "entropy": 0.1625191917642951, |
| "epoch": 200.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.2453703703703704e-06, |
| "loss": 0.0, |
| "num_tokens": 1113967.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 400, |
| "step_time": 30.439370583198617 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 146.0, |
| "completions/max_terminated_length": 146.0, |
| "completions/mean_length": 77.4, |
| "completions/mean_terminated_length": 77.4, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.1114686549641192, |
| "epoch": 202.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.2222222222222227e-06, |
| "loss": 0.016538560390472412, |
| "num_tokens": 1126967.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 405, |
| "step_time": 24.568558594400383 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 176.8, |
| "completions/max_terminated_length": 160.0, |
| "completions/mean_length": 105.125, |
| "completions/mean_terminated_length": 102.4607147216797, |
| "completions/min_length": 55.4, |
| "completions/min_terminated_length": 55.4, |
| "entropy": 0.14484606825280935, |
| "epoch": 205.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.0, |
| "learning_rate": 3.1990740740740745e-06, |
| "loss": 0.04012419581413269, |
| "num_tokens": 1141368.0, |
| "reward": 0.6499920129776001, |
| "reward_std": 0.424286693103204, |
| "rewards/reward_fn/mean": 0.6499920129776001, |
| "rewards/reward_fn/std": 0.42428669009386794, |
| "step": 410, |
| "step_time": 28.52794066679926 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 164.8, |
| "completions/max_terminated_length": 164.8, |
| "completions/mean_length": 79.375, |
| "completions/mean_terminated_length": 79.375, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.1323097031097859, |
| "epoch": 207.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.1759259259259263e-06, |
| "loss": 0.0, |
| "num_tokens": 1154447.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 415, |
| "step_time": 26.996101147800072 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 167.0, |
| "completions/max_terminated_length": 157.6, |
| "completions/mean_length": 103.525, |
| "completions/mean_terminated_length": 99.01071472167969, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 0.11831333981826901, |
| "epoch": 210.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.19140625, |
| "learning_rate": 3.152777777777778e-06, |
| "loss": 0.015304601192474366, |
| "num_tokens": 1168784.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 420, |
| "step_time": 27.225987496596645 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 180.4, |
| "completions/max_terminated_length": 165.4, |
| "completions/mean_length": 98.0, |
| "completions/mean_terminated_length": 93.45714416503907, |
| "completions/min_length": 45.8, |
| "completions/min_terminated_length": 45.8, |
| "entropy": 0.13548461146419868, |
| "epoch": 212.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.1296296296296295e-06, |
| "loss": -0.009428337216377258, |
| "num_tokens": 1182852.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 425, |
| "step_time": 29.033930897598474 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 175.6, |
| "completions/max_terminated_length": 175.6, |
| "completions/mean_length": 92.675, |
| "completions/mean_terminated_length": 92.675, |
| "completions/min_length": 36.2, |
| "completions/min_terminated_length": 36.2, |
| "entropy": 0.10527189422864466, |
| "epoch": 215.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.1064814814814818e-06, |
| "loss": -2.16066837310791e-08, |
| "num_tokens": 1196511.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 430, |
| "step_time": 28.445749506798894 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 182.8, |
| "completions/max_terminated_length": 182.8, |
| "completions/mean_length": 107.5, |
| "completions/mean_terminated_length": 107.5, |
| "completions/min_length": 39.2, |
| "completions/min_terminated_length": 39.2, |
| "entropy": 0.13915550753008574, |
| "epoch": 217.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.1435546875, |
| "learning_rate": 3.0833333333333336e-06, |
| "loss": 0.04852951169013977, |
| "num_tokens": 1211007.0, |
| "reward": 0.6999920129776, |
| "reward_std": 0.28286533928621793, |
| "rewards/reward_fn/mean": 0.6999920129776, |
| "rewards/reward_fn/std": 0.2828653362768819, |
| "step": 435, |
| "step_time": 29.242199634596183 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 142.4, |
| "completions/max_terminated_length": 126.8, |
| "completions/mean_length": 84.875, |
| "completions/mean_terminated_length": 82.12857360839844, |
| "completions/min_length": 50.6, |
| "completions/min_terminated_length": 50.6, |
| "entropy": 0.12801749436184764, |
| "epoch": 220.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 3.0601851851851854e-06, |
| "loss": 0.023633134365081788, |
| "num_tokens": 1224306.0, |
| "reward": 0.6500000089406968, |
| "reward_std": 0.20701966285705567, |
| "rewards/reward_fn/mean": 0.6500000089406968, |
| "rewards/reward_fn/std": 0.20701968669891357, |
| "step": 440, |
| "step_time": 24.112674611999683 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 171.4, |
| "completions/max_terminated_length": 144.0, |
| "completions/mean_length": 89.325, |
| "completions/mean_terminated_length": 82.22142944335937, |
| "completions/min_length": 40.6, |
| "completions/min_terminated_length": 40.6, |
| "entropy": 0.14441313657443972, |
| "epoch": 222.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.287109375, |
| "learning_rate": 3.0370370370370372e-06, |
| "loss": 0.061201339960098265, |
| "num_tokens": 1237783.0, |
| "reward": 0.6499920129776001, |
| "reward_std": 0.424286693103204, |
| "rewards/reward_fn/mean": 0.6499920129776001, |
| "rewards/reward_fn/std": 0.42428669009386794, |
| "step": 445, |
| "step_time": 27.72405263900073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.6, |
| "completions/max_terminated_length": 177.6, |
| "completions/mean_length": 101.2, |
| "completions/mean_terminated_length": 101.2, |
| "completions/min_length": 41.4, |
| "completions/min_terminated_length": 41.4, |
| "entropy": 0.1256763830780983, |
| "epoch": 225.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.013888888888889e-06, |
| "loss": 0.0, |
| "num_tokens": 1252027.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 450, |
| "step_time": 28.645315791801114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 153.8, |
| "completions/max_terminated_length": 153.8, |
| "completions/mean_length": 106.525, |
| "completions/mean_terminated_length": 106.525, |
| "completions/min_length": 72.4, |
| "completions/min_terminated_length": 72.4, |
| "entropy": 0.11539835934527218, |
| "epoch": 227.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.134765625, |
| "learning_rate": 2.990740740740741e-06, |
| "loss": -0.008653049170970917, |
| "num_tokens": 1266484.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.1851640224456787, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.1851640224456787, |
| "step": 455, |
| "step_time": 25.554188429601346 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 169.6, |
| "completions/max_terminated_length": 166.8, |
| "completions/mean_length": 84.45, |
| "completions/mean_terminated_length": 80.72142944335937, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.13924790399614723, |
| "epoch": 230.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.158203125, |
| "learning_rate": 2.967592592592593e-06, |
| "loss": 0.03869241774082184, |
| "num_tokens": 1279766.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 460, |
| "step_time": 27.48372414899932 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 189.8, |
| "completions/max_terminated_length": 189.8, |
| "completions/mean_length": 93.25, |
| "completions/mean_terminated_length": 93.25, |
| "completions/min_length": 18.0, |
| "completions/min_terminated_length": 18.0, |
| "entropy": 0.11793683604337275, |
| "epoch": 232.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 2.944444444444445e-06, |
| "loss": 0.012316146492958069, |
| "num_tokens": 1293448.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 465, |
| "step_time": 30.217514709999524 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 170.4, |
| "completions/max_terminated_length": 156.6, |
| "completions/mean_length": 93.975, |
| "completions/mean_terminated_length": 90.22500152587891, |
| "completions/min_length": 27.4, |
| "completions/min_terminated_length": 27.4, |
| "entropy": 0.1507271576556377, |
| "epoch": 235.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.9212962962962964e-06, |
| "loss": 0.024145886301994324, |
| "num_tokens": 1307355.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 470, |
| "step_time": 27.92282026600078 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 174.2, |
| "completions/max_terminated_length": 160.0, |
| "completions/mean_length": 103.6, |
| "completions/mean_terminated_length": 100.88214416503907, |
| "completions/min_length": 39.2, |
| "completions/min_terminated_length": 39.2, |
| "entropy": 0.14852485329611226, |
| "epoch": 237.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.898148148148148e-06, |
| "loss": 0.0, |
| "num_tokens": 1321667.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 475, |
| "step_time": 28.292469495798287 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 151.6, |
| "completions/max_terminated_length": 151.6, |
| "completions/mean_length": 87.05, |
| "completions/mean_terminated_length": 87.05, |
| "completions/min_length": 34.2, |
| "completions/min_terminated_length": 34.2, |
| "entropy": 0.1000554851605557, |
| "epoch": 240.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.875e-06, |
| "loss": 0.01600448936223984, |
| "num_tokens": 1335081.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 480, |
| "step_time": 25.202083195801244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 151.6, |
| "completions/max_terminated_length": 151.6, |
| "completions/mean_length": 82.025, |
| "completions/mean_terminated_length": 82.025, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.1268925666459836, |
| "epoch": 242.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.8518518518518522e-06, |
| "loss": 0.009142126142978668, |
| "num_tokens": 1348266.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 485, |
| "step_time": 25.32069047859695 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 214.0, |
| "completions/max_terminated_length": 184.2, |
| "completions/mean_length": 112.625, |
| "completions/mean_terminated_length": 105.24642944335938, |
| "completions/min_length": 45.6, |
| "completions/min_terminated_length": 45.6, |
| "entropy": 0.13998855941463262, |
| "epoch": 245.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.828703703703704e-06, |
| "loss": 0.06989994049072265, |
| "num_tokens": 1362967.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 490, |
| "step_time": 33.36453152959875 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 188.8, |
| "completions/max_terminated_length": 188.8, |
| "completions/mean_length": 104.725, |
| "completions/mean_terminated_length": 104.725, |
| "completions/min_length": 43.8, |
| "completions/min_terminated_length": 43.8, |
| "entropy": 0.1694211942027323, |
| "epoch": 247.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.805555555555556e-06, |
| "loss": 0.006063133478164673, |
| "num_tokens": 1377324.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 495, |
| "step_time": 29.983117254402895 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 153.8, |
| "completions/max_terminated_length": 153.8, |
| "completions/mean_length": 83.875, |
| "completions/mean_terminated_length": 83.875, |
| "completions/min_length": 27.0, |
| "completions/min_terminated_length": 27.0, |
| "entropy": 0.12524008078034968, |
| "epoch": 250.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.146484375, |
| "learning_rate": 2.7824074074074077e-06, |
| "loss": 0.009342719614505769, |
| "num_tokens": 1390611.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 500, |
| "step_time": 25.58551082920021 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 138.2, |
| "completions/max_terminated_length": 138.2, |
| "completions/mean_length": 83.7, |
| "completions/mean_terminated_length": 83.7, |
| "completions/min_length": 35.6, |
| "completions/min_terminated_length": 35.6, |
| "entropy": 0.12294827501755208, |
| "epoch": 252.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.279296875, |
| "learning_rate": 2.759259259259259e-06, |
| "loss": 0.0008182898163795471, |
| "num_tokens": 1403863.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 505, |
| "step_time": 23.533826856199447 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 170.6, |
| "completions/max_terminated_length": 167.4, |
| "completions/mean_length": 101.25, |
| "completions/mean_terminated_length": 98.58928833007812, |
| "completions/min_length": 35.2, |
| "completions/min_terminated_length": 35.2, |
| "entropy": 0.11002964545041323, |
| "epoch": 255.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.7361111111111118e-06, |
| "loss": 0.01304551213979721, |
| "num_tokens": 1418109.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 510, |
| "step_time": 27.820240558599473 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 171.0, |
| "completions/max_terminated_length": 171.0, |
| "completions/mean_length": 94.05, |
| "completions/mean_terminated_length": 94.05, |
| "completions/min_length": 36.4, |
| "completions/min_terminated_length": 36.4, |
| "entropy": 0.1244091754895635, |
| "epoch": 257.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.712962962962963e-06, |
| "loss": 0.006943752616643905, |
| "num_tokens": 1432019.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 515, |
| "step_time": 27.71690519019976 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 164.6, |
| "completions/max_terminated_length": 164.6, |
| "completions/mean_length": 93.45, |
| "completions/mean_terminated_length": 93.45, |
| "completions/min_length": 36.2, |
| "completions/min_terminated_length": 36.2, |
| "entropy": 0.11850596296135336, |
| "epoch": 260.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.689814814814815e-06, |
| "loss": 0.03078709840774536, |
| "num_tokens": 1445709.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14046046733856202, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.14046047925949096, |
| "step": 520, |
| "step_time": 26.998205948202667 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 154.0, |
| "completions/max_terminated_length": 154.0, |
| "completions/mean_length": 75.9, |
| "completions/mean_terminated_length": 75.9, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.10163291104836389, |
| "epoch": 262.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.42578125, |
| "learning_rate": 2.666666666666667e-06, |
| "loss": 0.02896396517753601, |
| "num_tokens": 1458649.0, |
| "reward": 0.7999920129776001, |
| "reward_std": 2.263165224576369e-05, |
| "rewards/reward_fn/mean": 0.7999920129776001, |
| "rewards/reward_fn/std": 2.2628642909694463e-05, |
| "step": 525, |
| "step_time": 25.67267432579829 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 167.4, |
| "completions/max_terminated_length": 167.4, |
| "completions/mean_length": 106.8, |
| "completions/mean_terminated_length": 106.8, |
| "completions/min_length": 60.2, |
| "completions/min_terminated_length": 60.2, |
| "entropy": 0.12018695392180234, |
| "epoch": 265.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 2.6435185185185187e-06, |
| "loss": -9.685754776000977e-09, |
| "num_tokens": 1473117.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 530, |
| "step_time": 27.306792101602333 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 165.2, |
| "completions/max_terminated_length": 165.2, |
| "completions/mean_length": 90.35, |
| "completions/mean_terminated_length": 90.35, |
| "completions/min_length": 33.0, |
| "completions/min_terminated_length": 33.0, |
| "entropy": 0.12258970123948529, |
| "epoch": 267.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.6203703703703705e-06, |
| "loss": 0.024295052886009215, |
| "num_tokens": 1486879.0, |
| "reward": 0.7507920145988465, |
| "reward_std": 0.13918123245239258, |
| "rewards/reward_fn/mean": 0.7507920145988465, |
| "rewards/reward_fn/std": 0.13918124437332152, |
| "step": 535, |
| "step_time": 27.07393169499992 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 202.0, |
| "completions/max_terminated_length": 175.4, |
| "completions/mean_length": 99.4, |
| "completions/mean_terminated_length": 90.5750015258789, |
| "completions/min_length": 27.6, |
| "completions/min_terminated_length": 27.6, |
| "entropy": 0.12210832491982729, |
| "epoch": 270.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.5972222222222227e-06, |
| "loss": 0.04415736496448517, |
| "num_tokens": 1500807.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 540, |
| "step_time": 31.728731169800447 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 138.0, |
| "completions/max_terminated_length": 138.0, |
| "completions/mean_length": 88.475, |
| "completions/mean_terminated_length": 88.475, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 0.1007203730638139, |
| "epoch": 272.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.2734375, |
| "learning_rate": 2.5740740740740745e-06, |
| "loss": 0.01148533821105957, |
| "num_tokens": 1514278.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 545, |
| "step_time": 23.616023338599916 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 181.6, |
| "completions/max_terminated_length": 166.0, |
| "completions/mean_length": 99.95, |
| "completions/mean_terminated_length": 95.3357147216797, |
| "completions/min_length": 42.6, |
| "completions/min_terminated_length": 42.6, |
| "entropy": 0.14508932372555136, |
| "epoch": 275.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.134765625, |
| "learning_rate": 2.550925925925926e-06, |
| "loss": 0.058039349317550656, |
| "num_tokens": 1528444.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 550, |
| "step_time": 29.27236720959845 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 172.6, |
| "completions/max_terminated_length": 172.6, |
| "completions/mean_length": 94.725, |
| "completions/mean_terminated_length": 94.725, |
| "completions/min_length": 26.8, |
| "completions/min_terminated_length": 26.8, |
| "entropy": 0.12917686544824392, |
| "epoch": 277.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.5277777777777778e-06, |
| "loss": 0.008541859686374664, |
| "num_tokens": 1542381.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 555, |
| "step_time": 28.04494361659672 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 166.0, |
| "completions/max_terminated_length": 166.0, |
| "completions/mean_length": 89.575, |
| "completions/mean_terminated_length": 89.575, |
| "completions/min_length": 43.2, |
| "completions/min_terminated_length": 43.2, |
| "entropy": 0.10766231055604294, |
| "epoch": 280.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.5046296296296296e-06, |
| "loss": 0.0, |
| "num_tokens": 1555916.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 560, |
| "step_time": 27.3150207976003 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 164.4, |
| "completions/max_terminated_length": 164.4, |
| "completions/mean_length": 81.35, |
| "completions/mean_terminated_length": 81.35, |
| "completions/min_length": 28.6, |
| "completions/min_terminated_length": 28.6, |
| "entropy": 0.12244645584141836, |
| "epoch": 282.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.481481481481482e-06, |
| "loss": 0.0, |
| "num_tokens": 1569074.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 565, |
| "step_time": 26.9950363395983 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 207.8, |
| "completions/max_terminated_length": 207.8, |
| "completions/mean_length": 115.3, |
| "completions/mean_terminated_length": 115.3, |
| "completions/min_length": 50.2, |
| "completions/min_terminated_length": 50.2, |
| "entropy": 0.1410753179807216, |
| "epoch": 285.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 2.4583333333333332e-06, |
| "loss": -0.0042663484811782835, |
| "num_tokens": 1583882.0, |
| "reward": 0.6040000081062317, |
| "reward_std": 0.46080578565597535, |
| "rewards/reward_fn/mean": 0.6040000081062317, |
| "rewards/reward_fn/std": 0.46080578565597535, |
| "step": 570, |
| "step_time": 32.51511334399911 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 157.6, |
| "completions/max_terminated_length": 157.6, |
| "completions/mean_length": 95.85, |
| "completions/mean_terminated_length": 95.85, |
| "completions/min_length": 50.6, |
| "completions/min_terminated_length": 50.6, |
| "entropy": 0.1270086049917154, |
| "epoch": 287.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.4351851851851855e-06, |
| "loss": 0.0031342685222625734, |
| "num_tokens": 1597648.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 575, |
| "step_time": 26.03918815540019 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 178.6, |
| "completions/max_terminated_length": 170.8, |
| "completions/mean_length": 103.725, |
| "completions/mean_terminated_length": 101.60357360839843, |
| "completions/min_length": 52.2, |
| "completions/min_terminated_length": 52.2, |
| "entropy": 0.14261129099177197, |
| "epoch": 290.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 2.4120370370370373e-06, |
| "loss": 0.010924074053764343, |
| "num_tokens": 1611965.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 580, |
| "step_time": 28.818242219599778 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.8, |
| "completions/max_terminated_length": 181.8, |
| "completions/mean_length": 99.45, |
| "completions/mean_terminated_length": 99.45, |
| "completions/min_length": 25.8, |
| "completions/min_terminated_length": 25.8, |
| "entropy": 0.14679535252507775, |
| "epoch": 292.5, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.193359375, |
| "learning_rate": 2.388888888888889e-06, |
| "loss": 0.04403604865074158, |
| "num_tokens": 1626111.0, |
| "reward": 0.6469920158386231, |
| "reward_std": 0.42329972982406616, |
| "rewards/reward_fn/mean": 0.6469920158386231, |
| "rewards/reward_fn/std": 0.42329972982406616, |
| "step": 585, |
| "step_time": 29.356267316001322 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 175.6, |
| "completions/max_terminated_length": 175.6, |
| "completions/mean_length": 90.075, |
| "completions/mean_terminated_length": 90.075, |
| "completions/min_length": 27.0, |
| "completions/min_terminated_length": 27.0, |
| "entropy": 0.11895611474756151, |
| "epoch": 295.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.365740740740741e-06, |
| "loss": 0.0307393878698349, |
| "num_tokens": 1639646.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 590, |
| "step_time": 28.44031162360261 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.6, |
| "completions/max_terminated_length": 163.6, |
| "completions/mean_length": 83.8, |
| "completions/mean_terminated_length": 83.8, |
| "completions/min_length": 28.0, |
| "completions/min_terminated_length": 28.0, |
| "entropy": 0.11074991283239796, |
| "epoch": 297.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.12060546875, |
| "learning_rate": 2.3425925925925928e-06, |
| "loss": -0.0017749778926372528, |
| "num_tokens": 1652930.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14990663677453994, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.1499066360294819, |
| "step": 595, |
| "step_time": 26.963711300198337 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 207.0, |
| "completions/max_terminated_length": 207.0, |
| "completions/mean_length": 100.125, |
| "completions/mean_terminated_length": 100.125, |
| "completions/min_length": 28.6, |
| "completions/min_terminated_length": 28.6, |
| "entropy": 0.15330470782937483, |
| "epoch": 300.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.3194444444444446e-06, |
| "loss": 0.0064928531646728516, |
| "num_tokens": 1667103.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14046045541763305, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.14046047925949096, |
| "step": 600, |
| "step_time": 32.507627704396874 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 190.2, |
| "completions/max_terminated_length": 190.2, |
| "completions/mean_length": 96.925, |
| "completions/mean_terminated_length": 96.925, |
| "completions/min_length": 29.2, |
| "completions/min_terminated_length": 29.2, |
| "entropy": 0.1422055270173587, |
| "epoch": 302.5, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.125, |
| "learning_rate": 2.2962962962962964e-06, |
| "loss": 0.008351585268974305, |
| "num_tokens": 1681128.0, |
| "reward": 0.6999920129776, |
| "reward_std": 0.28286533928621793, |
| "rewards/reward_fn/mean": 0.6999920129776, |
| "rewards/reward_fn/std": 0.2828653362768819, |
| "step": 605, |
| "step_time": 30.293829828397428 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 183.6, |
| "completions/max_terminated_length": 183.6, |
| "completions/mean_length": 94.025, |
| "completions/mean_terminated_length": 94.025, |
| "completions/min_length": 18.0, |
| "completions/min_terminated_length": 18.0, |
| "entropy": 0.10572184595512227, |
| "epoch": 305.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.203125, |
| "learning_rate": 2.2731481481481482e-06, |
| "loss": -0.007780712842941284, |
| "num_tokens": 1694841.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 610, |
| "step_time": 29.523135821998586 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.4, |
| "completions/max_terminated_length": 163.4, |
| "completions/mean_length": 92.0, |
| "completions/mean_terminated_length": 92.0, |
| "completions/min_length": 40.8, |
| "completions/min_terminated_length": 40.8, |
| "entropy": 0.11542486005928368, |
| "epoch": 307.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 2.25e-06, |
| "loss": 0.0066208459436893465, |
| "num_tokens": 1708473.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14990663677453994, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.1499066360294819, |
| "step": 615, |
| "step_time": 26.86916997299777 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 157.6, |
| "completions/max_terminated_length": 157.6, |
| "completions/mean_length": 100.575, |
| "completions/mean_terminated_length": 100.575, |
| "completions/min_length": 59.4, |
| "completions/min_terminated_length": 59.4, |
| "entropy": 0.14778990496415645, |
| "epoch": 310.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.226851851851852e-06, |
| "loss": 0.0, |
| "num_tokens": 1722644.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 620, |
| "step_time": 26.04490917779767 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 143.8, |
| "completions/max_terminated_length": 143.8, |
| "completions/mean_length": 80.75, |
| "completions/mean_terminated_length": 80.75, |
| "completions/min_length": 36.0, |
| "completions/min_terminated_length": 36.0, |
| "entropy": 0.1346577436546795, |
| "epoch": 312.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.203703703703704e-06, |
| "loss": 0.0028170838952064512, |
| "num_tokens": 1735778.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 625, |
| "step_time": 24.271923562001756 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 174.4, |
| "completions/max_terminated_length": 162.2, |
| "completions/mean_length": 112.3, |
| "completions/mean_terminated_length": 109.96428833007812, |
| "completions/min_length": 59.4, |
| "completions/min_terminated_length": 59.4, |
| "entropy": 0.12042991645867004, |
| "epoch": 315.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 2.180555555555556e-06, |
| "loss": 0.021805547177791595, |
| "num_tokens": 1750466.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14990663677453994, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.1499066360294819, |
| "step": 630, |
| "step_time": 28.148792170204253 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 202.0, |
| "completions/max_terminated_length": 189.4, |
| "completions/mean_length": 106.8, |
| "completions/mean_terminated_length": 103.18214416503906, |
| "completions/min_length": 27.2, |
| "completions/min_terminated_length": 27.2, |
| "entropy": 0.1451239718357101, |
| "epoch": 317.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.1574074074074073e-06, |
| "loss": 0.0, |
| "num_tokens": 1764906.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 635, |
| "step_time": 31.783189745399433 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 170.2, |
| "completions/max_terminated_length": 170.2, |
| "completions/mean_length": 89.025, |
| "completions/mean_terminated_length": 89.025, |
| "completions/min_length": 35.8, |
| "completions/min_terminated_length": 35.8, |
| "entropy": 0.13819038349902257, |
| "epoch": 320.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.201171875, |
| "learning_rate": 2.1342592592592596e-06, |
| "loss": 0.02045248746871948, |
| "num_tokens": 1778399.0, |
| "reward": 0.747000002861023, |
| "reward_std": 0.14990663677453994, |
| "rewards/reward_fn/mean": 0.747000002861023, |
| "rewards/reward_fn/std": 0.1499066360294819, |
| "step": 640, |
| "step_time": 27.795567151399155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 169.0, |
| "completions/max_terminated_length": 169.0, |
| "completions/mean_length": 91.0, |
| "completions/mean_terminated_length": 91.0, |
| "completions/min_length": 26.8, |
| "completions/min_terminated_length": 26.8, |
| "entropy": 0.1383639785577543, |
| "epoch": 322.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 2.1111111111111114e-06, |
| "loss": -2.16066837310791e-08, |
| "num_tokens": 1792187.0, |
| "reward": 0.793999993801117, |
| "reward_std": 0.016970565915107726, |
| "rewards/reward_fn/mean": 0.793999993801117, |
| "rewards/reward_fn/std": 0.01697056442499161, |
| "step": 645, |
| "step_time": 27.67300096500112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 196.4, |
| "completions/max_terminated_length": 176.2, |
| "completions/mean_length": 94.375, |
| "completions/mean_terminated_length": 86.31785888671875, |
| "completions/min_length": 26.8, |
| "completions/min_terminated_length": 26.8, |
| "entropy": 0.13860561683541164, |
| "epoch": 325.0, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.173828125, |
| "learning_rate": 2.087962962962963e-06, |
| "loss": 0.06564760804176331, |
| "num_tokens": 1805914.0, |
| "reward": 0.6539920151233674, |
| "reward_std": 0.31940706349123504, |
| "rewards/reward_fn/mean": 0.6539920151233674, |
| "rewards/reward_fn/std": 0.3194070724028279, |
| "step": 650, |
| "step_time": 31.155566099599675 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 188.2, |
| "completions/max_terminated_length": 188.2, |
| "completions/mean_length": 96.375, |
| "completions/mean_terminated_length": 96.375, |
| "completions/min_length": 45.4, |
| "completions/min_terminated_length": 45.4, |
| "entropy": 0.12026678196853027, |
| "epoch": 327.5, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.064814814814815e-06, |
| "loss": 0.0, |
| "num_tokens": 1819721.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 655, |
| "step_time": 29.93114675219913 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 181.0, |
| "completions/max_terminated_length": 181.0, |
| "completions/mean_length": 92.75, |
| "completions/mean_terminated_length": 92.75, |
| "completions/min_length": 17.6, |
| "completions/min_terminated_length": 17.6, |
| "entropy": 0.12484765433473513, |
| "epoch": 330.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.041666666666667e-06, |
| "loss": 0.0, |
| "num_tokens": 1833579.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 660, |
| "step_time": 29.045186105799804 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 176.6, |
| "completions/max_terminated_length": 176.6, |
| "completions/mean_length": 88.8, |
| "completions/mean_terminated_length": 88.8, |
| "completions/min_length": 19.8, |
| "completions/min_terminated_length": 19.8, |
| "entropy": 0.11349608366144821, |
| "epoch": 332.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 2.0185185185185187e-06, |
| "loss": 0.023330992460250853, |
| "num_tokens": 1847063.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 665, |
| "step_time": 28.709340109200276 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.2, |
| "completions/max_terminated_length": 173.2, |
| "completions/mean_length": 94.525, |
| "completions/mean_terminated_length": 94.525, |
| "completions/min_length": 29.4, |
| "completions/min_terminated_length": 29.4, |
| "entropy": 0.13530309102497995, |
| "epoch": 335.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 1.9953703703703705e-06, |
| "loss": 0.005181397497653961, |
| "num_tokens": 1861012.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 670, |
| "step_time": 28.0701173802001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 160.0, |
| "completions/max_terminated_length": 160.0, |
| "completions/mean_length": 98.75, |
| "completions/mean_terminated_length": 98.75, |
| "completions/min_length": 45.6, |
| "completions/min_terminated_length": 45.6, |
| "entropy": 0.13825307501247153, |
| "epoch": 337.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.9722222222222224e-06, |
| "loss": 0.007883030176162719, |
| "num_tokens": 1875110.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 675, |
| "step_time": 26.424618140001257 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.8, |
| "completions/max_terminated_length": 180.8, |
| "completions/mean_length": 97.3, |
| "completions/mean_terminated_length": 97.3, |
| "completions/min_length": 33.6, |
| "completions/min_terminated_length": 33.6, |
| "entropy": 0.12926889541558922, |
| "epoch": 340.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.949074074074074e-06, |
| "loss": -1.1175870895385742e-08, |
| "num_tokens": 1888954.0, |
| "reward": 0.7970000028610229, |
| "reward_std": 0.008485282957553863, |
| "rewards/reward_fn/mean": 0.7970000028610229, |
| "rewards/reward_fn/std": 0.008485282212495804, |
| "step": 680, |
| "step_time": 29.160137929796473 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 177.4, |
| "completions/max_terminated_length": 177.4, |
| "completions/mean_length": 98.375, |
| "completions/mean_terminated_length": 98.375, |
| "completions/min_length": 21.2, |
| "completions/min_terminated_length": 21.2, |
| "entropy": 0.12973124553682283, |
| "epoch": 342.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 1.925925925925926e-06, |
| "loss": 0.01764121949672699, |
| "num_tokens": 1903037.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 685, |
| "step_time": 28.66847429559639 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 189.4, |
| "completions/max_terminated_length": 189.4, |
| "completions/mean_length": 95.675, |
| "completions/mean_terminated_length": 95.675, |
| "completions/min_length": 39.2, |
| "completions/min_terminated_length": 39.2, |
| "entropy": 0.13125102190533652, |
| "epoch": 345.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 1.9027777777777778e-06, |
| "loss": 0.0, |
| "num_tokens": 1916816.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 690, |
| "step_time": 30.259811803402407 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 179.2, |
| "completions/max_terminated_length": 178.4, |
| "completions/mean_length": 103.3, |
| "completions/mean_terminated_length": 101.18571472167969, |
| "completions/min_length": 38.8, |
| "completions/min_terminated_length": 38.8, |
| "entropy": 0.13168567434186115, |
| "epoch": 347.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 1.8796296296296299e-06, |
| "loss": 0.038299480080604555, |
| "num_tokens": 1931116.0, |
| "reward": 0.7499920129776001, |
| "reward_std": 0.14144398546923184, |
| "rewards/reward_fn/mean": 0.7499920129776001, |
| "rewards/reward_fn/std": 0.14144398245989578, |
| "step": 695, |
| "step_time": 28.8232333001979 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 168.6, |
| "completions/max_terminated_length": 168.6, |
| "completions/mean_length": 95.725, |
| "completions/mean_terminated_length": 95.725, |
| "completions/min_length": 42.0, |
| "completions/min_terminated_length": 42.0, |
| "entropy": 0.11110330240335316, |
| "epoch": 350.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 1.8564814814814817e-06, |
| "loss": 0.019394376873970033, |
| "num_tokens": 1944877.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 700, |
| "step_time": 27.542852006795876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 186.4, |
| "completions/max_terminated_length": 186.4, |
| "completions/mean_length": 100.875, |
| "completions/mean_terminated_length": 100.875, |
| "completions/min_length": 36.4, |
| "completions/min_terminated_length": 36.4, |
| "entropy": 0.13480499390279874, |
| "epoch": 352.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.193359375, |
| "learning_rate": 1.8333333333333333e-06, |
| "loss": 0.006207095086574554, |
| "num_tokens": 1958864.0, |
| "reward": 0.6499920099973678, |
| "reward_std": 0.20704229450930142, |
| "rewards/reward_fn/mean": 0.6499920099973678, |
| "rewards/reward_fn/std": 0.20704231534182327, |
| "step": 705, |
| "step_time": 29.79549445579905 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 184.6, |
| "completions/max_terminated_length": 181.4, |
| "completions/mean_length": 100.15, |
| "completions/mean_terminated_length": 96.58928680419922, |
| "completions/min_length": 37.4, |
| "completions/min_terminated_length": 37.4, |
| "entropy": 0.14106163564138113, |
| "epoch": 355.0, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 1.8101851851851853e-06, |
| "loss": 0.0, |
| "num_tokens": 1973018.0, |
| "reward": 0.800000011920929, |
| "reward_std": 0.0, |
| "rewards/reward_fn/mean": 0.800000011920929, |
| "rewards/reward_fn/std": 0.0, |
| "step": 710, |
| "step_time": 29.740049313199414 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 137.0, |
| "completions/max_terminated_length": 137.0, |
| "completions/mean_length": 80.225, |
| "completions/mean_terminated_length": 80.225, |
| "completions/min_length": 44.2, |
| "completions/min_terminated_length": 44.2, |
| "entropy": 0.13479442811803893, |
| "epoch": 357.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.7870370370370371e-06, |
| "loss": -0.0044358067214488985, |
| "num_tokens": 1986131.0, |
| "reward": 0.7999920129776001, |
| "reward_std": 2.263165224576369e-05, |
| "rewards/reward_fn/mean": 0.7999920129776001, |
| "rewards/reward_fn/std": 2.2628642909694463e-05, |
| "step": 715, |
| "step_time": 23.464931381204224 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 210.4, |
| "completions/max_terminated_length": 195.4, |
| "completions/mean_length": 108.975, |
| "completions/mean_terminated_length": 104.57142944335938, |
| "completions/min_length": 36.4, |
| "completions/min_terminated_length": 36.4, |
| "entropy": 0.12921297890134156, |
| "epoch": 360.0, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 1.7638888888888892e-06, |
| "loss": 0.015706980228424074, |
| "num_tokens": 2000686.0, |
| "reward": 0.6500000119209289, |
| "reward_std": 0.3265853762626648, |
| "rewards/reward_fn/mean": 0.6500000119209289, |
| "rewards/reward_fn/std": 0.3265853762626648, |
| "step": 720, |
| "step_time": 33.07905763199815 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 213.2, |
| "completions/max_terminated_length": 197.6, |
| "completions/mean_length": 109.575, |
| "completions/mean_terminated_length": 105.03928680419922, |
| "completions/min_length": 38.4, |
| "completions/min_terminated_length": 38.4, |
| "entropy": 0.11884576582815498, |
| "epoch": 362.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.740740740740741e-06, |
| "loss": 0.04035586714744568, |
| "num_tokens": 2015265.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 725, |
| "step_time": 33.32936851200211 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 144.8, |
| "completions/max_terminated_length": 128.4, |
| "completions/mean_length": 87.825, |
| "completions/mean_terminated_length": 85.10714416503906, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 0.12527481613215058, |
| "epoch": 365.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.7175925925925926e-06, |
| "loss": 0.00535660944879055, |
| "num_tokens": 2028682.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 730, |
| "step_time": 24.470471370794986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 194.6, |
| "completions/max_terminated_length": 172.8, |
| "completions/mean_length": 94.275, |
| "completions/mean_terminated_length": 89.62142944335938, |
| "completions/min_length": 21.2, |
| "completions/min_terminated_length": 21.2, |
| "entropy": 0.1287142861634493, |
| "epoch": 367.5, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.6944444444444446e-06, |
| "loss": 0.04657252728939056, |
| "num_tokens": 2042601.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 735, |
| "step_time": 31.15084702780441 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 188.2, |
| "completions/max_terminated_length": 179.4, |
| "completions/mean_length": 93.5, |
| "completions/mean_terminated_length": 89.78214416503906, |
| "completions/min_length": 26.0, |
| "completions/min_terminated_length": 26.0, |
| "entropy": 0.12946587130427362, |
| "epoch": 370.0, |
| "frac_reward_zero_std": 0.7, |
| "grad_norm": 0.0, |
| "learning_rate": 1.6712962962962965e-06, |
| "loss": 0.036357957124710086, |
| "num_tokens": 2056293.0, |
| "reward": 0.6999920129776, |
| "reward_std": 0.28286533928621793, |
| "rewards/reward_fn/mean": 0.6999920129776, |
| "rewards/reward_fn/std": 0.2828653362768819, |
| "step": 740, |
| "step_time": 30.288723102195945 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 184.0, |
| "completions/max_terminated_length": 184.0, |
| "completions/mean_length": 96.55, |
| "completions/mean_terminated_length": 96.55, |
| "completions/min_length": 32.8, |
| "completions/min_terminated_length": 32.8, |
| "entropy": 0.14114207804668694, |
| "epoch": 372.5, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.318359375, |
| "learning_rate": 1.648148148148148e-06, |
| "loss": 0.04727624654769898, |
| "num_tokens": 2070107.0, |
| "reward": 0.700000011920929, |
| "reward_std": 0.2828427076339722, |
| "rewards/reward_fn/mean": 0.700000011920929, |
| "rewards/reward_fn/std": 0.2828427076339722, |
| "step": 745, |
| "step_time": 29.68092410319514 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 196.2, |
| "completions/max_terminated_length": 186.8, |
| "completions/mean_length": 102.175, |
| "completions/mean_terminated_length": 97.74285736083985, |
| "completions/min_length": 29.2, |
| "completions/min_terminated_length": 29.2, |
| "entropy": 0.14779772673500702, |
| "epoch": 375.0, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 1.6250000000000001e-06, |
| "loss": 0.03531245589256286, |
| "num_tokens": 2084342.0, |
| "reward": 0.7500000119209289, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_fn/mean": 0.7500000119209289, |
| "rewards/reward_fn/std": 0.1414213538169861, |
| "step": 750, |
| "step_time": 31.207076488804887 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 600, |
| "num_input_tokens_seen": 2086592, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|