Spaces:
Sleeping
Sleeping
| [ | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5e-06, | |
| "num_tokens": 3013.0, | |
| "completions/mean_length": 137.25, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 182.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.25, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 182.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.2721036672592163, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.99397590800072, | |
| "epoch": 0.0016666666666666668, | |
| "step": 1 | |
| }, | |
| { | |
| "loss": -0.04769711568951607, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 4.975000000000001e-06, | |
| "num_tokens": 5917.0, | |
| "completions/mean_length": 114.0, | |
| "completions/min_length": 98.0, | |
| "completions/max_length": 125.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 114.0, | |
| "completions/min_terminated_length": 98.0, | |
| "completions/max_terminated_length": 125.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.901360273361206, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.053884660999756, | |
| "epoch": 0.0033333333333333335, | |
| "step": 2 | |
| }, | |
| { | |
| "loss": -0.011300479993224144, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.95e-06, | |
| "num_tokens": 8552.0, | |
| "completions/mean_length": 143.75, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 197.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 143.75, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 197.0, | |
| "rewards/reward_fn/mean": 0.4793750047683716, | |
| "rewards/reward_fn/std": 0.288750022649765, | |
| "reward": 0.4793750047683716, | |
| "reward_std": 0.28874996304512024, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1808185577392578, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.53829234299701, | |
| "epoch": 0.005, | |
| "step": 3 | |
| }, | |
| { | |
| "loss": 0.07140593230724335, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.925e-06, | |
| "num_tokens": 11250.0, | |
| "completions/mean_length": 115.5, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 132.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 115.5, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 132.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0730382204055786, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.0260389340001, | |
| "epoch": 0.006666666666666667, | |
| "step": 4 | |
| }, | |
| { | |
| "loss": 0.005974027793854475, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 4.9000000000000005e-06, | |
| "num_tokens": 14256.0, | |
| "completions/mean_length": 125.5, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.5, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.48374998569488525, | |
| "rewards/reward_fn/std": 0.29750001430511475, | |
| "reward": 0.48374998569488525, | |
| "reward_std": 0.29750001430511475, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.044655680656433, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.637786576000508, | |
| "epoch": 0.008333333333333333, | |
| "step": 5 | |
| }, | |
| { | |
| "loss": -0.018765343353152275, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.875e-06, | |
| "num_tokens": 17146.0, | |
| "completions/mean_length": 146.5, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 175.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.5, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 175.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.157387137413025, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.646922176998487, | |
| "epoch": 0.01, | |
| "step": 6 | |
| }, | |
| { | |
| "loss": 0.04119409993290901, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.85e-06, | |
| "num_tokens": 20239.0, | |
| "completions/mean_length": 167.25, | |
| "completions/min_length": 135.0, | |
| "completions/max_length": 185.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 167.25, | |
| "completions/min_terminated_length": 135.0, | |
| "completions/max_terminated_length": 185.0, | |
| "rewards/reward_fn/mean": 0.6412500143051147, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6412500143051147, | |
| "reward_std": 0.35391560196876526, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1936835050582886, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.48416990199985, | |
| "epoch": 0.011666666666666667, | |
| "step": 7 | |
| }, | |
| { | |
| "loss": 0.08870118111371994, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 4.825e-06, | |
| "num_tokens": 22611.0, | |
| "completions/mean_length": 168.0, | |
| "completions/min_length": 136.0, | |
| "completions/max_length": 221.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 168.0, | |
| "completions/min_terminated_length": 136.0, | |
| "completions/max_terminated_length": 221.0, | |
| "rewards/reward_fn/mean": 0.500374972820282, | |
| "rewards/reward_fn/std": 0.31010571122169495, | |
| "reward": 0.500374972820282, | |
| "reward_std": 0.31010571122169495, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0776662826538086, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.74235246600074, | |
| "epoch": 0.013333333333333334, | |
| "step": 8 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.800000000000001e-06, | |
| "num_tokens": 25760.0, | |
| "completions/mean_length": 159.25, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 212.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 159.25, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 212.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1662333011627197, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.728966909002338, | |
| "epoch": 0.015, | |
| "step": 9 | |
| }, | |
| { | |
| "loss": 0.1400359570980072, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.775e-06, | |
| "num_tokens": 28494.0, | |
| "completions/mean_length": 121.5, | |
| "completions/min_length": 100.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.5, | |
| "completions/min_terminated_length": 100.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.7856249809265137, | |
| "rewards/reward_fn/std": 0.3052483797073364, | |
| "reward": 0.7856249809265137, | |
| "reward_std": 0.3052483797073364, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0164639949798584, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.630102548999275, | |
| "epoch": 0.016666666666666666, | |
| "step": 10 | |
| }, | |
| { | |
| "loss": 0.04951930046081543, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.75e-06, | |
| "num_tokens": 31562.0, | |
| "completions/mean_length": 156.0, | |
| "completions/min_length": 127.0, | |
| "completions/max_length": 183.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 156.0, | |
| "completions/min_terminated_length": 127.0, | |
| "completions/max_terminated_length": 183.0, | |
| "rewards/reward_fn/mean": 0.8075000047683716, | |
| "rewards/reward_fn/std": 0.2921329438686371, | |
| "reward": 0.8075000047683716, | |
| "reward_std": 0.2921329736709595, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2048428058624268, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.393185649998486, | |
| "epoch": 0.018333333333333333, | |
| "step": 11 | |
| }, | |
| { | |
| "loss": -0.14611932635307312, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.7250000000000005e-06, | |
| "num_tokens": 34288.0, | |
| "completions/mean_length": 116.5, | |
| "completions/min_length": 82.0, | |
| "completions/max_length": 163.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 116.5, | |
| "completions/min_terminated_length": 82.0, | |
| "completions/max_terminated_length": 163.0, | |
| "rewards/reward_fn/mean": 0.33125001192092896, | |
| "rewards/reward_fn/std": 0.007500007748603821, | |
| "reward": 0.33125001192092896, | |
| "reward_std": 0.007500012870877981, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0292104482650757, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.381137749998743, | |
| "epoch": 0.02, | |
| "step": 12 | |
| }, | |
| { | |
| "loss": 0.04130696505308151, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.7e-06, | |
| "num_tokens": 37036.0, | |
| "completions/mean_length": 146.0, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 157.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.0, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 157.0, | |
| "rewards/reward_fn/mean": 0.6412500143051147, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6412500143051147, | |
| "reward_std": 0.35391560196876526, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2967694997787476, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.01959514600094, | |
| "epoch": 0.021666666666666667, | |
| "step": 13 | |
| }, | |
| { | |
| "loss": 0.03943007439374924, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 4.675000000000001e-06, | |
| "num_tokens": 40060.0, | |
| "completions/mean_length": 163.0, | |
| "completions/min_length": 150.0, | |
| "completions/max_length": 181.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 163.0, | |
| "completions/min_terminated_length": 150.0, | |
| "completions/max_terminated_length": 181.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2760556936264038, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.05132091699852, | |
| "epoch": 0.023333333333333334, | |
| "step": 14 | |
| }, | |
| { | |
| "loss": -0.024401623755693436, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 4.65e-06, | |
| "num_tokens": 42856.0, | |
| "completions/mean_length": 154.0, | |
| "completions/min_length": 145.0, | |
| "completions/max_length": 163.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 154.0, | |
| "completions/min_terminated_length": 145.0, | |
| "completions/max_terminated_length": 163.0, | |
| "rewards/reward_fn/mean": 0.9606249928474426, | |
| "rewards/reward_fn/std": 0.02202034927904606, | |
| "reward": 0.9606249928474426, | |
| "reward_std": 0.02202034927904606, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0788471698760986, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.084032504997595, | |
| "epoch": 0.025, | |
| "step": 15 | |
| }, | |
| { | |
| "loss": 0.10702173411846161, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 4.625000000000001e-06, | |
| "num_tokens": 46438.0, | |
| "completions/mean_length": 193.5, | |
| "completions/min_length": 159.0, | |
| "completions/max_length": 257.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 193.5, | |
| "completions/min_terminated_length": 159.0, | |
| "completions/max_terminated_length": 257.0, | |
| "rewards/reward_fn/mean": 0.6493749618530273, | |
| "rewards/reward_fn/std": 0.35504916310310364, | |
| "reward": 0.6493749618530273, | |
| "reward_std": 0.355049192905426, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.4748042821884155, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 37.7986828969988, | |
| "epoch": 0.02666666666666667, | |
| "step": 16 | |
| }, | |
| { | |
| "loss": 0.04349641874432564, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 4.600000000000001e-06, | |
| "num_tokens": 48760.0, | |
| "completions/mean_length": 143.5, | |
| "completions/min_length": 117.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 143.5, | |
| "completions/min_terminated_length": 117.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.9275000095367432, | |
| "rewards/reward_fn/std": 0.07499998807907104, | |
| "reward": 0.9275000095367432, | |
| "reward_std": 0.07499998807907104, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.177292823791504, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.96707353400052, | |
| "epoch": 0.028333333333333332, | |
| "step": 17 | |
| }, | |
| { | |
| "loss": -0.0374990850687027, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 4.575e-06, | |
| "num_tokens": 51631.0, | |
| "completions/mean_length": 108.75, | |
| "completions/min_length": 93.0, | |
| "completions/max_length": 122.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 108.75, | |
| "completions/min_terminated_length": 93.0, | |
| "completions/max_terminated_length": 122.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9841785430908203, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 18.896498049998627, | |
| "epoch": 0.03, | |
| "step": 18 | |
| }, | |
| { | |
| "loss": -0.02571682445704937, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.5500000000000005e-06, | |
| "num_tokens": 54431.0, | |
| "completions/mean_length": 116.0, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 125.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 116.0, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 125.0, | |
| "rewards/reward_fn/mean": 0.9562499523162842, | |
| "rewards/reward_fn/std": 0.017499983310699463, | |
| "reward": 0.9562499523162842, | |
| "reward_std": 0.017499983310699463, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1009763479232788, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.452795449000405, | |
| "epoch": 0.03166666666666667, | |
| "step": 19 | |
| }, | |
| { | |
| "loss": 0.1016233041882515, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.525000000000001e-06, | |
| "num_tokens": 57738.0, | |
| "completions/mean_length": 140.75, | |
| "completions/min_length": 85.0, | |
| "completions/max_length": 187.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 140.75, | |
| "completions/min_terminated_length": 85.0, | |
| "completions/max_terminated_length": 187.0, | |
| "rewards/reward_fn/mean": 0.953249990940094, | |
| "rewards/reward_fn/std": 0.01649998500943184, | |
| "reward": 0.953249990940094, | |
| "reward_std": 0.01649998500943184, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.152324914932251, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.531233167999744, | |
| "epoch": 0.03333333333333333, | |
| "step": 20 | |
| }, | |
| { | |
| "loss": 0.05338107421994209, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 4.5e-06, | |
| "num_tokens": 60750.0, | |
| "completions/mean_length": 121.0, | |
| "completions/min_length": 100.0, | |
| "completions/max_length": 144.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.0, | |
| "completions/min_terminated_length": 100.0, | |
| "completions/max_terminated_length": 144.0, | |
| "rewards/reward_fn/mean": 0.9728749990463257, | |
| "rewards/reward_fn/std": 0.015750020742416382, | |
| "reward": 0.9728749990463257, | |
| "reward_std": 0.015750011429190636, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0693097114562988, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.120251809003094, | |
| "epoch": 0.035, | |
| "step": 21 | |
| }, | |
| { | |
| "loss": -0.032417263835668564, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 4.475e-06, | |
| "num_tokens": 63754.0, | |
| "completions/mean_length": 122.0, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 146.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.0, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 146.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0988726615905762, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.679130085998622, | |
| "epoch": 0.03666666666666667, | |
| "step": 22 | |
| }, | |
| { | |
| "loss": -0.07120607793331146, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.450000000000001e-06, | |
| "num_tokens": 66768.0, | |
| "completions/mean_length": 121.5, | |
| "completions/min_length": 91.0, | |
| "completions/max_length": 153.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.5, | |
| "completions/min_terminated_length": 91.0, | |
| "completions/max_terminated_length": 153.0, | |
| "rewards/reward_fn/mean": 0.9781249761581421, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9781249761581421, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1439276933670044, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.13522673600164, | |
| "epoch": 0.03833333333333333, | |
| "step": 23 | |
| }, | |
| { | |
| "loss": 0.04352416470646858, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 4.425e-06, | |
| "num_tokens": 69557.0, | |
| "completions/mean_length": 120.25, | |
| "completions/min_length": 102.0, | |
| "completions/max_length": 136.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 120.25, | |
| "completions/min_terminated_length": 102.0, | |
| "completions/max_terminated_length": 136.0, | |
| "rewards/reward_fn/mean": 0.7987499833106995, | |
| "rewards/reward_fn/std": 0.3096065819263458, | |
| "reward": 0.7987499833106995, | |
| "reward_std": 0.3096066117286682, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2446463108062744, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.66519093700117, | |
| "epoch": 0.04, | |
| "step": 24 | |
| }, | |
| { | |
| "loss": -0.06304893642663956, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.4e-06, | |
| "num_tokens": 72543.0, | |
| "completions/mean_length": 139.5, | |
| "completions/min_length": 111.0, | |
| "completions/max_length": 153.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.5, | |
| "completions/min_terminated_length": 111.0, | |
| "completions/max_terminated_length": 153.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.3441172242164612, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.3441172242164612, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.232834815979004, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.765788336000696, | |
| "epoch": 0.041666666666666664, | |
| "step": 25 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "num_tokens": 74829.0, | |
| "completions/mean_length": 138.5, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 161.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.5, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 161.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.2499821186065674, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.262686510002823, | |
| "epoch": 0.043333333333333335, | |
| "step": 26 | |
| }, | |
| { | |
| "loss": -0.062136210501194, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 4.350000000000001e-06, | |
| "num_tokens": 78028.0, | |
| "completions/mean_length": 134.75, | |
| "completions/min_length": 118.0, | |
| "completions/max_length": 167.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 134.75, | |
| "completions/min_terminated_length": 118.0, | |
| "completions/max_terminated_length": 167.0, | |
| "rewards/reward_fn/mean": 0.7699999809265137, | |
| "rewards/reward_fn/std": 0.38999998569488525, | |
| "reward": 0.7699999809265137, | |
| "reward_std": 0.38999998569488525, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2326076030731201, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.535409896998317, | |
| "epoch": 0.045, | |
| "step": 27 | |
| }, | |
| { | |
| "loss": 0.10957296937704086, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.325e-06, | |
| "num_tokens": 80670.0, | |
| "completions/mean_length": 106.5, | |
| "completions/min_length": 83.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 106.5, | |
| "completions/min_terminated_length": 83.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.9722499847412109, | |
| "rewards/reward_fn/std": 0.014500021934509277, | |
| "reward": 0.9722499847412109, | |
| "reward_std": 0.014500022865831852, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0672565698623657, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.534873515000072, | |
| "epoch": 0.04666666666666667, | |
| "step": 28 | |
| }, | |
| { | |
| "loss": -0.1097787395119667, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.3e-06, | |
| "num_tokens": 83466.0, | |
| "completions/mean_length": 138.0, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 206.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.0, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 206.0, | |
| "rewards/reward_fn/mean": 0.8206249475479126, | |
| "rewards/reward_fn/std": 0.3005298972129822, | |
| "reward": 0.8206249475479126, | |
| "reward_std": 0.3005298972129822, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.063458800315857, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.418785993999336, | |
| "epoch": 0.04833333333333333, | |
| "step": 29 | |
| }, | |
| { | |
| "loss": 0.020558878779411316, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 4.2750000000000006e-06, | |
| "num_tokens": 86306.0, | |
| "completions/mean_length": 146.0, | |
| "completions/min_length": 127.0, | |
| "completions/max_length": 170.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.0, | |
| "completions/min_terminated_length": 127.0, | |
| "completions/max_terminated_length": 170.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.010103637352585793, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.010103637352585793, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1856608390808105, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.58633564399861, | |
| "epoch": 0.05, | |
| "step": 30 | |
| }, | |
| { | |
| "loss": -0.1461215764284134, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.25e-06, | |
| "num_tokens": 89374.0, | |
| "completions/mean_length": 134.0, | |
| "completions/min_length": 86.0, | |
| "completions/max_length": 179.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 134.0, | |
| "completions/min_terminated_length": 86.0, | |
| "completions/max_terminated_length": 179.0, | |
| "rewards/reward_fn/mean": 0.33937501907348633, | |
| "rewards/reward_fn/std": 0.022020353004336357, | |
| "reward": 0.33937501907348633, | |
| "reward_std": 0.022020353004336357, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0142335891723633, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.99719125000047, | |
| "epoch": 0.051666666666666666, | |
| "step": 31 | |
| }, | |
| { | |
| "loss": -0.09904350340366364, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 4.225e-06, | |
| "num_tokens": 92234.0, | |
| "completions/mean_length": 139.0, | |
| "completions/min_length": 92.0, | |
| "completions/max_length": 170.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.0, | |
| "completions/min_terminated_length": 92.0, | |
| "completions/max_terminated_length": 170.0, | |
| "rewards/reward_fn/mean": 0.6587499976158142, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6587499976158142, | |
| "reward_std": 0.35391557216644287, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3681882619857788, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.45712414800073, | |
| "epoch": 0.05333333333333334, | |
| "step": 32 | |
| }, | |
| { | |
| "loss": -0.13209989666938782, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "num_tokens": 94781.0, | |
| "completions/mean_length": 110.75, | |
| "completions/min_length": 65.0, | |
| "completions/max_length": 169.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 110.75, | |
| "completions/min_terminated_length": 65.0, | |
| "completions/max_terminated_length": 169.0, | |
| "rewards/reward_fn/mean": 0.37325000762939453, | |
| "rewards/reward_fn/std": 0.04913502186536789, | |
| "reward": 0.37325000762939453, | |
| "reward_std": 0.04913502559065819, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.112715721130371, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.924523759000294, | |
| "epoch": 0.055, | |
| "step": 33 | |
| }, | |
| { | |
| "loss": -0.0015746492426842451, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 4.175e-06, | |
| "num_tokens": 97417.0, | |
| "completions/mean_length": 169.0, | |
| "completions/min_length": 149.0, | |
| "completions/max_length": 196.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 169.0, | |
| "completions/min_terminated_length": 149.0, | |
| "completions/max_terminated_length": 196.0, | |
| "rewards/reward_fn/mean": 0.5913749933242798, | |
| "rewards/reward_fn/std": 0.31556177139282227, | |
| "reward": 0.5913749933242798, | |
| "reward_std": 0.31556180119514465, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3067529201507568, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.42368617000102, | |
| "epoch": 0.056666666666666664, | |
| "step": 34 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.15e-06, | |
| "num_tokens": 100201.0, | |
| "completions/mean_length": 128.0, | |
| "completions/min_length": 93.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 128.0, | |
| "completions/min_terminated_length": 93.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.064958930015564, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.600620116001664, | |
| "epoch": 0.058333333333333334, | |
| "step": 35 | |
| }, | |
| { | |
| "loss": -0.03460807725787163, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.125e-06, | |
| "num_tokens": 102946.0, | |
| "completions/mean_length": 115.25, | |
| "completions/min_length": 100.0, | |
| "completions/max_length": 133.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 115.25, | |
| "completions/min_terminated_length": 100.0, | |
| "completions/max_terminated_length": 133.0, | |
| "rewards/reward_fn/mean": 0.6850000023841858, | |
| "rewards/reward_fn/std": 0.32457664608955383, | |
| "reward": 0.6850000023841858, | |
| "reward_std": 0.32457661628723145, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1022453308105469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.86391903600088, | |
| "epoch": 0.06, | |
| "step": 36 | |
| }, | |
| { | |
| "loss": 0.0848444476723671, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 4.1e-06, | |
| "num_tokens": 105380.0, | |
| "completions/mean_length": 152.5, | |
| "completions/min_length": 125.0, | |
| "completions/max_length": 181.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 152.5, | |
| "completions/min_terminated_length": 125.0, | |
| "completions/max_terminated_length": 181.0, | |
| "rewards/reward_fn/mean": 0.6150000095367432, | |
| "rewards/reward_fn/std": 0.3441172242164612, | |
| "reward": 0.6150000095367432, | |
| "reward_std": 0.34411725401878357, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1503243446350098, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.59769222900286, | |
| "epoch": 0.06166666666666667, | |
| "step": 37 | |
| }, | |
| { | |
| "loss": 0.016566012054681778, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 4.075e-06, | |
| "num_tokens": 108438.0, | |
| "completions/mean_length": 113.5, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 116.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 113.5, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 116.0, | |
| "rewards/reward_fn/mean": 0.9772499799728394, | |
| "rewards/reward_fn/std": 0.015256169252097607, | |
| "reward": 0.9772499799728394, | |
| "reward_std": 0.015256185084581375, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0994595289230347, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 17.853799561002234, | |
| "epoch": 0.06333333333333334, | |
| "step": 38 | |
| }, | |
| { | |
| "loss": -0.16876700520515442, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.05e-06, | |
| "num_tokens": 111567.0, | |
| "completions/mean_length": 139.25, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 187.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.25, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 187.0, | |
| "rewards/reward_fn/mean": 0.48375001549720764, | |
| "rewards/reward_fn/std": 0.3209393620491028, | |
| "reward": 0.48375001549720764, | |
| "reward_std": 0.3209393620491028, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2051821947097778, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.307055137000134, | |
| "epoch": 0.065, | |
| "step": 39 | |
| }, | |
| { | |
| "loss": 0.09384524822235107, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.0250000000000004e-06, | |
| "num_tokens": 114485.0, | |
| "completions/mean_length": 132.5, | |
| "completions/min_length": 105.0, | |
| "completions/max_length": 174.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.5, | |
| "completions/min_terminated_length": 105.0, | |
| "completions/max_terminated_length": 174.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.010103637352585793, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.010103637352585793, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0574369430541992, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.46330049500102, | |
| "epoch": 0.06666666666666667, | |
| "step": 40 | |
| }, | |
| { | |
| "loss": -0.001768031739629805, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 4.000000000000001e-06, | |
| "num_tokens": 117383.0, | |
| "completions/mean_length": 127.5, | |
| "completions/min_length": 119.0, | |
| "completions/max_length": 138.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.5, | |
| "completions/min_terminated_length": 119.0, | |
| "completions/max_terminated_length": 138.0, | |
| "rewards/reward_fn/mean": 0.29750001430511475, | |
| "rewards/reward_fn/std": 0.09970790892839432, | |
| "reward": 0.29750001430511475, | |
| "reward_std": 0.09970790892839432, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.045518159866333, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.58559491000051, | |
| "epoch": 0.06833333333333333, | |
| "step": 41 | |
| }, | |
| { | |
| "loss": -0.0037579911295324564, | |
| "grad_norm": 0.375, | |
| "learning_rate": 3.975000000000001e-06, | |
| "num_tokens": 120630.0, | |
| "completions/mean_length": 136.75, | |
| "completions/min_length": 110.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 136.75, | |
| "completions/min_terminated_length": 110.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.33937501907348633, | |
| "rewards/reward_fn/std": 0.036077164113521576, | |
| "reward": 0.33937501907348633, | |
| "reward_std": 0.036077164113521576, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2528449296951294, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.60465663499781, | |
| "epoch": 0.07, | |
| "step": 42 | |
| }, | |
| { | |
| "loss": -0.010047557763755322, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 3.95e-06, | |
| "num_tokens": 123572.0, | |
| "completions/mean_length": 123.5, | |
| "completions/min_length": 96.0, | |
| "completions/max_length": 149.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 123.5, | |
| "completions/min_terminated_length": 96.0, | |
| "completions/max_terminated_length": 149.0, | |
| "rewards/reward_fn/mean": 0.972000002861023, | |
| "rewards/reward_fn/std": 0.013999998569488525, | |
| "reward": 0.972000002861023, | |
| "reward_std": 0.013999998569488525, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0263532400131226, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.8416585140003, | |
| "epoch": 0.07166666666666667, | |
| "step": 43 | |
| }, | |
| { | |
| "loss": -0.07597894221544266, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 3.9250000000000005e-06, | |
| "num_tokens": 126769.0, | |
| "completions/mean_length": 158.25, | |
| "completions/min_length": 139.0, | |
| "completions/max_length": 179.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 158.25, | |
| "completions/min_terminated_length": 139.0, | |
| "completions/max_terminated_length": 179.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.3441172242164612, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.3441172242164612, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.4695942401885986, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.284113427002012, | |
| "epoch": 0.07333333333333333, | |
| "step": 44 | |
| }, | |
| { | |
| "loss": 0.04559139907360077, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 3.900000000000001e-06, | |
| "num_tokens": 129512.0, | |
| "completions/mean_length": 132.75, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 176.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.75, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 176.0, | |
| "rewards/reward_fn/mean": 0.6675000190734863, | |
| "rewards/reward_fn/std": 0.34470999240875244, | |
| "reward": 0.6675000190734863, | |
| "reward_std": 0.34470999240875244, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3445155620574951, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.141577066002355, | |
| "epoch": 0.075, | |
| "step": 45 | |
| }, | |
| { | |
| "loss": -0.021484289318323135, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 3.875e-06, | |
| "num_tokens": 132537.0, | |
| "completions/mean_length": 125.25, | |
| "completions/min_length": 115.0, | |
| "completions/max_length": 131.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.25, | |
| "completions/min_terminated_length": 115.0, | |
| "completions/max_terminated_length": 131.0, | |
| "rewards/reward_fn/mean": 0.49162501096725464, | |
| "rewards/reward_fn/std": 0.3345796763896942, | |
| "reward": 0.49162501096725464, | |
| "reward_std": 0.3345796763896942, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.098323941230774, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.43934466399878, | |
| "epoch": 0.07666666666666666, | |
| "step": 46 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.85e-06, | |
| "num_tokens": 135108.0, | |
| "completions/mean_length": 127.75, | |
| "completions/min_length": 116.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.75, | |
| "completions/min_terminated_length": 116.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1513502597808838, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.161369523997564, | |
| "epoch": 0.07833333333333334, | |
| "step": 47 | |
| }, | |
| { | |
| "loss": -0.03219185769557953, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 3.825000000000001e-06, | |
| "num_tokens": 137804.0, | |
| "completions/mean_length": 139.0, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 150.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.0, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 150.0, | |
| "rewards/reward_fn/mean": 0.9562499523162842, | |
| "rewards/reward_fn/std": 0.017499983310699463, | |
| "reward": 0.9562499523162842, | |
| "reward_std": 0.017499983310699463, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0865966081619263, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.38693464699827, | |
| "epoch": 0.08, | |
| "step": 48 | |
| }, | |
| { | |
| "loss": 0.026122601702809334, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "num_tokens": 140483.0, | |
| "completions/mean_length": 116.75, | |
| "completions/min_length": 104.0, | |
| "completions/max_length": 144.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 116.75, | |
| "completions/min_terminated_length": 104.0, | |
| "completions/max_terminated_length": 144.0, | |
| "rewards/reward_fn/mean": 0.9518749713897705, | |
| "rewards/reward_fn/std": 0.016754958778619766, | |
| "reward": 0.9518749713897705, | |
| "reward_std": 0.01675495319068432, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.214436650276184, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.844522692001192, | |
| "epoch": 0.08166666666666667, | |
| "step": 49 | |
| }, | |
| { | |
| "loss": -0.003206648863852024, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 3.7750000000000003e-06, | |
| "num_tokens": 143371.0, | |
| "completions/mean_length": 135.0, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 162.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.0, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 162.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.36373066902160645, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.36373066902160645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2262229919433594, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.662258178999764, | |
| "epoch": 0.08333333333333333, | |
| "step": 50 | |
| }, | |
| { | |
| "loss": -0.10688165575265884, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "num_tokens": 146097.0, | |
| "completions/mean_length": 127.5, | |
| "completions/min_length": 102.0, | |
| "completions/max_length": 165.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.5, | |
| "completions/min_terminated_length": 102.0, | |
| "completions/max_terminated_length": 165.0, | |
| "rewards/reward_fn/mean": 0.9781249761581421, | |
| "rewards/reward_fn/std": 0.016754990443587303, | |
| "reward": 0.9781249761581421, | |
| "reward_std": 0.016754984855651855, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0940418243408203, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.447743856999296, | |
| "epoch": 0.085, | |
| "step": 51 | |
| }, | |
| { | |
| "loss": -0.0370216965675354, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 3.7250000000000003e-06, | |
| "num_tokens": 149049.0, | |
| "completions/mean_length": 148.0, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 178.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 148.0, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 178.0, | |
| "rewards/reward_fn/mean": 0.9518749713897705, | |
| "rewards/reward_fn/std": 0.026249976828694344, | |
| "reward": 0.9518749713897705, | |
| "reward_std": 0.026249965652823448, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1700999736785889, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.492212680001103, | |
| "epoch": 0.08666666666666667, | |
| "step": 52 | |
| }, | |
| { | |
| "loss": -0.04324590787291527, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 3.7e-06, | |
| "num_tokens": 151888.0, | |
| "completions/mean_length": 145.75, | |
| "completions/min_length": 123.0, | |
| "completions/max_length": 179.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 145.75, | |
| "completions/min_terminated_length": 123.0, | |
| "completions/max_terminated_length": 179.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0578383207321167, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.41010283899959, | |
| "epoch": 0.08833333333333333, | |
| "step": 53 | |
| }, | |
| { | |
| "loss": 0.05730345472693443, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 3.6750000000000004e-06, | |
| "num_tokens": 154483.0, | |
| "completions/mean_length": 131.75, | |
| "completions/min_length": 117.0, | |
| "completions/max_length": 147.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 131.75, | |
| "completions/min_terminated_length": 117.0, | |
| "completions/max_terminated_length": 147.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.3441172242164612, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.3441172242164612, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2011436223983765, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.763525555998058, | |
| "epoch": 0.09, | |
| "step": 54 | |
| }, | |
| { | |
| "loss": -0.031099891290068626, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 3.65e-06, | |
| "num_tokens": 157822.0, | |
| "completions/mean_length": 127.75, | |
| "completions/min_length": 115.0, | |
| "completions/max_length": 136.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.75, | |
| "completions/min_terminated_length": 115.0, | |
| "completions/max_terminated_length": 136.0, | |
| "rewards/reward_fn/mean": 0.6062500476837158, | |
| "rewards/reward_fn/std": 0.29886940121650696, | |
| "reward": 0.6062500476837158, | |
| "reward_std": 0.29886937141418457, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1653622388839722, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.180196135002916, | |
| "epoch": 0.09166666666666666, | |
| "step": 55 | |
| }, | |
| { | |
| "loss": 0.044428952038288116, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 3.625e-06, | |
| "num_tokens": 160477.0, | |
| "completions/mean_length": 110.75, | |
| "completions/min_length": 94.0, | |
| "completions/max_length": 125.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 110.75, | |
| "completions/min_terminated_length": 94.0, | |
| "completions/max_terminated_length": 125.0, | |
| "rewards/reward_fn/mean": 0.3193749785423279, | |
| "rewards/reward_fn/std": 0.01641835644841194, | |
| "reward": 0.3193749785423279, | |
| "reward_std": 0.01641835831105709, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9973193407058716, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.055930811002327, | |
| "epoch": 0.09333333333333334, | |
| "step": 56 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.6000000000000003e-06, | |
| "num_tokens": 163015.0, | |
| "completions/mean_length": 117.5, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 137.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 117.5, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 137.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.0432900190353394, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.755967883000267, | |
| "epoch": 0.095, | |
| "step": 57 | |
| }, | |
| { | |
| "loss": 0.08609388768672943, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 3.575e-06, | |
| "num_tokens": 165757.0, | |
| "completions/mean_length": 127.5, | |
| "completions/min_length": 108.0, | |
| "completions/max_length": 149.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.5, | |
| "completions/min_terminated_length": 108.0, | |
| "completions/max_terminated_length": 149.0, | |
| "rewards/reward_fn/mean": 0.33937501907348633, | |
| "rewards/reward_fn/std": 0.022020353004336357, | |
| "reward": 0.33937501907348633, | |
| "reward_std": 0.022020353004336357, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0790032148361206, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.144303656001284, | |
| "epoch": 0.09666666666666666, | |
| "step": 58 | |
| }, | |
| { | |
| "loss": 0.07765334099531174, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 3.5500000000000003e-06, | |
| "num_tokens": 168967.0, | |
| "completions/mean_length": 155.5, | |
| "completions/min_length": 119.0, | |
| "completions/max_length": 201.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 155.5, | |
| "completions/min_terminated_length": 119.0, | |
| "completions/max_terminated_length": 201.0, | |
| "rewards/reward_fn/mean": 0.7524999976158142, | |
| "rewards/reward_fn/std": 0.2626309096813202, | |
| "reward": 0.7524999976158142, | |
| "reward_std": 0.2626309096813202, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3297035694122314, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.628764026998397, | |
| "epoch": 0.09833333333333333, | |
| "step": 59 | |
| }, | |
| { | |
| "loss": -0.07361559569835663, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 3.525e-06, | |
| "num_tokens": 172254.0, | |
| "completions/mean_length": 181.75, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 262.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 181.75, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 262.0, | |
| "rewards/reward_fn/mean": 0.953374981880188, | |
| "rewards/reward_fn/std": 0.01649935357272625, | |
| "reward": 0.953374981880188, | |
| "reward_std": 0.01649935357272625, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1880607604980469, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 37.99089504899894, | |
| "epoch": 0.1, | |
| "step": 60 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.5e-06, | |
| "num_tokens": 175288.0, | |
| "completions/mean_length": 125.5, | |
| "completions/min_length": 96.0, | |
| "completions/max_length": 193.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.5, | |
| "completions/min_terminated_length": 96.0, | |
| "completions/max_terminated_length": 193.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1564736366271973, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.490477613999246, | |
| "epoch": 0.10166666666666667, | |
| "step": 61 | |
| }, | |
| { | |
| "loss": -0.01383125875145197, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 3.475e-06, | |
| "num_tokens": 177812.0, | |
| "completions/mean_length": 124.0, | |
| "completions/min_length": 117.0, | |
| "completions/max_length": 135.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.0, | |
| "completions/min_terminated_length": 117.0, | |
| "completions/max_terminated_length": 135.0, | |
| "rewards/reward_fn/mean": 0.9737499952316284, | |
| "rewards/reward_fn/std": 0.010103654116392136, | |
| "reward": 0.9737499952316284, | |
| "reward_std": 0.010103654116392136, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0834579467773438, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.202116815002228, | |
| "epoch": 0.10333333333333333, | |
| "step": 62 | |
| }, | |
| { | |
| "loss": -0.058121208101511, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 3.45e-06, | |
| "num_tokens": 181135.0, | |
| "completions/mean_length": 156.75, | |
| "completions/min_length": 132.0, | |
| "completions/max_length": 186.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 156.75, | |
| "completions/min_terminated_length": 132.0, | |
| "completions/max_terminated_length": 186.0, | |
| "rewards/reward_fn/mean": 0.643875002861023, | |
| "rewards/reward_fn/std": 0.37093809247016907, | |
| "reward": 0.643875002861023, | |
| "reward_std": 0.3709380626678467, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3051375150680542, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.585933769001713, | |
| "epoch": 0.105, | |
| "step": 63 | |
| }, | |
| { | |
| "loss": -0.024057308211922646, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 3.4250000000000007e-06, | |
| "num_tokens": 184026.0, | |
| "completions/mean_length": 118.75, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 130.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 118.75, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 130.0, | |
| "rewards/reward_fn/mean": 0.32712501287460327, | |
| "rewards/reward_fn/std": 0.015750005841255188, | |
| "reward": 0.32712501287460327, | |
| "reward_std": 0.015750011429190636, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9585571885108948, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.43507832999967, | |
| "epoch": 0.10666666666666667, | |
| "step": 64 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.4000000000000005e-06, | |
| "num_tokens": 186604.0, | |
| "completions/mean_length": 124.5, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 147.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.5, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 147.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.098405122756958, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.2074764780009, | |
| "epoch": 0.10833333333333334, | |
| "step": 65 | |
| }, | |
| { | |
| "loss": -0.09480708837509155, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 3.3750000000000003e-06, | |
| "num_tokens": 189773.0, | |
| "completions/mean_length": 152.25, | |
| "completions/min_length": 124.0, | |
| "completions/max_length": 200.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 152.25, | |
| "completions/min_terminated_length": 124.0, | |
| "completions/max_terminated_length": 200.0, | |
| "rewards/reward_fn/mean": 0.3525000214576721, | |
| "rewards/reward_fn/std": 0.020207257941365242, | |
| "reward": 0.3525000214576721, | |
| "reward_std": 0.020207257941365242, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2371915578842163, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.151788433002366, | |
| "epoch": 0.11, | |
| "step": 66 | |
| }, | |
| { | |
| "loss": 0.18502062559127808, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 3.3500000000000005e-06, | |
| "num_tokens": 192564.0, | |
| "completions/mean_length": 135.75, | |
| "completions/min_length": 101.0, | |
| "completions/max_length": 186.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.75, | |
| "completions/min_terminated_length": 101.0, | |
| "completions/max_terminated_length": 186.0, | |
| "rewards/reward_fn/mean": 0.8162499666213989, | |
| "rewards/reward_fn/std": 0.29749998450279236, | |
| "reward": 0.8162499666213989, | |
| "reward_std": 0.29749998450279236, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0945377349853516, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.474493719000748, | |
| "epoch": 0.11166666666666666, | |
| "step": 67 | |
| }, | |
| { | |
| "loss": -0.08495711535215378, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 3.3250000000000004e-06, | |
| "num_tokens": 195718.0, | |
| "completions/mean_length": 139.5, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 160.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.5, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 160.0, | |
| "rewards/reward_fn/mean": 0.6412500143051147, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6412500143051147, | |
| "reward_std": 0.35391560196876526, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2198596000671387, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.923134037002455, | |
| "epoch": 0.11333333333333333, | |
| "step": 68 | |
| }, | |
| { | |
| "loss": -0.06370344012975693, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 3.3000000000000006e-06, | |
| "num_tokens": 198864.0, | |
| "completions/mean_length": 149.5, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 167.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 149.5, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 167.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.36373066902160645, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.36373066902160645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.370664358139038, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.89920946699931, | |
| "epoch": 0.115, | |
| "step": 69 | |
| }, | |
| { | |
| "loss": 0.02983768843114376, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 3.2750000000000004e-06, | |
| "num_tokens": 201847.0, | |
| "completions/mean_length": 174.75, | |
| "completions/min_length": 136.0, | |
| "completions/max_length": 254.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 174.75, | |
| "completions/min_terminated_length": 136.0, | |
| "completions/max_terminated_length": 254.0, | |
| "rewards/reward_fn/mean": 0.48899999260902405, | |
| "rewards/reward_fn/std": 0.3174019455909729, | |
| "reward": 0.48899999260902405, | |
| "reward_std": 0.3174019455909729, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2808506488800049, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 37.47509274999902, | |
| "epoch": 0.11666666666666667, | |
| "step": 70 | |
| }, | |
| { | |
| "loss": -0.03286806866526604, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 3.2500000000000002e-06, | |
| "num_tokens": 204520.0, | |
| "completions/mean_length": 110.25, | |
| "completions/min_length": 93.0, | |
| "completions/max_length": 123.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 110.25, | |
| "completions/min_terminated_length": 93.0, | |
| "completions/max_terminated_length": 123.0, | |
| "rewards/reward_fn/mean": 0.824999988079071, | |
| "rewards/reward_fn/std": 0.2800000011920929, | |
| "reward": 0.824999988079071, | |
| "reward_std": 0.2799999713897705, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0120017528533936, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.081610291999823, | |
| "epoch": 0.11833333333333333, | |
| "step": 71 | |
| }, | |
| { | |
| "loss": -0.12470149993896484, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 3.2250000000000005e-06, | |
| "num_tokens": 207550.0, | |
| "completions/mean_length": 156.5, | |
| "completions/min_length": 117.0, | |
| "completions/max_length": 177.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 156.5, | |
| "completions/min_terminated_length": 117.0, | |
| "completions/max_terminated_length": 177.0, | |
| "rewards/reward_fn/mean": 0.794374942779541, | |
| "rewards/reward_fn/std": 0.31834450364112854, | |
| "reward": 0.794374942779541, | |
| "reward_std": 0.3183445334434509, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1953434944152832, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.463810916997318, | |
| "epoch": 0.12, | |
| "step": 72 | |
| }, | |
| { | |
| "loss": -0.20025718212127686, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "num_tokens": 210661.0, | |
| "completions/mean_length": 143.75, | |
| "completions/min_length": 108.0, | |
| "completions/max_length": 189.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 143.75, | |
| "completions/min_terminated_length": 108.0, | |
| "completions/max_terminated_length": 189.0, | |
| "rewards/reward_fn/mean": 0.6324999928474426, | |
| "rewards/reward_fn/std": 0.34352341294288635, | |
| "reward": 0.6324999928474426, | |
| "reward_std": 0.34352341294288635, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2760404348373413, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.05823507000241, | |
| "epoch": 0.12166666666666667, | |
| "step": 73 | |
| }, | |
| { | |
| "loss": -0.06791014224290848, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 3.175e-06, | |
| "num_tokens": 213910.0, | |
| "completions/mean_length": 169.25, | |
| "completions/min_length": 135.0, | |
| "completions/max_length": 217.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 169.25, | |
| "completions/min_terminated_length": 135.0, | |
| "completions/max_terminated_length": 217.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1246460676193237, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.710728935002408, | |
| "epoch": 0.12333333333333334, | |
| "step": 74 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.1500000000000003e-06, | |
| "num_tokens": 216608.0, | |
| "completions/mean_length": 122.5, | |
| "completions/min_length": 91.0, | |
| "completions/max_length": 145.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.5, | |
| "completions/min_terminated_length": 91.0, | |
| "completions/max_terminated_length": 145.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.098929762840271, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.737217218000296, | |
| "epoch": 0.125, | |
| "step": 75 | |
| }, | |
| { | |
| "loss": 0.14531265199184418, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 3.125e-06, | |
| "num_tokens": 219434.0, | |
| "completions/mean_length": 146.5, | |
| "completions/min_length": 104.0, | |
| "completions/max_length": 170.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.5, | |
| "completions/min_terminated_length": 104.0, | |
| "completions/max_terminated_length": 170.0, | |
| "rewards/reward_fn/mean": 0.5012500286102295, | |
| "rewards/reward_fn/std": 0.3096066117286682, | |
| "reward": 0.5012500286102295, | |
| "reward_std": 0.3096066117286682, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0769822597503662, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.090116903000308, | |
| "epoch": 0.12666666666666668, | |
| "step": 76 | |
| }, | |
| { | |
| "loss": -0.00801115296781063, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 3.1000000000000004e-06, | |
| "num_tokens": 222356.0, | |
| "completions/mean_length": 121.5, | |
| "completions/min_length": 113.0, | |
| "completions/max_length": 131.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.5, | |
| "completions/min_terminated_length": 113.0, | |
| "completions/max_terminated_length": 131.0, | |
| "rewards/reward_fn/mean": 0.6657500267028809, | |
| "rewards/reward_fn/std": 0.3621724843978882, | |
| "reward": 0.6657500267028809, | |
| "reward_std": 0.3621724545955658, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9762334823608398, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.045034541999485, | |
| "epoch": 0.12833333333333333, | |
| "step": 77 | |
| }, | |
| { | |
| "loss": -0.055578697472810745, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 3.075e-06, | |
| "num_tokens": 225550.0, | |
| "completions/mean_length": 124.5, | |
| "completions/min_length": 77.0, | |
| "completions/max_length": 165.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.5, | |
| "completions/min_terminated_length": 77.0, | |
| "completions/max_terminated_length": 165.0, | |
| "rewards/reward_fn/mean": 0.7949999570846558, | |
| "rewards/reward_fn/std": 0.30700162053108215, | |
| "reward": 0.7949999570846558, | |
| "reward_std": 0.30700162053108215, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2096550464630127, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.065289067002595, | |
| "epoch": 0.13, | |
| "step": 78 | |
| }, | |
| { | |
| "loss": -0.197392076253891, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 3.05e-06, | |
| "num_tokens": 228395.0, | |
| "completions/mean_length": 132.25, | |
| "completions/min_length": 101.0, | |
| "completions/max_length": 185.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.25, | |
| "completions/min_terminated_length": 101.0, | |
| "completions/max_terminated_length": 185.0, | |
| "rewards/reward_fn/mean": 0.48374998569488525, | |
| "rewards/reward_fn/std": 0.3212572932243347, | |
| "reward": 0.48374998569488525, | |
| "reward_std": 0.3212572932243347, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2732487916946411, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.037959209999826, | |
| "epoch": 0.13166666666666665, | |
| "step": 79 | |
| }, | |
| { | |
| "loss": 0.15605716407299042, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 3.0250000000000003e-06, | |
| "num_tokens": 230893.0, | |
| "completions/mean_length": 110.5, | |
| "completions/min_length": 76.0, | |
| "completions/max_length": 136.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 110.5, | |
| "completions/min_terminated_length": 76.0, | |
| "completions/max_terminated_length": 136.0, | |
| "rewards/reward_fn/mean": 0.4865000247955322, | |
| "rewards/reward_fn/std": 0.30300000309944153, | |
| "reward": 0.4865000247955322, | |
| "reward_std": 0.30300000309944153, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0314747095108032, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.13105025200275, | |
| "epoch": 0.13333333333333333, | |
| "step": 80 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3e-06, | |
| "num_tokens": 233867.0, | |
| "completions/mean_length": 141.5, | |
| "completions/min_length": 108.0, | |
| "completions/max_length": 191.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 141.5, | |
| "completions/min_terminated_length": 108.0, | |
| "completions/max_terminated_length": 191.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.3124823570251465, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.160457204001432, | |
| "epoch": 0.135, | |
| "step": 81 | |
| }, | |
| { | |
| "loss": 0.02396450564265251, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.9750000000000003e-06, | |
| "num_tokens": 236651.0, | |
| "completions/mean_length": 146.0, | |
| "completions/min_length": 126.0, | |
| "completions/max_length": 167.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.0, | |
| "completions/min_terminated_length": 126.0, | |
| "completions/max_terminated_length": 167.0, | |
| "rewards/reward_fn/mean": 0.8162499666213989, | |
| "rewards/reward_fn/std": 0.29749998450279236, | |
| "reward": 0.8162499666213989, | |
| "reward_std": 0.29749998450279236, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0319082736968994, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.461930845001916, | |
| "epoch": 0.13666666666666666, | |
| "step": 82 | |
| }, | |
| { | |
| "loss": 0.06299065053462982, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 2.95e-06, | |
| "num_tokens": 239411.0, | |
| "completions/mean_length": 149.0, | |
| "completions/min_length": 131.0, | |
| "completions/max_length": 179.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 149.0, | |
| "completions/min_terminated_length": 131.0, | |
| "completions/max_terminated_length": 179.0, | |
| "rewards/reward_fn/mean": 0.5618749856948853, | |
| "rewards/reward_fn/std": 0.47596120834350586, | |
| "reward": 0.5618749856948853, | |
| "reward_std": 0.47596120834350586, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2378220558166504, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.732968376996723, | |
| "epoch": 0.13833333333333334, | |
| "step": 83 | |
| }, | |
| { | |
| "loss": -0.08758159726858139, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 2.925e-06, | |
| "num_tokens": 242474.0, | |
| "completions/mean_length": 172.75, | |
| "completions/min_length": 142.0, | |
| "completions/max_length": 191.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 172.75, | |
| "completions/min_terminated_length": 142.0, | |
| "completions/max_terminated_length": 191.0, | |
| "rewards/reward_fn/mean": 0.794374942779541, | |
| "rewards/reward_fn/std": 0.31834450364112854, | |
| "reward": 0.794374942779541, | |
| "reward_std": 0.3183445334434509, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1778147220611572, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.395576832001098, | |
| "epoch": 0.14, | |
| "step": 84 | |
| }, | |
| { | |
| "loss": -0.15402138233184814, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.9e-06, | |
| "num_tokens": 245432.0, | |
| "completions/mean_length": 127.5, | |
| "completions/min_length": 88.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.5, | |
| "completions/min_terminated_length": 88.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.017499998211860657, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.017500003799796104, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.192873477935791, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.462746484998206, | |
| "epoch": 0.14166666666666666, | |
| "step": 85 | |
| }, | |
| { | |
| "loss": 0.21816286444664001, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.875e-06, | |
| "num_tokens": 248295.0, | |
| "completions/mean_length": 135.75, | |
| "completions/min_length": 103.0, | |
| "completions/max_length": 195.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.75, | |
| "completions/min_terminated_length": 103.0, | |
| "completions/max_terminated_length": 195.0, | |
| "rewards/reward_fn/mean": 0.8075000047683716, | |
| "rewards/reward_fn/std": 0.3149999976158142, | |
| "reward": 0.8075000047683716, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3281114101409912, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.33027525199941, | |
| "epoch": 0.14333333333333334, | |
| "step": 86 | |
| }, | |
| { | |
| "loss": -0.050082288682460785, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 2.85e-06, | |
| "num_tokens": 251030.0, | |
| "completions/mean_length": 124.75, | |
| "completions/min_length": 111.0, | |
| "completions/max_length": 153.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.75, | |
| "completions/min_terminated_length": 111.0, | |
| "completions/max_terminated_length": 153.0, | |
| "rewards/reward_fn/mean": 0.9474999904632568, | |
| "rewards/reward_fn/std": 0.0202072411775589, | |
| "reward": 0.9474999904632568, | |
| "reward_std": 0.0202072411775589, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.168349266052246, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.54528503599795, | |
| "epoch": 0.145, | |
| "step": 87 | |
| }, | |
| { | |
| "loss": -0.04842241480946541, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.825e-06, | |
| "num_tokens": 253533.0, | |
| "completions/mean_length": 109.75, | |
| "completions/min_length": 90.0, | |
| "completions/max_length": 128.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 109.75, | |
| "completions/min_terminated_length": 90.0, | |
| "completions/max_terminated_length": 128.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9876170754432678, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 18.978010971000913, | |
| "epoch": 0.14666666666666667, | |
| "step": 88 | |
| }, | |
| { | |
| "loss": 0.019071724265813828, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "num_tokens": 256532.0, | |
| "completions/mean_length": 149.75, | |
| "completions/min_length": 138.0, | |
| "completions/max_length": 166.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 149.75, | |
| "completions/min_terminated_length": 138.0, | |
| "completions/max_terminated_length": 166.0, | |
| "rewards/reward_fn/mean": 0.8206250071525574, | |
| "rewards/reward_fn/std": 0.3005298972129822, | |
| "reward": 0.8206250071525574, | |
| "reward_std": 0.3005298972129822, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.289428949356079, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.60237361599866, | |
| "epoch": 0.14833333333333334, | |
| "step": 89 | |
| }, | |
| { | |
| "loss": 0.07423283159732819, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.7750000000000005e-06, | |
| "num_tokens": 259055.0, | |
| "completions/mean_length": 121.75, | |
| "completions/min_length": 103.0, | |
| "completions/max_length": 142.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.75, | |
| "completions/min_terminated_length": 103.0, | |
| "completions/max_terminated_length": 142.0, | |
| "rewards/reward_fn/mean": 0.2983750104904175, | |
| "rewards/reward_fn/std": 0.08031539618968964, | |
| "reward": 0.2983750104904175, | |
| "reward_std": 0.08031539618968964, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0802006721496582, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.080729645000247, | |
| "epoch": 0.15, | |
| "step": 90 | |
| }, | |
| { | |
| "loss": 0.033602241426706314, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.7500000000000004e-06, | |
| "num_tokens": 261798.0, | |
| "completions/mean_length": 125.75, | |
| "completions/min_length": 105.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.75, | |
| "completions/min_terminated_length": 105.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.8031250238418579, | |
| "rewards/reward_fn/std": 0.3128456473350525, | |
| "reward": 0.8031250238418579, | |
| "reward_std": 0.3128456473350525, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1015703678131104, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.45034698100062, | |
| "epoch": 0.15166666666666667, | |
| "step": 91 | |
| }, | |
| { | |
| "loss": 0.02976590394973755, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 2.7250000000000006e-06, | |
| "num_tokens": 264628.0, | |
| "completions/mean_length": 108.5, | |
| "completions/min_length": 83.0, | |
| "completions/max_length": 145.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 108.5, | |
| "completions/min_terminated_length": 83.0, | |
| "completions/max_terminated_length": 145.0, | |
| "rewards/reward_fn/mean": 0.9728749990463257, | |
| "rewards/reward_fn/std": 0.015750020742416382, | |
| "reward": 0.9728749990463257, | |
| "reward_std": 0.015750011429190636, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1214567422866821, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.66323952300081, | |
| "epoch": 0.15333333333333332, | |
| "step": 92 | |
| }, | |
| { | |
| "loss": 0.0016713386867195368, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 2.7000000000000004e-06, | |
| "num_tokens": 267337.0, | |
| "completions/mean_length": 119.25, | |
| "completions/min_length": 101.0, | |
| "completions/max_length": 146.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 119.25, | |
| "completions/min_terminated_length": 101.0, | |
| "completions/max_terminated_length": 146.0, | |
| "rewards/reward_fn/mean": 0.9772499799728394, | |
| "rewards/reward_fn/std": 0.015256169252097607, | |
| "reward": 0.9772499799728394, | |
| "reward_std": 0.015256185084581375, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1013864278793335, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.21387937399777, | |
| "epoch": 0.155, | |
| "step": 93 | |
| }, | |
| { | |
| "loss": 0.009300082921981812, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 2.6750000000000002e-06, | |
| "num_tokens": 270352.0, | |
| "completions/mean_length": 126.75, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 142.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 126.75, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 142.0, | |
| "rewards/reward_fn/mean": 0.48374998569488525, | |
| "rewards/reward_fn/std": 0.3209393620491028, | |
| "reward": 0.48374998569488525, | |
| "reward_std": 0.3209393620491028, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1367976665496826, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.162854308000533, | |
| "epoch": 0.15666666666666668, | |
| "step": 94 | |
| }, | |
| { | |
| "loss": -0.04094283655285835, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 2.6500000000000005e-06, | |
| "num_tokens": 273143.0, | |
| "completions/mean_length": 123.75, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 148.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 123.75, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 148.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0440210103988647, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.38568425199992, | |
| "epoch": 0.15833333333333333, | |
| "step": 95 | |
| }, | |
| { | |
| "loss": 0.15625080466270447, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 2.6250000000000003e-06, | |
| "num_tokens": 275882.0, | |
| "completions/mean_length": 140.75, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 160.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 140.75, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 160.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.2921329438686371, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.2921329736709595, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1425111293792725, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.621867647998442, | |
| "epoch": 0.16, | |
| "step": 96 | |
| }, | |
| { | |
| "loss": 0.12305271625518799, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.6e-06, | |
| "num_tokens": 278752.0, | |
| "completions/mean_length": 165.5, | |
| "completions/min_length": 100.0, | |
| "completions/max_length": 217.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 165.5, | |
| "completions/min_terminated_length": 100.0, | |
| "completions/max_terminated_length": 217.0, | |
| "rewards/reward_fn/mean": 0.6497499942779541, | |
| "rewards/reward_fn/std": 0.36401957273483276, | |
| "reward": 0.6497499942779541, | |
| "reward_std": 0.3640195429325104, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1441240310668945, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 32.032986541002174, | |
| "epoch": 0.16166666666666665, | |
| "step": 97 | |
| }, | |
| { | |
| "loss": -0.005260481499135494, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 2.5750000000000003e-06, | |
| "num_tokens": 280888.0, | |
| "completions/mean_length": 135.0, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.0, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.6850000023841858, | |
| "rewards/reward_fn/std": 0.29560670256614685, | |
| "reward": 0.6850000023841858, | |
| "reward_std": 0.29560673236846924, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.078916311264038, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.477214457998343, | |
| "epoch": 0.16333333333333333, | |
| "step": 98 | |
| }, | |
| { | |
| "loss": 0.1132018193602562, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 2.55e-06, | |
| "num_tokens": 283709.0, | |
| "completions/mean_length": 130.25, | |
| "completions/min_length": 102.0, | |
| "completions/max_length": 150.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.25, | |
| "completions/min_terminated_length": 102.0, | |
| "completions/max_terminated_length": 150.0, | |
| "rewards/reward_fn/mean": 0.6368750333786011, | |
| "rewards/reward_fn/std": 0.35040080547332764, | |
| "reward": 0.6368750333786011, | |
| "reward_std": 0.35040080547332764, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.333487868309021, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.616071093001665, | |
| "epoch": 0.165, | |
| "step": 99 | |
| }, | |
| { | |
| "loss": -0.061634328216314316, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 2.5250000000000004e-06, | |
| "num_tokens": 286963.0, | |
| "completions/mean_length": 130.5, | |
| "completions/min_length": 115.0, | |
| "completions/max_length": 141.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.5, | |
| "completions/min_terminated_length": 115.0, | |
| "completions/max_terminated_length": 141.0, | |
| "rewards/reward_fn/mean": 0.3062500059604645, | |
| "rewards/reward_fn/std": 0.08250000327825546, | |
| "reward": 0.3062500059604645, | |
| "reward_std": 0.08250000327825546, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2107280492782593, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.43296745800035, | |
| "epoch": 0.16666666666666666, | |
| "step": 100 | |
| }, | |
| { | |
| "loss": -0.01924452930688858, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.5e-06, | |
| "num_tokens": 289957.0, | |
| "completions/mean_length": 167.5, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 190.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 167.5, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 190.0, | |
| "rewards/reward_fn/mean": 0.7856249809265137, | |
| "rewards/reward_fn/std": 0.30188554525375366, | |
| "reward": 0.7856249809265137, | |
| "reward_std": 0.30188557505607605, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2127794027328491, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.20487242300078, | |
| "epoch": 0.16833333333333333, | |
| "step": 101 | |
| }, | |
| { | |
| "loss": 0.014912793412804604, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.475e-06, | |
| "num_tokens": 292740.0, | |
| "completions/mean_length": 163.75, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 247.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 163.75, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 247.0, | |
| "rewards/reward_fn/mean": 0.437250018119812, | |
| "rewards/reward_fn/std": 0.21189679205417633, | |
| "reward": 0.437250018119812, | |
| "reward_std": 0.21189679205417633, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3349406719207764, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 35.148211256997456, | |
| "epoch": 0.17, | |
| "step": 102 | |
| }, | |
| { | |
| "loss": 0.09064196795225143, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "num_tokens": 295698.0, | |
| "completions/mean_length": 138.5, | |
| "completions/min_length": 123.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.5, | |
| "completions/min_terminated_length": 123.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.36373066902160645, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.36373066902160645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2255961894989014, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.0402411569994, | |
| "epoch": 0.17166666666666666, | |
| "step": 103 | |
| }, | |
| { | |
| "loss": -0.10342099517583847, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.425e-06, | |
| "num_tokens": 298992.0, | |
| "completions/mean_length": 183.5, | |
| "completions/min_length": 122.0, | |
| "completions/max_length": 220.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 183.5, | |
| "completions/min_terminated_length": 122.0, | |
| "completions/max_terminated_length": 220.0, | |
| "rewards/reward_fn/mean": 0.47462502121925354, | |
| "rewards/reward_fn/std": 0.29103362560272217, | |
| "reward": 0.47462502121925354, | |
| "reward_std": 0.2910335958003998, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.368587613105774, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 32.10494950900102, | |
| "epoch": 0.17333333333333334, | |
| "step": 104 | |
| }, | |
| { | |
| "loss": 0.1281687319278717, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "num_tokens": 301769.0, | |
| "completions/mean_length": 133.25, | |
| "completions/min_length": 92.0, | |
| "completions/max_length": 172.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 133.25, | |
| "completions/min_terminated_length": 92.0, | |
| "completions/max_terminated_length": 172.0, | |
| "rewards/reward_fn/mean": 0.9650000333786011, | |
| "rewards/reward_fn/std": 0.02474874258041382, | |
| "reward": 0.9650000333786011, | |
| "reward_std": 0.02474874258041382, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0861597061157227, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.204073330998654, | |
| "epoch": 0.175, | |
| "step": 105 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.375e-06, | |
| "num_tokens": 304607.0, | |
| "completions/mean_length": 146.5, | |
| "completions/min_length": 111.0, | |
| "completions/max_length": 173.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.5, | |
| "completions/min_terminated_length": 111.0, | |
| "completions/max_terminated_length": 173.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1364024877548218, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.721460639000725, | |
| "epoch": 0.17666666666666667, | |
| "step": 106 | |
| }, | |
| { | |
| "loss": -0.1601293683052063, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 2.35e-06, | |
| "num_tokens": 307800.0, | |
| "completions/mean_length": 159.25, | |
| "completions/min_length": 132.0, | |
| "completions/max_length": 210.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 159.25, | |
| "completions/min_terminated_length": 132.0, | |
| "completions/max_terminated_length": 210.0, | |
| "rewards/reward_fn/mean": 0.4881249964237213, | |
| "rewards/reward_fn/std": 0.318023681640625, | |
| "reward": 0.4881249964237213, | |
| "reward_std": 0.3180236518383026, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2100403308868408, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.79101676999926, | |
| "epoch": 0.17833333333333334, | |
| "step": 107 | |
| }, | |
| { | |
| "loss": 0.05961640179157257, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.325e-06, | |
| "num_tokens": 310517.0, | |
| "completions/mean_length": 140.25, | |
| "completions/min_length": 117.0, | |
| "completions/max_length": 183.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 140.25, | |
| "completions/min_terminated_length": 117.0, | |
| "completions/max_terminated_length": 183.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.010103637352585793, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.010103637352585793, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.120354175567627, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.117513699002302, | |
| "epoch": 0.18, | |
| "step": 108 | |
| }, | |
| { | |
| "loss": 0.008527176454663277, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.3000000000000004e-06, | |
| "num_tokens": 312970.0, | |
| "completions/mean_length": 147.25, | |
| "completions/min_length": 143.0, | |
| "completions/max_length": 154.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 147.25, | |
| "completions/min_terminated_length": 143.0, | |
| "completions/max_terminated_length": 154.0, | |
| "rewards/reward_fn/mean": 0.6325000524520874, | |
| "rewards/reward_fn/std": 0.323947012424469, | |
| "reward": 0.6325000524520874, | |
| "reward_std": 0.323947012424469, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1995975971221924, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.515986416998203, | |
| "epoch": 0.18166666666666667, | |
| "step": 109 | |
| }, | |
| { | |
| "loss": 0.23347245156764984, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 2.2750000000000002e-06, | |
| "num_tokens": 315998.0, | |
| "completions/mean_length": 139.0, | |
| "completions/min_length": 90.0, | |
| "completions/max_length": 177.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.0, | |
| "completions/min_terminated_length": 90.0, | |
| "completions/max_terminated_length": 177.0, | |
| "rewards/reward_fn/mean": 0.6587499976158142, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6587499976158142, | |
| "reward_std": 0.35391557216644287, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2389236688613892, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.32129164900107, | |
| "epoch": 0.18333333333333332, | |
| "step": 110 | |
| }, | |
| { | |
| "loss": -0.00834303256124258, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.25e-06, | |
| "num_tokens": 319005.0, | |
| "completions/mean_length": 131.75, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 148.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 131.75, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 148.0, | |
| "rewards/reward_fn/mean": 0.9798749685287476, | |
| "rewards/reward_fn/std": 0.010451287031173706, | |
| "reward": 0.9798749685287476, | |
| "reward_std": 0.010451287031173706, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0297316312789917, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.76591824099887, | |
| "epoch": 0.185, | |
| "step": 111 | |
| }, | |
| { | |
| "loss": -0.04647402837872505, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.2250000000000003e-06, | |
| "num_tokens": 322001.0, | |
| "completions/mean_length": 117.0, | |
| "completions/min_length": 96.0, | |
| "completions/max_length": 132.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 117.0, | |
| "completions/min_terminated_length": 96.0, | |
| "completions/max_terminated_length": 132.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9716232419013977, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.865096360997995, | |
| "epoch": 0.18666666666666668, | |
| "step": 112 | |
| }, | |
| { | |
| "loss": -0.06792350113391876, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 2.2e-06, | |
| "num_tokens": 324989.0, | |
| "completions/mean_length": 142.0, | |
| "completions/min_length": 103.0, | |
| "completions/max_length": 194.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 142.0, | |
| "completions/min_terminated_length": 103.0, | |
| "completions/max_terminated_length": 194.0, | |
| "rewards/reward_fn/mean": 0.7699999809265137, | |
| "rewards/reward_fn/std": 0.29849621653556824, | |
| "reward": 0.7699999809265137, | |
| "reward_std": 0.29849621653556824, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1111323833465576, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.331891907997488, | |
| "epoch": 0.18833333333333332, | |
| "step": 113 | |
| }, | |
| { | |
| "loss": 0.041515663266181946, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 2.1750000000000004e-06, | |
| "num_tokens": 327838.0, | |
| "completions/mean_length": 122.25, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 155.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.25, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 155.0, | |
| "rewards/reward_fn/mean": 0.8206250071525574, | |
| "rewards/reward_fn/std": 0.3005298972129822, | |
| "reward": 0.8206250071525574, | |
| "reward_std": 0.3005298972129822, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0778565406799316, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.199242210001103, | |
| "epoch": 0.19, | |
| "step": 114 | |
| }, | |
| { | |
| "loss": -0.036224182695150375, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.15e-06, | |
| "num_tokens": 330963.0, | |
| "completions/mean_length": 135.25, | |
| "completions/min_length": 119.0, | |
| "completions/max_length": 147.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.25, | |
| "completions/min_terminated_length": 119.0, | |
| "completions/max_terminated_length": 147.0, | |
| "rewards/reward_fn/mean": 0.4881250262260437, | |
| "rewards/reward_fn/std": 0.318023681640625, | |
| "reward": 0.4881250262260437, | |
| "reward_std": 0.318023681640625, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2841877937316895, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.69130362800206, | |
| "epoch": 0.19166666666666668, | |
| "step": 115 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.125e-06, | |
| "num_tokens": 333908.0, | |
| "completions/mean_length": 144.25, | |
| "completions/min_length": 115.0, | |
| "completions/max_length": 159.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 144.25, | |
| "completions/min_terminated_length": 115.0, | |
| "completions/max_terminated_length": 159.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1361361742019653, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.554109609001898, | |
| "epoch": 0.19333333333333333, | |
| "step": 116 | |
| }, | |
| { | |
| "loss": -0.01241180207580328, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.1000000000000002e-06, | |
| "num_tokens": 336735.0, | |
| "completions/mean_length": 172.75, | |
| "completions/min_length": 131.0, | |
| "completions/max_length": 217.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 172.75, | |
| "completions/min_terminated_length": 131.0, | |
| "completions/max_terminated_length": 217.0, | |
| "rewards/reward_fn/mean": 0.6412500143051147, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6412500143051147, | |
| "reward_std": 0.35391560196876526, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.189446210861206, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 30.631830683996668, | |
| "epoch": 0.195, | |
| "step": 117 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.075e-06, | |
| "num_tokens": 338998.0, | |
| "completions/mean_length": 137.75, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 177.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.75, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 177.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.0830141305923462, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.650718585002323, | |
| "epoch": 0.19666666666666666, | |
| "step": 118 | |
| }, | |
| { | |
| "loss": 0.018085498362779617, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.05e-06, | |
| "num_tokens": 342026.0, | |
| "completions/mean_length": 146.0, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 169.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 146.0, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 169.0, | |
| "rewards/reward_fn/mean": 0.8118749856948853, | |
| "rewards/reward_fn/std": 0.318023681640625, | |
| "reward": 0.8118749856948853, | |
| "reward_std": 0.318023681640625, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2152268886566162, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.480774142000882, | |
| "epoch": 0.19833333333333333, | |
| "step": 119 | |
| }, | |
| { | |
| "loss": -0.015368443913757801, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.025e-06, | |
| "num_tokens": 344997.0, | |
| "completions/mean_length": 170.75, | |
| "completions/min_length": 159.0, | |
| "completions/max_length": 180.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 170.75, | |
| "completions/min_terminated_length": 159.0, | |
| "completions/max_terminated_length": 180.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2765536308288574, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.798823175002326, | |
| "epoch": 0.2, | |
| "step": 120 | |
| }, | |
| { | |
| "loss": -0.10509224236011505, | |
| "grad_norm": 0.375, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "num_tokens": 347101.0, | |
| "completions/mean_length": 123.0, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 123.0, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.9562499523162842, | |
| "rewards/reward_fn/std": 0.017499983310699463, | |
| "reward": 0.9562499523162842, | |
| "reward_std": 0.017499983310699463, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0496407747268677, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.680493171999842, | |
| "epoch": 0.20166666666666666, | |
| "step": 121 | |
| }, | |
| { | |
| "loss": 0.060011204332113266, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.975e-06, | |
| "num_tokens": 350008.0, | |
| "completions/mean_length": 129.75, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 129.75, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2729018926620483, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.95859299299991, | |
| "epoch": 0.20333333333333334, | |
| "step": 122 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.9500000000000004e-06, | |
| "num_tokens": 352667.0, | |
| "completions/mean_length": 110.75, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 120.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 110.75, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 120.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.0064460039138794, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 18.586940206998406, | |
| "epoch": 0.205, | |
| "step": 123 | |
| }, | |
| { | |
| "loss": -0.09901498258113861, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 1.925e-06, | |
| "num_tokens": 355204.0, | |
| "completions/mean_length": 117.25, | |
| "completions/min_length": 94.0, | |
| "completions/max_length": 144.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 117.25, | |
| "completions/min_terminated_length": 94.0, | |
| "completions/max_terminated_length": 144.0, | |
| "rewards/reward_fn/mean": 0.29750001430511475, | |
| "rewards/reward_fn/std": 0.07500001043081284, | |
| "reward": 0.29750001430511475, | |
| "reward_std": 0.07500000298023224, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0144492387771606, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.89907346399923, | |
| "epoch": 0.20666666666666667, | |
| "step": 124 | |
| }, | |
| { | |
| "loss": 0.01803763210773468, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.9000000000000002e-06, | |
| "num_tokens": 358015.0, | |
| "completions/mean_length": 130.75, | |
| "completions/min_length": 118.0, | |
| "completions/max_length": 149.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.75, | |
| "completions/min_terminated_length": 118.0, | |
| "completions/max_terminated_length": 149.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.010103637352585793, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.010103637352585793, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1170016527175903, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.859302955999738, | |
| "epoch": 0.20833333333333334, | |
| "step": 125 | |
| }, | |
| { | |
| "loss": 0.1318972259759903, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "num_tokens": 360842.0, | |
| "completions/mean_length": 122.75, | |
| "completions/min_length": 90.0, | |
| "completions/max_length": 148.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.75, | |
| "completions/min_terminated_length": 90.0, | |
| "completions/max_terminated_length": 148.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.15077543258667, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.222802561998833, | |
| "epoch": 0.21, | |
| "step": 126 | |
| }, | |
| { | |
| "loss": -0.06622515618801117, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.85e-06, | |
| "num_tokens": 364028.0, | |
| "completions/mean_length": 207.5, | |
| "completions/min_length": 180.0, | |
| "completions/max_length": 221.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 207.5, | |
| "completions/min_terminated_length": 180.0, | |
| "completions/max_terminated_length": 221.0, | |
| "rewards/reward_fn/mean": 0.8159999847412109, | |
| "rewards/reward_fn/std": 0.297333687543869, | |
| "reward": 0.8159999847412109, | |
| "reward_std": 0.297333687543869, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2833231687545776, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 33.3597098350001, | |
| "epoch": 0.21166666666666667, | |
| "step": 127 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.825e-06, | |
| "num_tokens": 366980.0, | |
| "completions/mean_length": 115.0, | |
| "completions/min_length": 91.0, | |
| "completions/max_length": 155.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 115.0, | |
| "completions/min_terminated_length": 91.0, | |
| "completions/max_terminated_length": 155.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 0.9939418435096741, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.618680087998655, | |
| "epoch": 0.21333333333333335, | |
| "step": 128 | |
| }, | |
| { | |
| "loss": 0.16602951288223267, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "num_tokens": 369676.0, | |
| "completions/mean_length": 144.0, | |
| "completions/min_length": 74.0, | |
| "completions/max_length": 193.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 144.0, | |
| "completions/min_terminated_length": 74.0, | |
| "completions/max_terminated_length": 193.0, | |
| "rewards/reward_fn/mean": 0.8041249513626099, | |
| "rewards/reward_fn/std": 0.31281474232673645, | |
| "reward": 0.8041249513626099, | |
| "reward_std": 0.31281474232673645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1560587882995605, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.466979378998076, | |
| "epoch": 0.215, | |
| "step": 129 | |
| }, | |
| { | |
| "loss": -0.006709250155836344, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.7750000000000002e-06, | |
| "num_tokens": 372676.0, | |
| "completions/mean_length": 149.0, | |
| "completions/min_length": 112.0, | |
| "completions/max_length": 204.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 149.0, | |
| "completions/min_terminated_length": 112.0, | |
| "completions/max_terminated_length": 204.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2238303422927856, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 30.414441388998966, | |
| "epoch": 0.21666666666666667, | |
| "step": 130 | |
| }, | |
| { | |
| "loss": -0.0529935248196125, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.75e-06, | |
| "num_tokens": 375679.0, | |
| "completions/mean_length": 160.75, | |
| "completions/min_length": 129.0, | |
| "completions/max_length": 195.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 160.75, | |
| "completions/min_terminated_length": 129.0, | |
| "completions/max_terminated_length": 195.0, | |
| "rewards/reward_fn/mean": 0.9772499799728394, | |
| "rewards/reward_fn/std": 0.015256169252097607, | |
| "reward": 0.9772499799728394, | |
| "reward_std": 0.015256185084581375, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.114032506942749, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.930023082000844, | |
| "epoch": 0.21833333333333332, | |
| "step": 131 | |
| }, | |
| { | |
| "loss": -0.016852514818310738, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 1.725e-06, | |
| "num_tokens": 378594.0, | |
| "completions/mean_length": 155.75, | |
| "completions/min_length": 130.0, | |
| "completions/max_length": 187.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 155.75, | |
| "completions/min_terminated_length": 130.0, | |
| "completions/max_terminated_length": 187.0, | |
| "rewards/reward_fn/mean": 0.32362502813339233, | |
| "rewards/reward_fn/std": 0.01381650660187006, | |
| "reward": 0.32362502813339233, | |
| "reward_std": 0.01381650846451521, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0404397249221802, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.9692573909997, | |
| "epoch": 0.22, | |
| "step": 132 | |
| }, | |
| { | |
| "loss": -0.021115276962518692, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.7000000000000002e-06, | |
| "num_tokens": 381292.0, | |
| "completions/mean_length": 129.5, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 158.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 129.5, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 158.0, | |
| "rewards/reward_fn/mean": 0.34375, | |
| "rewards/reward_fn/std": 0.017499998211860657, | |
| "reward": 0.34375, | |
| "reward_std": 0.017500003799796104, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1616339683532715, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.989496837002662, | |
| "epoch": 0.22166666666666668, | |
| "step": 133 | |
| }, | |
| { | |
| "loss": 0.024493159726262093, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.6750000000000003e-06, | |
| "num_tokens": 384255.0, | |
| "completions/mean_length": 137.75, | |
| "completions/min_length": 131.0, | |
| "completions/max_length": 145.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.75, | |
| "completions/min_terminated_length": 131.0, | |
| "completions/max_terminated_length": 145.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1078652143478394, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.140087828000105, | |
| "epoch": 0.22333333333333333, | |
| "step": 134 | |
| }, | |
| { | |
| "loss": -0.05794854834675789, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.6500000000000003e-06, | |
| "num_tokens": 386962.0, | |
| "completions/mean_length": 130.75, | |
| "completions/min_length": 103.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.75, | |
| "completions/min_terminated_length": 103.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.9728749990463257, | |
| "rewards/reward_fn/std": 0.015750020742416382, | |
| "reward": 0.9728749990463257, | |
| "reward_std": 0.015750011429190636, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1481397151947021, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.68116860399823, | |
| "epoch": 0.225, | |
| "step": 135 | |
| }, | |
| { | |
| "loss": -0.008061882108449936, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1.6250000000000001e-06, | |
| "num_tokens": 389566.0, | |
| "completions/mean_length": 124.0, | |
| "completions/min_length": 121.0, | |
| "completions/max_length": 130.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.0, | |
| "completions/min_terminated_length": 121.0, | |
| "completions/max_terminated_length": 130.0, | |
| "rewards/reward_fn/mean": 0.8075000047683716, | |
| "rewards/reward_fn/std": 0.3149999976158142, | |
| "reward": 0.8075000047683716, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2222754955291748, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 19.605333135001274, | |
| "epoch": 0.22666666666666666, | |
| "step": 136 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "num_tokens": 392290.0, | |
| "completions/mean_length": 137.0, | |
| "completions/min_length": 128.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.0, | |
| "completions/min_terminated_length": 128.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1045880317687988, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.16470382499756, | |
| "epoch": 0.22833333333333333, | |
| "step": 137 | |
| }, | |
| { | |
| "loss": 0.07870414108037949, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.5750000000000002e-06, | |
| "num_tokens": 394880.0, | |
| "completions/mean_length": 129.5, | |
| "completions/min_length": 105.0, | |
| "completions/max_length": 153.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 129.5, | |
| "completions/min_terminated_length": 105.0, | |
| "completions/max_terminated_length": 153.0, | |
| "rewards/reward_fn/mean": 0.3062500059604645, | |
| "rewards/reward_fn/std": 0.08250000327825546, | |
| "reward": 0.3062500059604645, | |
| "reward_std": 0.08250001072883606, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1190224885940552, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.143214091000118, | |
| "epoch": 0.23, | |
| "step": 138 | |
| }, | |
| { | |
| "loss": -0.30225688219070435, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 1.5500000000000002e-06, | |
| "num_tokens": 397699.0, | |
| "completions/mean_length": 139.75, | |
| "completions/min_length": 79.0, | |
| "completions/max_length": 223.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 139.75, | |
| "completions/min_terminated_length": 79.0, | |
| "completions/max_terminated_length": 223.0, | |
| "rewards/reward_fn/mean": 0.47712501883506775, | |
| "rewards/reward_fn/std": 0.30046311020851135, | |
| "reward": 0.47712501883506775, | |
| "reward_std": 0.30046308040618896, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0666494369506836, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 31.786553677000484, | |
| "epoch": 0.23166666666666666, | |
| "step": 139 | |
| }, | |
| { | |
| "loss": -0.03009037859737873, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1.525e-06, | |
| "num_tokens": 400412.0, | |
| "completions/mean_length": 141.25, | |
| "completions/min_length": 134.0, | |
| "completions/max_length": 150.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 141.25, | |
| "completions/min_terminated_length": 134.0, | |
| "completions/max_terminated_length": 150.0, | |
| "rewards/reward_fn/mean": 0.4802500009536743, | |
| "rewards/reward_fn/std": 0.32342347502708435, | |
| "reward": 0.4802500009536743, | |
| "reward_std": 0.32342347502708435, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.147861123085022, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.017525466002553, | |
| "epoch": 0.23333333333333334, | |
| "step": 140 | |
| }, | |
| { | |
| "loss": 0.15167391300201416, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 1.5e-06, | |
| "num_tokens": 403278.0, | |
| "completions/mean_length": 129.5, | |
| "completions/min_length": 81.0, | |
| "completions/max_length": 172.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 129.5, | |
| "completions/min_terminated_length": 81.0, | |
| "completions/max_terminated_length": 172.0, | |
| "rewards/reward_fn/mean": 0.8233749866485596, | |
| "rewards/reward_fn/std": 0.2794349491596222, | |
| "reward": 0.8233749866485596, | |
| "reward_std": 0.2794349789619446, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1366225481033325, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.47716004799804, | |
| "epoch": 0.235, | |
| "step": 141 | |
| }, | |
| { | |
| "loss": -0.029183873906731606, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 1.475e-06, | |
| "num_tokens": 405972.0, | |
| "completions/mean_length": 133.5, | |
| "completions/min_length": 123.0, | |
| "completions/max_length": 143.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 133.5, | |
| "completions/min_terminated_length": 123.0, | |
| "completions/max_terminated_length": 143.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.36373066902160645, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.36373066902160645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.071744680404663, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.137350061002508, | |
| "epoch": 0.23666666666666666, | |
| "step": 142 | |
| }, | |
| { | |
| "loss": 0.025060316547751427, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.45e-06, | |
| "num_tokens": 408671.0, | |
| "completions/mean_length": 121.75, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 121.75, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.484624981880188, | |
| "rewards/reward_fn/std": 0.32059407234191895, | |
| "reward": 0.484624981880188, | |
| "reward_std": 0.32059407234191895, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0496933460235596, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.204842587998428, | |
| "epoch": 0.23833333333333334, | |
| "step": 143 | |
| }, | |
| { | |
| "loss": 0.1501941978931427, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.425e-06, | |
| "num_tokens": 411356.0, | |
| "completions/mean_length": 124.25, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 162.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.25, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 162.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0704416036605835, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.816178979999677, | |
| "epoch": 0.24, | |
| "step": 144 | |
| }, | |
| { | |
| "loss": 0.03730739653110504, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "num_tokens": 414443.0, | |
| "completions/mean_length": 125.75, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 136.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.75, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 136.0, | |
| "rewards/reward_fn/mean": 0.918749988079071, | |
| "rewards/reward_fn/std": 0.07110730558633804, | |
| "reward": 0.918749988079071, | |
| "reward_std": 0.07110730558633804, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1473332643508911, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.62395732100049, | |
| "epoch": 0.24166666666666667, | |
| "step": 145 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.3750000000000002e-06, | |
| "num_tokens": 417223.0, | |
| "completions/mean_length": 133.0, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 161.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 133.0, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 161.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1500812768936157, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.47268433299905, | |
| "epoch": 0.24333333333333335, | |
| "step": 146 | |
| }, | |
| { | |
| "loss": 0.08234935998916626, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.3500000000000002e-06, | |
| "num_tokens": 420117.0, | |
| "completions/mean_length": 130.5, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 150.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.5, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 150.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.165224313735962, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.157199847999436, | |
| "epoch": 0.245, | |
| "step": 147 | |
| }, | |
| { | |
| "loss": -0.060243088752031326, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.3250000000000002e-06, | |
| "num_tokens": 423030.0, | |
| "completions/mean_length": 127.25, | |
| "completions/min_length": 82.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.25, | |
| "completions/min_terminated_length": 82.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.4741249978542328, | |
| "rewards/reward_fn/std": 0.32808491587638855, | |
| "reward": 0.4741249978542328, | |
| "reward_std": 0.32808494567871094, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0290559530258179, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.24264987100105, | |
| "epoch": 0.24666666666666667, | |
| "step": 148 | |
| }, | |
| { | |
| "loss": 0.11729130893945694, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.3e-06, | |
| "num_tokens": 425636.0, | |
| "completions/mean_length": 138.5, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 184.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.5, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 184.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1438663005828857, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.72107682299975, | |
| "epoch": 0.24833333333333332, | |
| "step": 149 | |
| }, | |
| { | |
| "loss": 0.023503979668021202, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.275e-06, | |
| "num_tokens": 428360.0, | |
| "completions/mean_length": 160.0, | |
| "completions/min_length": 128.0, | |
| "completions/max_length": 203.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 160.0, | |
| "completions/min_terminated_length": 128.0, | |
| "completions/max_terminated_length": 203.0, | |
| "rewards/reward_fn/mean": 0.49687498807907104, | |
| "rewards/reward_fn/std": 0.3128456473350525, | |
| "reward": 0.49687498807907104, | |
| "reward_std": 0.3128456175327301, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3133922815322876, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.582455317002314, | |
| "epoch": 0.25, | |
| "step": 150 | |
| }, | |
| { | |
| "loss": 0.030243342742323875, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 1.25e-06, | |
| "num_tokens": 431399.0, | |
| "completions/mean_length": 140.75, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 194.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 140.75, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 194.0, | |
| "rewards/reward_fn/mean": 0.3174999952316284, | |
| "rewards/reward_fn/std": 0.014288689009845257, | |
| "reward": 0.3174999952316284, | |
| "reward_std": 0.014288689009845257, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.092279314994812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 28.14050171800045, | |
| "epoch": 0.25166666666666665, | |
| "step": 151 | |
| }, | |
| { | |
| "loss": -0.11270540952682495, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1.2250000000000001e-06, | |
| "num_tokens": 434015.0, | |
| "completions/mean_length": 141.0, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 172.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 141.0, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 172.0, | |
| "rewards/reward_fn/mean": 0.4881249964237213, | |
| "rewards/reward_fn/std": 0.2716260850429535, | |
| "reward": 0.4881249964237213, | |
| "reward_std": 0.2716260850429535, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3847908973693848, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.02353806499741, | |
| "epoch": 0.25333333333333335, | |
| "step": 152 | |
| }, | |
| { | |
| "loss": 0.15968742966651917, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "num_tokens": 436210.0, | |
| "completions/mean_length": 138.75, | |
| "completions/min_length": 93.0, | |
| "completions/max_length": 180.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.75, | |
| "completions/min_terminated_length": 93.0, | |
| "completions/max_terminated_length": 180.0, | |
| "rewards/reward_fn/mean": 0.48375001549720764, | |
| "rewards/reward_fn/std": 0.27466267347335815, | |
| "reward": 0.48375001549720764, | |
| "reward_std": 0.27466264367103577, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2065187692642212, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.63544118199934, | |
| "epoch": 0.255, | |
| "step": 153 | |
| }, | |
| { | |
| "loss": -0.030262991786003113, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.175e-06, | |
| "num_tokens": 439040.0, | |
| "completions/mean_length": 122.5, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 135.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.5, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 135.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0571459531784058, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.32930038700215, | |
| "epoch": 0.25666666666666665, | |
| "step": 154 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.1500000000000002e-06, | |
| "num_tokens": 441942.0, | |
| "completions/mean_length": 135.5, | |
| "completions/min_length": 114.0, | |
| "completions/max_length": 155.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 135.5, | |
| "completions/min_terminated_length": 114.0, | |
| "completions/max_terminated_length": 155.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.005255937576294, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.203776197999105, | |
| "epoch": 0.25833333333333336, | |
| "step": 155 | |
| }, | |
| { | |
| "loss": -0.035824455320835114, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.125e-06, | |
| "num_tokens": 444708.0, | |
| "completions/mean_length": 145.5, | |
| "completions/min_length": 133.0, | |
| "completions/max_length": 156.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 145.5, | |
| "completions/min_terminated_length": 133.0, | |
| "completions/max_terminated_length": 156.0, | |
| "rewards/reward_fn/mean": 0.972000002861023, | |
| "rewards/reward_fn/std": 0.013999998569488525, | |
| "reward": 0.972000002861023, | |
| "reward_std": 0.013999998569488525, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0427701473236084, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.09061835200191, | |
| "epoch": 0.26, | |
| "step": 156 | |
| }, | |
| { | |
| "loss": 0.031955230981111526, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.1e-06, | |
| "num_tokens": 447946.0, | |
| "completions/mean_length": 175.5, | |
| "completions/min_length": 155.0, | |
| "completions/max_length": 202.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 175.5, | |
| "completions/min_terminated_length": 155.0, | |
| "completions/max_terminated_length": 202.0, | |
| "rewards/reward_fn/mean": 0.5012500286102295, | |
| "rewards/reward_fn/std": 0.3096066117286682, | |
| "reward": 0.5012500286102295, | |
| "reward_std": 0.3096066117286682, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2932279109954834, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 30.293572720001976, | |
| "epoch": 0.26166666666666666, | |
| "step": 157 | |
| }, | |
| { | |
| "loss": -0.007350577507168055, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.075e-06, | |
| "num_tokens": 450582.0, | |
| "completions/mean_length": 136.0, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 177.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 136.0, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 177.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.183039665222168, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.09575506499823, | |
| "epoch": 0.2633333333333333, | |
| "step": 158 | |
| }, | |
| { | |
| "loss": -0.083620585501194, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.0500000000000001e-06, | |
| "num_tokens": 453231.0, | |
| "completions/mean_length": 132.25, | |
| "completions/min_length": 110.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.25, | |
| "completions/min_terminated_length": 110.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.8162499666213989, | |
| "rewards/reward_fn/std": 0.32093939185142517, | |
| "reward": 0.8162499666213989, | |
| "reward_std": 0.3209393620491028, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0820621252059937, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.651842983003007, | |
| "epoch": 0.265, | |
| "step": 159 | |
| }, | |
| { | |
| "loss": -0.023693839088082314, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 1.025e-06, | |
| "num_tokens": 456406.0, | |
| "completions/mean_length": 137.75, | |
| "completions/min_length": 101.0, | |
| "completions/max_length": 185.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.75, | |
| "completions/min_terminated_length": 101.0, | |
| "completions/max_terminated_length": 185.0, | |
| "rewards/reward_fn/mean": 0.8162499666213989, | |
| "rewards/reward_fn/std": 0.32093939185142517, | |
| "reward": 0.8162499666213989, | |
| "reward_std": 0.32093939185142517, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2464767694473267, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 27.780609208999522, | |
| "epoch": 0.26666666666666666, | |
| "step": 160 | |
| }, | |
| { | |
| "loss": 0.1468632072210312, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "num_tokens": 459585.0, | |
| "completions/mean_length": 154.75, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 201.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 154.75, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 201.0, | |
| "rewards/reward_fn/mean": 0.6587499976158142, | |
| "rewards/reward_fn/std": 0.3738343119621277, | |
| "reward": 0.6587499976158142, | |
| "reward_std": 0.3738343417644501, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.316744327545166, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 29.671674831999553, | |
| "epoch": 0.2683333333333333, | |
| "step": 161 | |
| }, | |
| { | |
| "loss": -0.060441743582487106, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 9.750000000000002e-07, | |
| "num_tokens": 462603.0, | |
| "completions/mean_length": 113.5, | |
| "completions/min_length": 89.0, | |
| "completions/max_length": 132.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 113.5, | |
| "completions/min_terminated_length": 89.0, | |
| "completions/max_terminated_length": 132.0, | |
| "rewards/reward_fn/mean": 0.32625001668930054, | |
| "rewards/reward_fn/std": 0.010103637352585793, | |
| "reward": 0.32625001668930054, | |
| "reward_std": 0.010103637352585793, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0979673862457275, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.17059165000319, | |
| "epoch": 0.27, | |
| "step": 162 | |
| }, | |
| { | |
| "loss": 0.06180550530552864, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 9.500000000000001e-07, | |
| "num_tokens": 465256.0, | |
| "completions/mean_length": 127.25, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 140.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.25, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 140.0, | |
| "rewards/reward_fn/mean": 0.6543749570846558, | |
| "rewards/reward_fn/std": 0.3688516914844513, | |
| "reward": 0.6543749570846558, | |
| "reward_std": 0.3688516914844513, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1192840337753296, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.585320939000667, | |
| "epoch": 0.27166666666666667, | |
| "step": 163 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.25e-07, | |
| "num_tokens": 467961.0, | |
| "completions/mean_length": 117.25, | |
| "completions/min_length": 92.0, | |
| "completions/max_length": 146.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 117.25, | |
| "completions/min_terminated_length": 92.0, | |
| "completions/max_terminated_length": 146.0, | |
| "rewards/reward_fn/mean": 0.33500000834465027, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.33500000834465027, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.060671091079712, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.4030541309985, | |
| "epoch": 0.2733333333333333, | |
| "step": 164 | |
| }, | |
| { | |
| "loss": 0.12060170620679855, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 9.000000000000001e-07, | |
| "num_tokens": 470142.0, | |
| "completions/mean_length": 122.25, | |
| "completions/min_length": 92.0, | |
| "completions/max_length": 162.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.25, | |
| "completions/min_terminated_length": 92.0, | |
| "completions/max_terminated_length": 162.0, | |
| "rewards/reward_fn/mean": 0.5056250095367432, | |
| "rewards/reward_fn/std": 0.30724838376045227, | |
| "reward": 0.5056250095367432, | |
| "reward_std": 0.30724838376045227, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.010462760925293, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.291043189001357, | |
| "epoch": 0.275, | |
| "step": 165 | |
| }, | |
| { | |
| "loss": -0.0018487756606191397, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 8.75e-07, | |
| "num_tokens": 473312.0, | |
| "completions/mean_length": 171.5, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 237.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 171.5, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 237.0, | |
| "rewards/reward_fn/mean": 0.6617499589920044, | |
| "rewards/reward_fn/std": 0.35750025510787964, | |
| "reward": 0.6617499589920044, | |
| "reward_std": 0.35750025510787964, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3963382244110107, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 34.14518971199868, | |
| "epoch": 0.27666666666666667, | |
| "step": 166 | |
| }, | |
| { | |
| "loss": 0.13671796023845673, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 8.500000000000001e-07, | |
| "num_tokens": 476340.0, | |
| "completions/mean_length": 163.0, | |
| "completions/min_length": 142.0, | |
| "completions/max_length": 208.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 163.0, | |
| "completions/min_terminated_length": 142.0, | |
| "completions/max_terminated_length": 208.0, | |
| "rewards/reward_fn/mean": 0.968500018119812, | |
| "rewards/reward_fn/std": 0.028719918802380562, | |
| "reward": 0.968500018119812, | |
| "reward_std": 0.028719913214445114, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.246614933013916, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 30.15575728999829, | |
| "epoch": 0.2783333333333333, | |
| "step": 167 | |
| }, | |
| { | |
| "loss": 0.023372644558548927, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 8.250000000000001e-07, | |
| "num_tokens": 479566.0, | |
| "completions/mean_length": 137.5, | |
| "completions/min_length": 129.0, | |
| "completions/max_length": 149.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.5, | |
| "completions/min_terminated_length": 129.0, | |
| "completions/max_terminated_length": 149.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2037601470947266, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.651985214000888, | |
| "epoch": 0.28, | |
| "step": 168 | |
| }, | |
| { | |
| "loss": -0.05584796518087387, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 8.000000000000001e-07, | |
| "num_tokens": 482692.0, | |
| "completions/mean_length": 119.5, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 137.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 119.5, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 137.0, | |
| "rewards/reward_fn/mean": 0.3306249976158142, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3306249976158142, | |
| "reward_std": 0.008750001899898052, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 0.9957237243652344, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.86080199800199, | |
| "epoch": 0.2816666666666667, | |
| "step": 169 | |
| }, | |
| { | |
| "loss": 0.013437695801258087, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 7.750000000000001e-07, | |
| "num_tokens": 485351.0, | |
| "completions/mean_length": 112.75, | |
| "completions/min_length": 89.0, | |
| "completions/max_length": 140.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 112.75, | |
| "completions/min_terminated_length": 89.0, | |
| "completions/max_terminated_length": 140.0, | |
| "rewards/reward_fn/mean": 0.6324999928474426, | |
| "rewards/reward_fn/std": 0.34352341294288635, | |
| "reward": 0.6324999928474426, | |
| "reward_std": 0.34352341294288635, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0123518705368042, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.707748422002624, | |
| "epoch": 0.2833333333333333, | |
| "step": 170 | |
| }, | |
| { | |
| "loss": -0.017979849129915237, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 7.5e-07, | |
| "num_tokens": 488211.0, | |
| "completions/mean_length": 141.0, | |
| "completions/min_length": 113.0, | |
| "completions/max_length": 174.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 141.0, | |
| "completions/min_terminated_length": 113.0, | |
| "completions/max_terminated_length": 174.0, | |
| "rewards/reward_fn/mean": 0.7612500190734863, | |
| "rewards/reward_fn/std": 0.2913009822368622, | |
| "reward": 0.7612500190734863, | |
| "reward_std": 0.2913009524345398, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2882589101791382, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.314323123999202, | |
| "epoch": 0.285, | |
| "step": 171 | |
| }, | |
| { | |
| "loss": 0.0024343333207070827, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 7.25e-07, | |
| "num_tokens": 491483.0, | |
| "completions/mean_length": 137.0, | |
| "completions/min_length": 80.0, | |
| "completions/max_length": 178.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.0, | |
| "completions/min_terminated_length": 80.0, | |
| "completions/max_terminated_length": 178.0, | |
| "rewards/reward_fn/mean": 0.7942500114440918, | |
| "rewards/reward_fn/std": 0.30649998784065247, | |
| "reward": 0.7942500114440918, | |
| "reward_std": 0.30650001764297485, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2393285036087036, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.337294665998343, | |
| "epoch": 0.2866666666666667, | |
| "step": 172 | |
| }, | |
| { | |
| "loss": -0.1553015410900116, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 7.000000000000001e-07, | |
| "num_tokens": 494264.0, | |
| "completions/mean_length": 127.25, | |
| "completions/min_length": 98.0, | |
| "completions/max_length": 167.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.25, | |
| "completions/min_terminated_length": 98.0, | |
| "completions/max_terminated_length": 167.0, | |
| "rewards/reward_fn/mean": 0.34375, | |
| "rewards/reward_fn/std": 0.017499998211860657, | |
| "reward": 0.34375, | |
| "reward_std": 0.017500003799796104, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0118447542190552, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.38981057300043, | |
| "epoch": 0.28833333333333333, | |
| "step": 173 | |
| }, | |
| { | |
| "loss": 0.047720227390527725, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 6.750000000000001e-07, | |
| "num_tokens": 497373.0, | |
| "completions/mean_length": 132.25, | |
| "completions/min_length": 103.0, | |
| "completions/max_length": 147.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.25, | |
| "completions/min_terminated_length": 103.0, | |
| "completions/max_terminated_length": 147.0, | |
| "rewards/reward_fn/mean": 0.8118749856948853, | |
| "rewards/reward_fn/std": 0.2953908145427704, | |
| "reward": 0.8118749856948853, | |
| "reward_std": 0.2953908145427704, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3742601871490479, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.009925050002494, | |
| "epoch": 0.29, | |
| "step": 174 | |
| }, | |
| { | |
| "loss": 0.11253699660301208, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 6.5e-07, | |
| "num_tokens": 500407.0, | |
| "completions/mean_length": 158.5, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 239.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 158.5, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 239.0, | |
| "rewards/reward_fn/mean": 0.9673749804496765, | |
| "rewards/reward_fn/std": 0.010765498504042625, | |
| "reward": 0.9673749804496765, | |
| "reward_std": 0.010765510611236095, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1549588441848755, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 34.68952643800003, | |
| "epoch": 0.2916666666666667, | |
| "step": 175 | |
| }, | |
| { | |
| "loss": 0.11750662326812744, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 6.25e-07, | |
| "num_tokens": 503025.0, | |
| "completions/mean_length": 142.5, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 176.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 142.5, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 176.0, | |
| "rewards/reward_fn/mean": 0.8075000047683716, | |
| "rewards/reward_fn/std": 0.3149999976158142, | |
| "reward": 0.8075000047683716, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1769404411315918, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.404072473997076, | |
| "epoch": 0.29333333333333333, | |
| "step": 176 | |
| }, | |
| { | |
| "loss": -0.14939624071121216, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 6.000000000000001e-07, | |
| "num_tokens": 505870.0, | |
| "completions/mean_length": 116.25, | |
| "completions/min_length": 86.0, | |
| "completions/max_length": 151.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 116.25, | |
| "completions/min_terminated_length": 86.0, | |
| "completions/max_terminated_length": 151.0, | |
| "rewards/reward_fn/mean": 0.484624981880188, | |
| "rewards/reward_fn/std": 0.32059407234191895, | |
| "reward": 0.484624981880188, | |
| "reward_std": 0.32059407234191895, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.045095682144165, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.14946024099845, | |
| "epoch": 0.295, | |
| "step": 177 | |
| }, | |
| { | |
| "loss": 0.1224709302186966, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 5.750000000000001e-07, | |
| "num_tokens": 508679.0, | |
| "completions/mean_length": 138.25, | |
| "completions/min_length": 104.0, | |
| "completions/max_length": 178.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 138.25, | |
| "completions/min_terminated_length": 104.0, | |
| "completions/max_terminated_length": 178.0, | |
| "rewards/reward_fn/mean": 0.3218750059604645, | |
| "rewards/reward_fn/std": 0.008750006556510925, | |
| "reward": 0.3218750059604645, | |
| "reward_std": 0.008750011213123798, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0988469123840332, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.35802866800077, | |
| "epoch": 0.2966666666666667, | |
| "step": 178 | |
| }, | |
| { | |
| "loss": 0.007333819754421711, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 5.5e-07, | |
| "num_tokens": 511573.0, | |
| "completions/mean_length": 124.5, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 150.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.5, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 150.0, | |
| "rewards/reward_fn/mean": 0.48375001549720764, | |
| "rewards/reward_fn/std": 0.3209393620491028, | |
| "reward": 0.48375001549720764, | |
| "reward_std": 0.3209393620491028, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0090521574020386, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.760440566002217, | |
| "epoch": 0.29833333333333334, | |
| "step": 179 | |
| }, | |
| { | |
| "loss": 0.03865119442343712, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 5.250000000000001e-07, | |
| "num_tokens": 514181.0, | |
| "completions/mean_length": 112.0, | |
| "completions/min_length": 91.0, | |
| "completions/max_length": 143.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 112.0, | |
| "completions/min_terminated_length": 91.0, | |
| "completions/max_terminated_length": 143.0, | |
| "rewards/reward_fn/mean": 0.6499999761581421, | |
| "rewards/reward_fn/std": 0.36373066902160645, | |
| "reward": 0.6499999761581421, | |
| "reward_std": 0.36373066902160645, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2578190565109253, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.571868511000503, | |
| "epoch": 0.3, | |
| "step": 180 | |
| }, | |
| { | |
| "loss": 0.05512908101081848, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.000000000000001e-07, | |
| "num_tokens": 517096.0, | |
| "completions/mean_length": 124.75, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 146.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 124.75, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 146.0, | |
| "rewards/reward_fn/mean": 0.6412500143051147, | |
| "rewards/reward_fn/std": 0.35391557216644287, | |
| "reward": 0.6412500143051147, | |
| "reward_std": 0.35391560196876526, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.3287837505340576, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.837761578000936, | |
| "epoch": 0.3016666666666667, | |
| "step": 181 | |
| }, | |
| { | |
| "loss": -0.10974273085594177, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.7500000000000006e-07, | |
| "num_tokens": 520018.0, | |
| "completions/mean_length": 116.5, | |
| "completions/min_length": 89.0, | |
| "completions/max_length": 152.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 116.5, | |
| "completions/min_terminated_length": 89.0, | |
| "completions/max_terminated_length": 152.0, | |
| "rewards/reward_fn/mean": 0.8136249780654907, | |
| "rewards/reward_fn/std": 0.29675617814064026, | |
| "reward": 0.8136249780654907, | |
| "reward_std": 0.29675617814064026, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0346094369888306, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.461976462000166, | |
| "epoch": 0.30333333333333334, | |
| "step": 182 | |
| }, | |
| { | |
| "loss": 0.042166367173194885, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.5000000000000003e-07, | |
| "num_tokens": 522491.0, | |
| "completions/mean_length": 127.25, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 138.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 127.25, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 138.0, | |
| "rewards/reward_fn/mean": 0.3062500059604645, | |
| "rewards/reward_fn/std": 0.05750000849366188, | |
| "reward": 0.3062500059604645, | |
| "reward_std": 0.057500001043081284, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0929187536239624, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.686902885001473, | |
| "epoch": 0.305, | |
| "step": 183 | |
| }, | |
| { | |
| "loss": 0.02638399973511696, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.2500000000000006e-07, | |
| "num_tokens": 525490.0, | |
| "completions/mean_length": 107.75, | |
| "completions/min_length": 95.0, | |
| "completions/max_length": 122.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 107.75, | |
| "completions/min_terminated_length": 95.0, | |
| "completions/max_terminated_length": 122.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0409756898880005, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 18.76558146499883, | |
| "epoch": 0.30666666666666664, | |
| "step": 184 | |
| }, | |
| { | |
| "loss": -0.023447571322321892, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.0000000000000003e-07, | |
| "num_tokens": 528287.0, | |
| "completions/mean_length": 136.25, | |
| "completions/min_length": 120.0, | |
| "completions/max_length": 159.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 136.25, | |
| "completions/min_terminated_length": 120.0, | |
| "completions/max_terminated_length": 159.0, | |
| "rewards/reward_fn/mean": 0.6631250381469727, | |
| "rewards/reward_fn/std": 0.3590344488620758, | |
| "reward": 0.6631250381469727, | |
| "reward_std": 0.3590344488620758, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1596969366073608, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 23.50510012199811, | |
| "epoch": 0.30833333333333335, | |
| "step": 185 | |
| }, | |
| { | |
| "loss": -0.08003614097833633, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 3.75e-07, | |
| "num_tokens": 531576.0, | |
| "completions/mean_length": 130.25, | |
| "completions/min_length": 107.0, | |
| "completions/max_length": 168.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 130.25, | |
| "completions/min_terminated_length": 107.0, | |
| "completions/max_terminated_length": 168.0, | |
| "rewards/reward_fn/mean": 0.9562499523162842, | |
| "rewards/reward_fn/std": 0.041658345609903336, | |
| "reward": 0.9562499523162842, | |
| "reward_std": 0.04165836051106453, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1854372024536133, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.317642065001564, | |
| "epoch": 0.31, | |
| "step": 186 | |
| }, | |
| { | |
| "loss": -0.11360026895999908, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 3.5000000000000004e-07, | |
| "num_tokens": 534248.0, | |
| "completions/mean_length": 132.0, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 162.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 132.0, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 162.0, | |
| "rewards/reward_fn/mean": 0.4925000071525574, | |
| "rewards/reward_fn/std": 0.3149999678134918, | |
| "reward": 0.4925000071525574, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1301367282867432, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.889422044998355, | |
| "epoch": 0.31166666666666665, | |
| "step": 187 | |
| }, | |
| { | |
| "loss": 0.06846462935209274, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.25e-07, | |
| "num_tokens": 536917.0, | |
| "completions/mean_length": 137.25, | |
| "completions/min_length": 84.0, | |
| "completions/max_length": 170.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 137.25, | |
| "completions/min_terminated_length": 84.0, | |
| "completions/max_terminated_length": 170.0, | |
| "rewards/reward_fn/mean": 0.6456249952316284, | |
| "rewards/reward_fn/std": 0.35874998569488525, | |
| "reward": 0.6456249952316284, | |
| "reward_std": 0.35874998569488525, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.198452115058899, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 24.68997424599729, | |
| "epoch": 0.31333333333333335, | |
| "step": 188 | |
| }, | |
| { | |
| "loss": 0.2711350619792938, | |
| "grad_norm": 0.375, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "num_tokens": 539912.0, | |
| "completions/mean_length": 140.75, | |
| "completions/min_length": 99.0, | |
| "completions/max_length": 249.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 140.75, | |
| "completions/min_terminated_length": 99.0, | |
| "completions/max_terminated_length": 249.0, | |
| "rewards/reward_fn/mean": 0.9668749570846558, | |
| "rewards/reward_fn/std": 0.01143370196223259, | |
| "reward": 0.9668749570846558, | |
| "reward_std": 0.01143370196223259, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.196223497390747, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 35.739375315999496, | |
| "epoch": 0.315, | |
| "step": 189 | |
| }, | |
| { | |
| "loss": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.75e-07, | |
| "num_tokens": 543321.0, | |
| "completions/mean_length": 147.25, | |
| "completions/min_length": 131.0, | |
| "completions/max_length": 173.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 147.25, | |
| "completions/min_terminated_length": 131.0, | |
| "completions/max_terminated_length": 173.0, | |
| "rewards/reward_fn/mean": 0.9649999737739563, | |
| "rewards/reward_fn/std": 0.0, | |
| "reward": 0.9649999737739563, | |
| "reward_std": 0.0, | |
| "frac_reward_zero_std": 1.0, | |
| "entropy": 1.1819069385528564, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.15833759999805, | |
| "epoch": 0.31666666666666665, | |
| "step": 190 | |
| }, | |
| { | |
| "loss": 0.04181407764554024, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "num_tokens": 546288.0, | |
| "completions/mean_length": 150.75, | |
| "completions/min_length": 129.0, | |
| "completions/max_length": 174.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 150.75, | |
| "completions/min_terminated_length": 129.0, | |
| "completions/max_terminated_length": 174.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2341399192810059, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.681773741998768, | |
| "epoch": 0.31833333333333336, | |
| "step": 191 | |
| }, | |
| { | |
| "loss": -0.02491738647222519, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.2500000000000002e-07, | |
| "num_tokens": 549370.0, | |
| "completions/mean_length": 165.5, | |
| "completions/min_length": 101.0, | |
| "completions/max_length": 268.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 165.5, | |
| "completions/min_terminated_length": 101.0, | |
| "completions/max_terminated_length": 268.0, | |
| "rewards/reward_fn/mean": 0.8023749589920044, | |
| "rewards/reward_fn/std": 0.3117331564426422, | |
| "reward": 0.8023749589920044, | |
| "reward_std": 0.3117331564426422, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.2476222515106201, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 38.91904668999996, | |
| "epoch": 0.32, | |
| "step": 192 | |
| }, | |
| { | |
| "loss": -0.012936650775372982, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "num_tokens": 552399.0, | |
| "completions/mean_length": 125.25, | |
| "completions/min_length": 115.0, | |
| "completions/max_length": 135.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.25, | |
| "completions/min_terminated_length": 115.0, | |
| "completions/max_terminated_length": 135.0, | |
| "rewards/reward_fn/mean": 0.9474999904632568, | |
| "rewards/reward_fn/std": 0.034999996423721313, | |
| "reward": 0.9474999904632568, | |
| "reward_std": 0.03500000759959221, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.207331895828247, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.57561057500061, | |
| "epoch": 0.32166666666666666, | |
| "step": 193 | |
| }, | |
| { | |
| "loss": 0.08878301829099655, | |
| "grad_norm": 0.375, | |
| "learning_rate": 1.7500000000000002e-07, | |
| "num_tokens": 555579.0, | |
| "completions/mean_length": 125.0, | |
| "completions/min_length": 110.0, | |
| "completions/max_length": 145.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.0, | |
| "completions/min_terminated_length": 110.0, | |
| "completions/max_terminated_length": 145.0, | |
| "rewards/reward_fn/mean": 0.6631250381469727, | |
| "rewards/reward_fn/std": 0.3590344488620758, | |
| "reward": 0.6631250381469727, | |
| "reward_std": 0.3590344190597534, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.171470046043396, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.777231099997152, | |
| "epoch": 0.3233333333333333, | |
| "step": 194 | |
| }, | |
| { | |
| "loss": -0.06446050107479095, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "num_tokens": 558548.0, | |
| "completions/mean_length": 125.25, | |
| "completions/min_length": 109.0, | |
| "completions/max_length": 146.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 125.25, | |
| "completions/min_terminated_length": 109.0, | |
| "completions/max_terminated_length": 146.0, | |
| "rewards/reward_fn/mean": 0.32712501287460327, | |
| "rewards/reward_fn/std": 0.015750005841255188, | |
| "reward": 0.32712501287460327, | |
| "reward_std": 0.015750011429190636, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.1294211149215698, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 22.614405715998146, | |
| "epoch": 0.325, | |
| "step": 195 | |
| }, | |
| { | |
| "loss": 0.06022682413458824, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 1.2500000000000002e-07, | |
| "num_tokens": 561567.0, | |
| "completions/mean_length": 119.75, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 137.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 119.75, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 137.0, | |
| "rewards/reward_fn/mean": 0.9667500257492065, | |
| "rewards/reward_fn/std": 0.011608189903199673, | |
| "reward": 0.9667500257492065, | |
| "reward_std": 0.011608189903199673, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0905414819717407, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 20.814484424998227, | |
| "epoch": 0.32666666666666666, | |
| "step": 196 | |
| }, | |
| { | |
| "loss": 0.025765087455511093, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.0000000000000001e-07, | |
| "num_tokens": 564581.0, | |
| "completions/mean_length": 145.5, | |
| "completions/min_length": 123.0, | |
| "completions/max_length": 177.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 145.5, | |
| "completions/min_terminated_length": 123.0, | |
| "completions/max_terminated_length": 177.0, | |
| "rewards/reward_fn/mean": 0.8075000047683716, | |
| "rewards/reward_fn/std": 0.3149999976158142, | |
| "reward": 0.8075000047683716, | |
| "reward_std": 0.3149999976158142, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0498154163360596, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 26.312665902001754, | |
| "epoch": 0.3283333333333333, | |
| "step": 197 | |
| }, | |
| { | |
| "loss": 0.010705118998885155, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 7.500000000000001e-08, | |
| "num_tokens": 567648.0, | |
| "completions/mean_length": 119.75, | |
| "completions/min_length": 106.0, | |
| "completions/max_length": 140.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 119.75, | |
| "completions/min_terminated_length": 106.0, | |
| "completions/max_terminated_length": 140.0, | |
| "rewards/reward_fn/mean": 0.9772499799728394, | |
| "rewards/reward_fn/std": 0.015256168320775032, | |
| "reward": 0.9772499799728394, | |
| "reward_std": 0.015256163664162159, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0515799522399902, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.37977665500148, | |
| "epoch": 0.33, | |
| "step": 198 | |
| }, | |
| { | |
| "loss": -0.1868162900209427, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 5.0000000000000004e-08, | |
| "num_tokens": 570646.0, | |
| "completions/mean_length": 122.5, | |
| "completions/min_length": 97.0, | |
| "completions/max_length": 170.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 122.5, | |
| "completions/min_terminated_length": 97.0, | |
| "completions/max_terminated_length": 170.0, | |
| "rewards/reward_fn/mean": 0.34812501072883606, | |
| "rewards/reward_fn/std": 0.03880372643470764, | |
| "reward": 0.34812501072883606, | |
| "reward_std": 0.03880372270941734, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0080755949020386, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 25.808188715000142, | |
| "epoch": 0.33166666666666667, | |
| "step": 199 | |
| }, | |
| { | |
| "loss": 0.09302064031362534, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.5000000000000002e-08, | |
| "num_tokens": 573155.0, | |
| "completions/mean_length": 118.25, | |
| "completions/min_length": 96.0, | |
| "completions/max_length": 144.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/mean_terminated_length": 118.25, | |
| "completions/min_terminated_length": 96.0, | |
| "completions/max_terminated_length": 144.0, | |
| "rewards/reward_fn/mean": 0.9693750143051147, | |
| "rewards/reward_fn/std": 0.00875002145767212, | |
| "reward": 0.9693750143051147, | |
| "reward_std": 0.00875002145767212, | |
| "frac_reward_zero_std": 0.0, | |
| "entropy": 1.0755765438079834, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "step_time": 21.597608337000565, | |
| "epoch": 0.3333333333333333, | |
| "step": 200 | |
| }, | |
| { | |
| "train_runtime": 5010.8646, | |
| "train_samples_per_second": 0.16, | |
| "train_steps_per_second": 0.04, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0033340772934025154, | |
| "epoch": 0.3333333333333333, | |
| "step": 200 | |
| } | |
| ] |