{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 25.0, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 193.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 103.975, "completions/mean_terminated_length": 90.39881134033203, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.1820149033330381, "epoch": 2.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.25, "learning_rate": 1.0000000000000002e-06, "loss": 0.0794088900089264, "num_tokens": 14355.0, "reward": 0.15020800828933717, "reward_std": 0.6376187483081595, "rewards/reward_fn/mean": 0.15020800828933717, "rewards/reward_fn/std": 0.6376187764341011, "step": 5, "step_time": 30.522303848797673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 151.2, "completions/max_terminated_length": 141.6, "completions/mean_length": 80.075, "completions/mean_terminated_length": 77.78928833007812, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "entropy": 0.16221200795844198, "epoch": 5.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.228515625, "learning_rate": 2.25e-06, "loss": 0.07169516086578369, "num_tokens": 27462.0, "reward": 0.4012819856405258, "reward_std": 0.4128362699819263, "rewards/reward_fn/mean": 0.4012819856405258, "rewards/reward_fn/std": 0.4128363010211615, "step": 10, "step_time": 25.01976956339822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 185.6, "completions/max_terminated_length": 171.6, "completions/mean_length": 83.65, "completions/mean_terminated_length": 79.14285736083984, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "entropy": 0.14225535104051232, "epoch": 7.5, "frac_reward_zero_std": 0.3, "grad_norm": 0.265625, "learning_rate": 3.5e-06, "loss": 0.0717179834842682, "num_tokens": 40740.0, "reward": 0.10207997858524323, "reward_std": 0.7454913818277419, "rewards/reward_fn/mean": 0.10207997858524323, "rewards/reward_fn/std": 0.7454914333298802, "step": 15, "step_time": 29.626422570000432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 202.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 98.85, "completions/mean_terminated_length": 81.79285888671875, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "entropy": 0.1534867493668571, "epoch": 10.0, "frac_reward_zero_std": 0.4, "grad_norm": 0.2431640625, "learning_rate": 4.75e-06, "loss": 0.09726614952087402, "num_tokens": 54862.0, "reward": 0.25181599259376525, "reward_std": 0.6045640033902601, "rewards/reward_fn/mean": 0.25181599259376525, "rewards/reward_fn/std": 0.6045640454394743, "step": 20, "step_time": 31.625062462999267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 233.6, "completions/max_terminated_length": 194.4, "completions/mean_length": 99.525, "completions/mean_terminated_length": 77.78690643310547, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "entropy": 0.1538564210291952, "epoch": 12.5, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 4.981481481481482e-06, "loss": 0.13929661512374877, "num_tokens": 69039.0, "reward": 0.0006859898567199707, "reward_std": 1.0108385920524596, "rewards/reward_fn/mean": 0.0006859898567199707, "rewards/reward_fn/std": 1.0108386158943177, "step": 25, "step_time": 35.65783783460065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 181.8, "completions/max_terminated_length": 153.0, "completions/mean_length": 65.05, "completions/mean_terminated_length": 56.03928680419922, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "entropy": 0.13224927680566906, "epoch": 15.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.1875, "learning_rate": 4.958333333333334e-06, "loss": -0.003383058309555054, "num_tokens": 81545.0, "reward": 0.3994179755449295, "reward_std": 0.5622525057464373, "rewards/reward_fn/mean": 0.3994179755449295, "rewards/reward_fn/std": 0.56225254482124, "step": 30, "step_time": 29.112151033400732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 131.0, "completions/max_terminated_length": 128.6, "completions/mean_length": 61.075, "completions/mean_terminated_length": 53.282144165039064, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "entropy": 0.11974610288161784, "epoch": 17.5, "frac_reward_zero_std": 0.7, "grad_norm": 0.220703125, "learning_rate": 4.935185185185186e-06, "loss": 0.004663025587797165, "num_tokens": 93892.0, "reward": 0.3469119846820831, "reward_std": 0.5775202971824911, "rewards/reward_fn/mean": 0.3469119846820831, "rewards/reward_fn/std": 0.5775203009106917, "step": 35, "step_time": 22.457519923200017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 179.6, "completions/max_terminated_length": 131.2, "completions/mean_length": 78.025, "completions/mean_terminated_length": 65.70357513427734, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.12550681543070824, "epoch": 20.0, "frac_reward_zero_std": 0.4, "grad_norm": 0.4609375, "learning_rate": 4.9120370370370375e-06, "loss": 0.036792796850204465, "num_tokens": 107209.0, "reward": 0.3498039901256561, "reward_std": 0.7408117946935817, "rewards/reward_fn/mean": 0.3498039901256561, "rewards/reward_fn/std": 0.7408118456369266, "step": 40, "step_time": 28.692756705999635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 189.0, "completions/max_terminated_length": 178.8, "completions/mean_length": 92.675, "completions/mean_terminated_length": 89.51071472167969, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "entropy": 0.15165529411751777, "epoch": 22.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.232421875, "learning_rate": 4.888888888888889e-06, "loss": -0.004727205634117127, "num_tokens": 120868.0, "reward": 0.30072798430919645, "reward_std": 0.7614144545921591, "rewards/reward_fn/mean": 0.30072798430919645, "rewards/reward_fn/std": 0.7614145380415721, "step": 45, "step_time": 30.053675796201425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 208.6, "completions/max_terminated_length": 189.6, "completions/mean_length": 102.625, "completions/mean_terminated_length": 90.11666870117188, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "entropy": 0.15195838457439095, "epoch": 25.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.208984375, "learning_rate": 4.865740740740741e-06, "loss": 0.04500017166137695, "num_tokens": 135121.0, "reward": 0.5009119868278503, "reward_std": 0.5213470441231038, "rewards/reward_fn/mean": 0.5009119868278503, "rewards/reward_fn/std": 0.5213470560469432, "step": 50, "step_time": 32.55890014459801 } ], "logging_steps": 5, "max_steps": 1100, "num_input_tokens_seen": 135121, "num_train_epochs": 550, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }