{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 25.0,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1,
      "completions/max_length": 193.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 103.975,
      "completions/mean_terminated_length": 90.39881134033203,
      "completions/min_length": 17.6,
      "completions/min_terminated_length": 17.6,
      "entropy": 0.1820149033330381,
      "epoch": 2.5,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.25,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0794088900089264,
      "num_tokens": 14355.0,
      "reward": 0.15020800828933717,
      "reward_std": 0.6376187483081595,
      "rewards/reward_fn/mean": 0.15020800828933717,
      "rewards/reward_fn/std": 0.6376187764341011,
      "step": 5,
      "step_time": 30.522303848797673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 151.2,
      "completions/max_terminated_length": 141.6,
      "completions/mean_length": 80.075,
      "completions/mean_terminated_length": 77.78928833007812,
      "completions/min_length": 17.8,
      "completions/min_terminated_length": 17.8,
      "entropy": 0.16221200795844198,
      "epoch": 5.0,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.228515625,
      "learning_rate": 2.25e-06,
      "loss": 0.07169516086578369,
      "num_tokens": 27462.0,
      "reward": 0.4012819856405258,
      "reward_std": 0.4128362699819263,
      "rewards/reward_fn/mean": 0.4012819856405258,
      "rewards/reward_fn/std": 0.4128363010211615,
      "step": 10,
      "step_time": 25.01976956339822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 185.6,
      "completions/max_terminated_length": 171.6,
      "completions/mean_length": 83.65,
      "completions/mean_terminated_length": 79.14285736083984,
      "completions/min_length": 16.6,
      "completions/min_terminated_length": 16.6,
      "entropy": 0.14225535104051232,
      "epoch": 7.5,
      "frac_reward_zero_std": 0.3,
      "grad_norm": 0.265625,
      "learning_rate": 3.5e-06,
      "loss": 0.0717179834842682,
      "num_tokens": 40740.0,
      "reward": 0.10207997858524323,
      "reward_std": 0.7454913818277419,
      "rewards/reward_fn/mean": 0.10207997858524323,
      "rewards/reward_fn/std": 0.7454914333298802,
      "step": 15,
      "step_time": 29.626422570000432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1,
      "completions/max_length": 202.0,
      "completions/max_terminated_length": 163.0,
      "completions/mean_length": 98.85,
      "completions/mean_terminated_length": 81.79285888671875,
      "completions/min_length": 19.2,
      "completions/min_terminated_length": 19.2,
      "entropy": 0.1534867493668571,
      "epoch": 10.0,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 0.2431640625,
      "learning_rate": 4.75e-06,
      "loss": 0.09726614952087402,
      "num_tokens": 54862.0,
      "reward": 0.25181599259376525,
      "reward_std": 0.6045640033902601,
      "rewards/reward_fn/mean": 0.25181599259376525,
      "rewards/reward_fn/std": 0.6045640454394743,
      "step": 20,
      "step_time": 31.625062462999267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 233.6,
      "completions/max_terminated_length": 194.4,
      "completions/mean_length": 99.525,
      "completions/mean_terminated_length": 77.78690643310547,
      "completions/min_length": 18.8,
      "completions/min_terminated_length": 18.8,
      "entropy": 0.1538564210291952,
      "epoch": 12.5,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.0,
      "learning_rate": 4.981481481481482e-06,
      "loss": 0.13929661512374877,
      "num_tokens": 69039.0,
      "reward": 0.0006859898567199707,
      "reward_std": 1.0108385920524596,
      "rewards/reward_fn/mean": 0.0006859898567199707,
      "rewards/reward_fn/std": 1.0108386158943177,
      "step": 25,
      "step_time": 35.65783783460065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 181.8,
      "completions/max_terminated_length": 153.0,
      "completions/mean_length": 65.05,
      "completions/mean_terminated_length": 56.03928680419922,
      "completions/min_length": 16.6,
      "completions/min_terminated_length": 16.6,
      "entropy": 0.13224927680566906,
      "epoch": 15.0,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1875,
      "learning_rate": 4.958333333333334e-06,
      "loss": -0.003383058309555054,
      "num_tokens": 81545.0,
      "reward": 0.3994179755449295,
      "reward_std": 0.5622525057464373,
      "rewards/reward_fn/mean": 0.3994179755449295,
      "rewards/reward_fn/std": 0.56225254482124,
      "step": 30,
      "step_time": 29.112151033400732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 131.0,
      "completions/max_terminated_length": 128.6,
      "completions/mean_length": 61.075,
      "completions/mean_terminated_length": 53.282144165039064,
      "completions/min_length": 17.6,
      "completions/min_terminated_length": 17.6,
      "entropy": 0.11974610288161784,
      "epoch": 17.5,
      "frac_reward_zero_std": 0.7,
      "grad_norm": 0.220703125,
      "learning_rate": 4.935185185185186e-06,
      "loss": 0.004663025587797165,
      "num_tokens": 93892.0,
      "reward": 0.3469119846820831,
      "reward_std": 0.5775202971824911,
      "rewards/reward_fn/mean": 0.3469119846820831,
      "rewards/reward_fn/std": 0.5775203009106917,
      "step": 35,
      "step_time": 22.457519923200017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 179.6,
      "completions/max_terminated_length": 131.2,
      "completions/mean_length": 78.025,
      "completions/mean_terminated_length": 65.70357513427734,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.12550681543070824,
      "epoch": 20.0,
      "frac_reward_zero_std": 0.4,
      "grad_norm": 0.4609375,
      "learning_rate": 4.9120370370370375e-06,
      "loss": 0.036792796850204465,
      "num_tokens": 107209.0,
      "reward": 0.3498039901256561,
      "reward_std": 0.7408117946935817,
      "rewards/reward_fn/mean": 0.3498039901256561,
      "rewards/reward_fn/std": 0.7408118456369266,
      "step": 40,
      "step_time": 28.692756705999635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025,
      "completions/max_length": 189.0,
      "completions/max_terminated_length": 178.8,
      "completions/mean_length": 92.675,
      "completions/mean_terminated_length": 89.51071472167969,
      "completions/min_length": 17.8,
      "completions/min_terminated_length": 17.8,
      "entropy": 0.15165529411751777,
      "epoch": 22.5,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.232421875,
      "learning_rate": 4.888888888888889e-06,
      "loss": -0.004727205634117127,
      "num_tokens": 120868.0,
      "reward": 0.30072798430919645,
      "reward_std": 0.7614144545921591,
      "rewards/reward_fn/mean": 0.30072798430919645,
      "rewards/reward_fn/std": 0.7614145380415721,
      "step": 45,
      "step_time": 30.053675796201425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.075,
      "completions/max_length": 208.6,
      "completions/max_terminated_length": 189.6,
      "completions/mean_length": 102.625,
      "completions/mean_terminated_length": 90.11666870117188,
      "completions/min_length": 18.8,
      "completions/min_terminated_length": 18.8,
      "entropy": 0.15195838457439095,
      "epoch": 25.0,
      "frac_reward_zero_std": 0.6,
      "grad_norm": 0.208984375,
      "learning_rate": 4.865740740740741e-06,
      "loss": 0.04500017166137695,
      "num_tokens": 135121.0,
      "reward": 0.5009119868278503,
      "reward_std": 0.5213470441231038,
      "rewards/reward_fn/mean": 0.5009119868278503,
      "rewards/reward_fn/std": 0.5213470560469432,
      "step": 50,
      "step_time": 32.55890014459801
    }
  ],
  "logging_steps": 5,
  "max_steps": 1100,
  "num_input_tokens_seen": 135121,
  "num_train_epochs": 550,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}