{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4011299435028249,
  "eval_steps": 500,
  "global_step": 213,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 290.59375,
      "completions/mean_terminated_length": 271.83050537109375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 2.203125,
      "epoch": 0.0018832391713747645,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8694089651107788,
      "learning_rate": 1e-06,
      "loss": -0.0086,
      "num_tokens": 140262.0,
      "reward": 0.206417053937912,
      "reward_std": 0.12193800508975983,
      "rewards/acc_reward/mean": 0.19810229539871216,
      "rewards/acc_reward/std": 0.28156620264053345,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 247.734375,
      "completions/mean_terminated_length": 243.53970336914062,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "entropy": 1.6796875,
      "epoch": 0.003766478342749529,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8876559734344482,
      "learning_rate": 9.981167608286253e-07,
      "loss": 0.0222,
      "num_tokens": 273701.0,
      "reward": 0.41670364141464233,
      "reward_std": 0.150786355137825,
      "rewards/acc_reward/mean": 0.42307350039482117,
      "rewards/acc_reward/std": 0.29976290464401245,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 236.046875,
      "completions/mean_terminated_length": 236.046875,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 1.6640625,
      "epoch": 0.005649717514124294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8232742547988892,
      "learning_rate": 9.962335216572504e-07,
      "loss": 0.0698,
      "num_tokens": 396392.0,
      "reward": 0.4056413769721985,
      "reward_std": 0.17224639654159546,
      "rewards/acc_reward/mean": 0.3864765465259552,
      "rewards/acc_reward/std": 0.3097887337207794,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 227.8125,
      "completions/mean_terminated_length": 227.8125,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 1.6484375,
      "epoch": 0.007532956685499058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0226917266845703,
      "learning_rate": 9.943502824858757e-07,
      "loss": -0.0298,
      "num_tokens": 523036.0,
      "reward": 0.3570261001586914,
      "reward_std": 0.12964516878128052,
      "rewards/acc_reward/mean": 0.33766794204711914,
      "rewards/acc_reward/std": 0.35702335834503174,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 414.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 214.515625,
      "completions/mean_terminated_length": 214.515625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 1.8515625,
      "epoch": 0.009416195856873822,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8740723133087158,
      "learning_rate": 9.92467043314501e-07,
      "loss": -0.0099,
      "num_tokens": 649309.0,
      "reward": 0.4160749912261963,
      "reward_std": 0.13374584913253784,
      "rewards/acc_reward/mean": 0.37723612785339355,
      "rewards/acc_reward/std": 0.31442132592201233,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 266.046875,
      "completions/mean_terminated_length": 258.1129150390625,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "entropy": 1.796875,
      "epoch": 0.011299435028248588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7314085960388184,
      "learning_rate": 9.905838041431261e-07,
      "loss": 0.1098,
      "num_tokens": 788480.0,
      "reward": 0.2147485464811325,
      "reward_std": 0.1807367354631424,
      "rewards/acc_reward/mean": 0.15006783604621887,
      "rewards/acc_reward/std": 0.3153360188007355,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 232.5625,
      "completions/mean_terminated_length": 232.5625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "entropy": 1.84375,
      "epoch": 0.013182674199623353,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.3759692907333374,
      "learning_rate": 9.887005649717514e-07,
      "loss": 0.0142,
      "num_tokens": 923788.0,
      "reward": 0.21130093932151794,
      "reward_std": 0.14664816856384277,
      "rewards/acc_reward/mean": 0.12713992595672607,
      "rewards/acc_reward/std": 0.2867279052734375,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 461.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 256.796875,
      "completions/mean_terminated_length": 256.796875,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "entropy": 1.6484375,
      "epoch": 0.015065913370998116,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.6152563095092773,
      "learning_rate": 9.868173258003765e-07,
      "loss": 0.0757,
      "num_tokens": 1062207.0,
      "reward": 0.3737403452396393,
      "reward_std": 0.14178410172462463,
      "rewards/acc_reward/mean": 0.30589205026626587,
      "rewards/acc_reward/std": 0.3208266496658325,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 255.265625,
      "completions/mean_terminated_length": 238.15000915527344,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "entropy": 1.765625,
      "epoch": 0.01694915254237288,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.4234596490859985,
      "learning_rate": 9.849340866290019e-07,
      "loss": 0.0147,
      "num_tokens": 1205848.0,
      "reward": 0.3045212924480438,
      "reward_std": 0.1969890296459198,
      "rewards/acc_reward/mean": 0.22898197174072266,
      "rewards/acc_reward/std": 0.3434719443321228,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 487.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 224.8125,
      "completions/mean_terminated_length": 224.8125,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 1.8203125,
      "epoch": 0.018832391713747645,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.4884109497070312,
      "learning_rate": 9.830508474576272e-07,
      "loss": -0.0137,
      "num_tokens": 1330620.0,
      "reward": 0.26978251338005066,
      "reward_std": 0.11548593640327454,
      "rewards/acc_reward/mean": 0.19038332998752594,
      "rewards/acc_reward/std": 0.31174740195274353,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 256.890625,
      "completions/mean_terminated_length": 252.84127807617188,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "entropy": 1.8046875,
      "epoch": 0.02071563088512241,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.1678308248519897,
      "learning_rate": 9.811676082862523e-07,
      "loss": 0.0051,
      "num_tokens": 1474925.0,
      "reward": 0.21130861341953278,
      "reward_std": 0.11159157752990723,
      "rewards/acc_reward/mean": 0.12367624044418335,
      "rewards/acc_reward/std": 0.28885576128959656,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 438.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 224.640625,
      "completions/mean_terminated_length": 224.640625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 1.59375,
      "epoch": 0.022598870056497175,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.6417983770370483,
      "learning_rate": 9.792843691148776e-07,
      "loss": -0.0083,
      "num_tokens": 1610966.0,
      "reward": 0.3159927725791931,
      "reward_std": 0.1229424774646759,
      "rewards/acc_reward/mean": 0.24172811210155487,
      "rewards/acc_reward/std": 0.31462714076042175,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 219.6875,
      "completions/mean_terminated_length": 219.6875,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "entropy": 1.4921875,
      "epoch": 0.02448210922787194,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 1.5047811269760132,
      "learning_rate": 9.774011299435027e-07,
      "loss": 0.0277,
      "num_tokens": 1755810.0,
      "reward": 0.2687425911426544,
      "reward_std": 0.14603173732757568,
      "rewards/acc_reward/mean": 0.1874917596578598,
      "rewards/acc_reward/std": 0.3036465644836426,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 277.0625,
      "completions/mean_terminated_length": 257.15252685546875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "entropy": 1.890625,
      "epoch": 0.026365348399246705,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 1.3763067722320557,
      "learning_rate": 9.75517890772128e-07,
      "loss": 0.0409,
      "num_tokens": 1901926.0,
      "reward": 0.3287838101387024,
      "reward_std": 0.17755521833896637,
      "rewards/acc_reward/mean": 0.2542042136192322,
      "rewards/acc_reward/std": 0.383696049451828,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 232.046875,
      "completions/mean_terminated_length": 227.60317993164062,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "entropy": 1.59375,
      "epoch": 0.02824858757062147,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.1948471069335938,
      "learning_rate": 9.736346516007531e-07,
      "loss": -0.0154,
      "num_tokens": 2028921.0,
      "reward": 0.31290093064308167,
      "reward_std": 0.08047251403331757,
      "rewards/acc_reward/mean": 0.23655660450458527,
      "rewards/acc_reward/std": 0.3165181875228882,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 212.40625,
      "completions/mean_terminated_length": 207.6508026123047,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 1.734375,
      "epoch": 0.030131826741996232,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.5261269807815552,
      "learning_rate": 9.717514124293785e-07,
      "loss": 0.09,
      "num_tokens": 2165179.0,
      "reward": 0.30339735746383667,
      "reward_std": 0.133845254778862,
      "rewards/acc_reward/mean": 0.22599703073501587,
      "rewards/acc_reward/std": 0.3543343245983124,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 201.40625,
      "completions/mean_terminated_length": 201.40625,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "entropy": 1.4375,
      "epoch": 0.032015065913371,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 1.5391112565994263,
      "learning_rate": 9.698681732580038e-07,
      "loss": 0.009,
      "num_tokens": 2296853.0,
      "reward": 0.1953125,
      "reward_std": 0.2162797451019287,
      "rewards/acc_reward/mean": 0.109375,
      "rewards/acc_reward/std": 0.3145764470100403,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 220.84375,
      "completions/mean_terminated_length": 216.2222442626953,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "entropy": 1.7421875,
      "epoch": 0.03389830508474576,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.1895616054534912,
      "learning_rate": 9.679849340866289e-07,
      "loss": 0.0424,
      "num_tokens": 2431011.0,
      "reward": 0.30048421025276184,
      "reward_std": 0.06477973610162735,
      "rewards/acc_reward/mean": 0.22276021540164948,
      "rewards/acc_reward/std": 0.31460005044937134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 428.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 223.953125,
      "completions/mean_terminated_length": 223.953125,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 1.9453125,
      "epoch": 0.035781544256120526,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.671203374862671,
      "learning_rate": 9.661016949152542e-07,
      "loss": 0.0416,
      "num_tokens": 2571808.0,
      "reward": 0.42682912945747375,
      "reward_std": 0.13467340171337128,
      "rewards/acc_reward/mean": 0.3631434738636017,
      "rewards/acc_reward/std": 0.37232744693756104,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 224.140625,
      "completions/mean_terminated_length": 214.85482788085938,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 1.625,
      "epoch": 0.03766478342749529,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.5360738039016724,
      "learning_rate": 9.642184557438793e-07,
      "loss": 0.0717,
      "num_tokens": 2711697.0,
      "reward": 0.3133315443992615,
      "reward_std": 0.22658446431159973,
      "rewards/acc_reward/mean": 0.23703500628471375,
      "rewards/acc_reward/std": 0.3616204857826233,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 222.515625,
      "completions/mean_terminated_length": 213.1774139404297,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "entropy": 1.7109375,
      "epoch": 0.03954802259887006,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8884894847869873,
      "learning_rate": 9.623352165725046e-07,
      "loss": 0.0024,
      "num_tokens": 2841242.0,
      "reward": 0.5347622036933899,
      "reward_std": 0.15656878054141998,
      "rewards/acc_reward/mean": 0.4830690622329712,
      "rewards/acc_reward/std": 0.31471821665763855,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 226.703125,
      "completions/mean_terminated_length": 222.1746063232422,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "entropy": 1.84375,
      "epoch": 0.04143126177024482,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8899277448654175,
      "learning_rate": 9.6045197740113e-07,
      "loss": 0.0705,
      "num_tokens": 2963015.0,
      "reward": 0.43775349855422974,
      "reward_std": 0.2172819972038269,
      "rewards/acc_reward/mean": 0.3752816617488861,
      "rewards/acc_reward/std": 0.3713799715042114,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 229.5,
      "completions/mean_terminated_length": 220.3870849609375,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "entropy": 1.7421875,
      "epoch": 0.04331450094161959,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.6153416633605957,
      "learning_rate": 9.58568738229755e-07,
      "loss": -0.0054,
      "num_tokens": 3109447.0,
      "reward": 0.32855772972106934,
      "reward_std": 0.27425360679626465,
      "rewards/acc_reward/mean": 0.2556891441345215,
      "rewards/acc_reward/std": 0.40467146039009094,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 510.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 212.984375,
      "completions/mean_terminated_length": 212.984375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 1.484375,
      "epoch": 0.04519774011299435,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8740508556365967,
      "learning_rate": 9.566854990583804e-07,
      "loss": 0.0035,
      "num_tokens": 3255782.0,
      "reward": 0.4697909355163574,
      "reward_std": 0.18794436752796173,
      "rewards/acc_reward/mean": 0.41261494159698486,
      "rewards/acc_reward/std": 0.39874467253685,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 237.375,
      "completions/mean_terminated_length": 237.375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 1.9375,
      "epoch": 0.047080979284369114,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.9496995210647583,
      "learning_rate": 9.548022598870055e-07,
      "loss": 0.1125,
      "num_tokens": 3398398.0,
      "reward": 0.43061739206314087,
      "reward_std": 0.25355520844459534,
      "rewards/acc_reward/mean": 0.36735260486602783,
      "rewards/acc_reward/std": 0.3482816219329834,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 236.890625,
      "completions/mean_terminated_length": 236.890625,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "entropy": 1.765625,
      "epoch": 0.04896421845574388,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.7656739950180054,
      "learning_rate": 9.529190207156308e-07,
      "loss": 0.0431,
      "num_tokens": 3534743.0,
      "reward": 0.49807009100914,
      "reward_std": 0.23242174088954926,
      "rewards/acc_reward/mean": 0.44230008125305176,
      "rewards/acc_reward/std": 0.38975948095321655,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 187.546875,
      "completions/mean_terminated_length": 187.546875,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "entropy": 1.78125,
      "epoch": 0.05084745762711865,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.076228618621826,
      "learning_rate": 9.510357815442561e-07,
      "loss": 0.0356,
      "num_tokens": 3659290.0,
      "reward": 0.44223499298095703,
      "reward_std": 0.21930678188800812,
      "rewards/acc_reward/mean": 0.383733332157135,
      "rewards/acc_reward/std": 0.3709230422973633,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 484.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 209.65625,
      "completions/mean_terminated_length": 209.65625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 1.875,
      "epoch": 0.05273069679849341,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8770859241485596,
      "learning_rate": 9.491525423728813e-07,
      "loss": 0.0769,
      "num_tokens": 3796772.0,
      "reward": 0.41265854239463806,
      "reward_std": 0.3168802857398987,
      "rewards/acc_reward/mean": 0.34913450479507446,
      "rewards/acc_reward/std": 0.42575517296791077,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 385.0,
      "completions/max_terminated_length": 385.0,
      "completions/mean_length": 192.75,
      "completions/mean_terminated_length": 192.75,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "entropy": 2.140625,
      "epoch": 0.054613935969868174,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.1020395755767822,
      "learning_rate": 9.472693032015065e-07,
      "loss": 0.0712,
      "num_tokens": 3925692.0,
      "reward": 0.41072791814804077,
      "reward_std": 0.3003063201904297,
      "rewards/acc_reward/mean": 0.34698933362960815,
      "rewards/acc_reward/std": 0.4590102732181549,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 479.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 189.5625,
      "completions/mean_terminated_length": 189.5625,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "entropy": 1.9609375,
      "epoch": 0.05649717514124294,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.122809410095215,
      "learning_rate": 9.453860640301318e-07,
      "loss": -0.0219,
      "num_tokens": 4063328.0,
      "reward": 0.4698576331138611,
      "reward_std": 0.30180490016937256,
      "rewards/acc_reward/mean": 0.41095292568206787,
      "rewards/acc_reward/std": 0.4547015130519867,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 185.6875,
      "completions/mean_terminated_length": 185.6875,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "entropy": 1.8359375,
      "epoch": 0.0583804143126177,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.101128578186035,
      "learning_rate": 9.43502824858757e-07,
      "loss": 0.0325,
      "num_tokens": 4191876.0,
      "reward": 0.4844944179058075,
      "reward_std": 0.30700868368148804,
      "rewards/acc_reward/mean": 0.42895209789276123,
      "rewards/acc_reward/std": 0.4517141282558441,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 435.0,
      "completions/mean_length": 179.5,
      "completions/mean_terminated_length": 174.22222900390625,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 1.859375,
      "epoch": 0.060263653483992465,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.2371370792388916,
      "learning_rate": 9.416195856873822e-07,
      "loss": -0.0023,
      "num_tokens": 4322148.0,
      "reward": 0.5825260281562805,
      "reward_std": 0.3706633746623993,
      "rewards/acc_reward/mean": 0.536139965057373,
      "rewards/acc_reward/std": 0.4555891752243042,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 188.6875,
      "completions/mean_terminated_length": 188.6875,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "entropy": 1.625,
      "epoch": 0.062146892655367235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.9997694492340088,
      "learning_rate": 9.397363465160075e-07,
      "loss": 0.0677,
      "num_tokens": 4460208.0,
      "reward": 0.6379782557487488,
      "reward_std": 0.284807950258255,
      "rewards/acc_reward/mean": 0.599489688873291,
      "rewards/acc_reward/std": 0.42208555340766907,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 479.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 206.375,
      "completions/mean_terminated_length": 206.375,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "entropy": 2.28125,
      "epoch": 0.064030131826742,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.249376058578491,
      "learning_rate": 9.378531073446327e-07,
      "loss": 0.0497,
      "num_tokens": 4598760.0,
      "reward": 0.6912428140640259,
      "reward_std": 0.350800096988678,
      "rewards/acc_reward/mean": 0.658672571182251,
      "rewards/acc_reward/std": 0.45664042234420776,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 171.703125,
      "completions/mean_terminated_length": 171.703125,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "entropy": 1.9609375,
      "epoch": 0.06591337099811675,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.9761137962341309,
      "learning_rate": 9.359698681732579e-07,
      "loss": 0.0538,
      "num_tokens": 4729973.0,
      "reward": 0.7882718443870544,
      "reward_std": 0.24334368109703064,
      "rewards/acc_reward/mean": 0.7664825916290283,
      "rewards/acc_reward/std": 0.34395766258239746,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 165.21875,
      "completions/mean_terminated_length": 165.21875,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "entropy": 1.9375,
      "epoch": 0.06779661016949153,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.3349790573120117,
      "learning_rate": 9.340866290018831e-07,
      "loss": 0.0651,
      "num_tokens": 4870371.0,
      "reward": 0.7641240358352661,
      "reward_std": 0.3322174549102783,
      "rewards/acc_reward/mean": 0.7396517395973206,
      "rewards/acc_reward/std": 0.409751832485199,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 164.234375,
      "completions/mean_terminated_length": 158.71429443359375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 1.9765625,
      "epoch": 0.0696798493408663,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.1828882694244385,
      "learning_rate": 9.322033898305083e-07,
      "loss": 0.0297,
      "num_tokens": 5002066.0,
      "reward": 0.7222064733505249,
      "reward_std": 0.2505956292152405,
      "rewards/acc_reward/mean": 0.6913405656814575,
      "rewards/acc_reward/std": 0.3850165903568268,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 189.25,
      "completions/mean_terminated_length": 184.1269989013672,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 2.203125,
      "epoch": 0.07156308851224105,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.241468906402588,
      "learning_rate": 9.303201506591337e-07,
      "loss": 0.0657,
      "num_tokens": 5135842.0,
      "reward": 0.767461359500885,
      "reward_std": 0.23975765705108643,
      "rewards/acc_reward/mean": 0.74509596824646,
      "rewards/acc_reward/std": 0.3569471538066864,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 167.296875,
      "completions/mean_terminated_length": 161.82540893554688,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.21875,
      "epoch": 0.07344632768361582,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.625629186630249,
      "learning_rate": 9.28436911487759e-07,
      "loss": 0.0755,
      "num_tokens": 5268213.0,
      "reward": 0.7769017219543457,
      "reward_std": 0.301779568195343,
      "rewards/acc_reward/mean": 0.7555853128433228,
      "rewards/acc_reward/std": 0.3923026919364929,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 182.0,
      "completions/mean_terminated_length": 165.77047729492188,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "entropy": 2.34375,
      "epoch": 0.07532956685499058,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 1.8064289093017578,
      "learning_rate": 9.265536723163842e-07,
      "loss": -0.0294,
      "num_tokens": 5400085.0,
      "reward": 0.8888776302337646,
      "reward_std": 0.2301492542028427,
      "rewards/acc_reward/mean": 0.8782668113708496,
      "rewards/acc_reward/std": 0.31358516216278076,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 377.0,
      "completions/mean_length": 170.578125,
      "completions/mean_terminated_length": 165.15872192382812,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 2.375,
      "epoch": 0.07721280602636535,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 1.6493501663208008,
      "learning_rate": 9.246704331450094e-07,
      "loss": -0.034,
      "num_tokens": 5529746.0,
      "reward": 0.8990048170089722,
      "reward_std": 0.1494266837835312,
      "rewards/acc_reward/mean": 0.889519214630127,
      "rewards/acc_reward/std": 0.26512882113456726,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 393.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 158.859375,
      "completions/mean_terminated_length": 158.859375,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "entropy": 2.4375,
      "epoch": 0.07909604519774012,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.683705449104309,
      "learning_rate": 9.227871939736346e-07,
      "loss": 0.0056,
      "num_tokens": 5668777.0,
      "reward": 0.9233179092407227,
      "reward_std": 0.14834892749786377,
      "rewards/acc_reward/mean": 0.9165338277816772,
      "rewards/acc_reward/std": 0.2692948877811432,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 460.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 187.859375,
      "completions/mean_terminated_length": 187.859375,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "entropy": 2.46875,
      "epoch": 0.08097928436911488,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.3769292831420898,
      "learning_rate": 9.209039548022598e-07,
      "loss": 0.0487,
      "num_tokens": 5800544.0,
      "reward": 0.9674270153045654,
      "reward_std": 0.048613592982292175,
      "rewards/acc_reward/mean": 0.9672800302505493,
      "rewards/acc_reward/std": 0.12451620399951935,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 164.546875,
      "completions/mean_terminated_length": 153.33871459960938,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 2.671875,
      "epoch": 0.08286252354048965,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.0136916637420654,
      "learning_rate": 9.190207156308852e-07,
      "loss": -0.0441,
      "num_tokens": 5929459.0,
      "reward": 0.9583332538604736,
      "reward_std": 0.016743799671530724,
      "rewards/acc_reward/mean": 0.9571758508682251,
      "rewards/acc_reward/std": 0.0556831993162632,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 166.890625,
      "completions/mean_terminated_length": 166.890625,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 2.25,
      "epoch": 0.0847457627118644,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.0560693740844727,
      "learning_rate": 9.171374764595104e-07,
      "loss": -0.041,
      "num_tokens": 6061324.0,
      "reward": 0.9419167041778564,
      "reward_std": 0.08913865685462952,
      "rewards/acc_reward/mean": 0.9458796381950378,
      "rewards/acc_reward/std": 0.15155728161334991,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 462.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 184.484375,
      "completions/mean_terminated_length": 184.484375,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "entropy": 2.453125,
      "epoch": 0.08662900188323917,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.4456909894943237,
      "learning_rate": 9.152542372881356e-07,
      "loss": 0.0865,
      "num_tokens": 6199275.0,
      "reward": 0.956250011920929,
      "reward_std": 0.1237436980009079,
      "rewards/acc_reward/mean": 0.953125,
      "rewards/acc_reward/std": 0.21304203569889069,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 395.0,
      "completions/mean_length": 182.65625,
      "completions/mean_terminated_length": 172.03225708007812,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "entropy": 2.734375,
      "epoch": 0.08851224105461393,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9000487923622131,
      "learning_rate": 9.133709981167608e-07,
      "loss": 0.0061,
      "num_tokens": 6339829.0,
      "reward": 0.9662767648696899,
      "reward_std": 0.039774756878614426,
      "rewards/acc_reward/mean": 0.9625297784805298,
      "rewards/acc_reward/std": 0.12811994552612305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 174.3125,
      "completions/mean_terminated_length": 168.952392578125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 2.546875,
      "epoch": 0.0903954802259887,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.0134769678115845,
      "learning_rate": 9.11487758945386e-07,
      "loss": -0.0181,
      "num_tokens": 6473609.0,
      "reward": 0.9744918346405029,
      "reward_std": 0.008838837035000324,
      "rewards/acc_reward/mean": 0.9751298427581787,
      "rewards/acc_reward/std": 0.04131903871893883,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 454.0,
      "completions/mean_length": 188.421875,
      "completions/mean_terminated_length": 177.98385620117188,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "entropy": 2.625,
      "epoch": 0.09227871939736347,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.7283865213394165,
      "learning_rate": 9.096045197740112e-07,
      "loss": 0.0749,
      "num_tokens": 6607932.0,
      "reward": 0.9752188920974731,
      "reward_std": 0.015992172062397003,
      "rewards/acc_reward/mean": 0.981145977973938,
      "rewards/acc_reward/std": 0.025175929069519043,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 509.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 194.421875,
      "completions/mean_terminated_length": 194.421875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 2.53125,
      "epoch": 0.09416195856873823,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.1576530933380127,
      "learning_rate": 9.077212806026365e-07,
      "loss": 0.0122,
      "num_tokens": 6752119.0,
      "reward": 0.9867604374885559,
      "reward_std": 0.007432654500007629,
      "rewards/acc_reward/mean": 0.9870254993438721,
      "rewards/acc_reward/std": 0.017179692164063454,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 509.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 186.71875,
      "completions/mean_terminated_length": 186.71875,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 2.78125,
      "epoch": 0.096045197740113,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.141554355621338,
      "learning_rate": 9.058380414312617e-07,
      "loss": -0.0311,
      "num_tokens": 6890533.0,
      "reward": 0.9580291509628296,
      "reward_std": 0.0707060694694519,
      "rewards/acc_reward/mean": 0.9637823700904846,
      "rewards/acc_reward/std": 0.13111592829227448,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 169.390625,
      "completions/mean_terminated_length": 163.952392578125,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 2.421875,
      "epoch": 0.09792843691148775,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.8654880523681641,
      "learning_rate": 9.03954802259887e-07,
      "loss": 0.0053,
      "num_tokens": 7027358.0,
      "reward": 0.9961555004119873,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9974644780158997,
      "rewards/acc_reward/std": 0.006761432159692049,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 176.078125,
      "completions/mean_terminated_length": 170.74603271484375,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 2.609375,
      "epoch": 0.09981167608286252,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.4219504594802856,
      "learning_rate": 9.020715630885122e-07,
      "loss": -0.0008,
      "num_tokens": 7162123.0,
      "reward": 0.9900810718536377,
      "reward_std": 0.021838055923581123,
      "rewards/acc_reward/mean": 0.9924511909484863,
      "rewards/acc_reward/std": 0.04345937818288803,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 171.125,
      "completions/mean_terminated_length": 165.71429443359375,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "entropy": 2.453125,
      "epoch": 0.1016949152542373,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9306376576423645,
      "learning_rate": 9.001883239171374e-07,
      "loss": 0.0039,
      "num_tokens": 7294099.0,
      "reward": 0.9847477674484253,
      "reward_std": 0.005786377005279064,
      "rewards/acc_reward/mean": 0.986525297164917,
      "rewards/acc_reward/std": 0.02815171889960766,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 185.28125,
      "completions/mean_terminated_length": 185.28125,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 2.59375,
      "epoch": 0.10357815442561205,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.0906463861465454,
      "learning_rate": 8.983050847457627e-07,
      "loss": -0.0101,
      "num_tokens": 7421381.0,
      "reward": 0.9820950031280518,
      "reward_std": 0.013950306922197342,
      "rewards/acc_reward/mean": 0.9818416833877563,
      "rewards/acc_reward/std": 0.03777196630835533,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 189.296875,
      "completions/mean_terminated_length": 189.296875,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "entropy": 2.71875,
      "epoch": 0.10546139359698682,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0221842527389526,
      "learning_rate": 8.964218455743879e-07,
      "loss": -0.0019,
      "num_tokens": 7550840.0,
      "reward": 0.9848359823226929,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9848872423171997,
      "rewards/acc_reward/std": 0.026450620964169502,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 412.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 204.6875,
      "completions/mean_terminated_length": 204.6875,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "entropy": 2.609375,
      "epoch": 0.10734463276836158,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.1911293268203735,
      "learning_rate": 8.945386064030131e-07,
      "loss": 0.0145,
      "num_tokens": 7693764.0,
      "reward": 0.9755189418792725,
      "reward_std": 0.04419417679309845,
      "rewards/acc_reward/mean": 0.9745348691940308,
      "rewards/acc_reward/std": 0.1250728815793991,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 210.546875,
      "completions/mean_terminated_length": 210.546875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "entropy": 2.65625,
      "epoch": 0.10922787193973635,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.7357279062271118,
      "learning_rate": 8.926553672316383e-07,
      "loss": 0.0164,
      "num_tokens": 7828263.0,
      "reward": 0.9803258180618286,
      "reward_std": 0.007434290833771229,
      "rewards/acc_reward/mean": 0.9781398177146912,
      "rewards/acc_reward/std": 0.03639683872461319,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 154.046875,
      "completions/mean_terminated_length": 154.046875,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "entropy": 2.5625,
      "epoch": 0.1111111111111111,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.907721280602636e-07,
      "loss": 0.0,
      "num_tokens": 7968106.0,
      "reward": 0.9852668046951294,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9836297631263733,
      "rewards/acc_reward/std": 0.024946285411715508,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 177.296875,
      "completions/mean_terminated_length": 166.5,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 2.546875,
      "epoch": 0.11299435028248588,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.5839695930480957,
      "learning_rate": 8.888888888888888e-07,
      "loss": 0.0779,
      "num_tokens": 8098157.0,
      "reward": 0.9858125448226929,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9859722852706909,
      "rewards/acc_reward/std": 0.019654173403978348,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 385.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 161.71429443359375,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.765625,
      "epoch": 0.11487758945386065,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.4451484680175781,
      "learning_rate": 8.870056497175141e-07,
      "loss": -0.0,
      "num_tokens": 8227641.0,
      "reward": 0.9645360708236694,
      "reward_std": 0.008838837035000324,
      "rewards/acc_reward/mean": 0.964067816734314,
      "rewards/acc_reward/std": 0.040923893451690674,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 459.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 181.734375,
      "completions/mean_terminated_length": 181.734375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 2.625,
      "epoch": 0.1167608286252354,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.3860294222831726,
      "learning_rate": 8.851224105461393e-07,
      "loss": -0.0431,
      "num_tokens": 8359496.0,
      "reward": 0.991644024848938,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9924516677856445,
      "rewards/acc_reward/std": 0.020128827542066574,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 207.34375,
      "completions/mean_terminated_length": 202.50794982910156,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "entropy": 2.75,
      "epoch": 0.11864406779661017,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6225361227989197,
      "learning_rate": 8.832391713747645e-07,
      "loss": -0.0011,
      "num_tokens": 8500670.0,
      "reward": 0.9970052242279053,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9984085559844971,
      "rewards/acc_reward/std": 0.004243890754878521,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 369.0,
      "completions/mean_length": 169.6875,
      "completions/mean_terminated_length": 164.25396728515625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "entropy": 2.3125,
      "epoch": 0.12052730696798493,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.537191390991211,
      "learning_rate": 8.813559322033897e-07,
      "loss": 0.0021,
      "num_tokens": 8631754.0,
      "reward": 0.9799120426177979,
      "reward_std": 0.013258256018161774,
      "rewards/acc_reward/mean": 0.982888400554657,
      "rewards/acc_reward/std": 0.022922541946172714,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 500.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 172.59375,
      "completions/mean_terminated_length": 172.59375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 2.609375,
      "epoch": 0.1224105461393597,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.79472693032015e-07,
      "loss": 0.0,
      "num_tokens": 8762784.0,
      "reward": 0.9871211051940918,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.985690176486969,
      "rewards/acc_reward/std": 0.017727959901094437,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 405.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 181.234375,
      "completions/mean_terminated_length": 181.234375,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.453125,
      "epoch": 0.12429378531073447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.775894538606404e-07,
      "loss": 0.0,
      "num_tokens": 8887127.0,
      "reward": 0.9676999449729919,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9641109704971313,
      "rewards/acc_reward/std": 0.03724094480276108,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 163.078125,
      "completions/mean_terminated_length": 157.53968811035156,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.296875,
      "epoch": 0.12617702448210924,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6480050086975098,
      "learning_rate": 8.757062146892656e-07,
      "loss": -0.0023,
      "num_tokens": 9024028.0,
      "reward": 0.9807539582252502,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9803516268730164,
      "rewards/acc_reward/std": 0.02998371422290802,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 382.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 150.859375,
      "completions/mean_terminated_length": 150.859375,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "entropy": 2.375,
      "epoch": 0.128060263653484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.738229755178908e-07,
      "loss": 0.0,
      "num_tokens": 9149267.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 1.0,
      "rewards/acc_reward/std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 347.0,
      "completions/max_terminated_length": 347.0,
      "completions/mean_length": 156.34375,
      "completions/mean_terminated_length": 156.34375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 2.359375,
      "epoch": 0.12994350282485875,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.1719481945037842,
      "learning_rate": 8.71939736346516e-07,
      "loss": -0.0147,
      "num_tokens": 9283337.0,
      "reward": 0.9746325016021729,
      "reward_std": 0.04419417679309845,
      "rewards/acc_reward/mean": 0.9735499620437622,
      "rewards/acc_reward/std": 0.1257747858762741,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 427.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 153.515625,
      "completions/mean_terminated_length": 153.515625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 2.53125,
      "epoch": 0.1318267419962335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.700564971751412e-07,
      "loss": 0.0,
      "num_tokens": 9422506.0,
      "reward": 0.9936791658401489,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9929767847061157,
      "rewards/acc_reward/std": 0.018728474155068398,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 159.90625,
      "completions/mean_terminated_length": 159.90625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.234375,
      "epoch": 0.1337099811676083,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.681732580037664e-07,
      "loss": 0.0,
      "num_tokens": 9558724.0,
      "reward": 0.9724128246307373,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9693475365638733,
      "rewards/acc_reward/std": 0.049584269523620605,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 393.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 171.421875,
      "completions/mean_terminated_length": 171.421875,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 2.65625,
      "epoch": 0.13559322033898305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.662900188323917e-07,
      "loss": 0.0,
      "num_tokens": 9696479.0,
      "reward": 0.9782567620277405,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9758408665657043,
      "rewards/acc_reward/std": 0.035969410091638565,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 146.640625,
      "completions/mean_terminated_length": 146.640625,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "entropy": 2.1875,
      "epoch": 0.1374764595103578,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.64406779661017e-07,
      "loss": 0.0,
      "num_tokens": 9833768.0,
      "reward": 0.9815881252288818,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9795423150062561,
      "rewards/acc_reward/std": 0.03601390868425369,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 159.484375,
      "completions/mean_terminated_length": 153.88888549804688,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.40625,
      "epoch": 0.1393596986817326,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.625235404896422e-07,
      "loss": 0.0,
      "num_tokens": 9965639.0,
      "reward": 0.9888086318969727,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9875651597976685,
      "rewards/acc_reward/std": 0.02026844024658203,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 159.8125,
      "completions/mean_terminated_length": 159.8125,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 2.28125,
      "epoch": 0.14124293785310735,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.606403013182674e-07,
      "loss": 0.0,
      "num_tokens": 10102131.0,
      "reward": 0.9853819608688354,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.983757734298706,
      "rewards/acc_reward/std": 0.023104524239897728,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 184.40625,
      "completions/mean_terminated_length": 173.8386993408203,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "entropy": 2.390625,
      "epoch": 0.1431261770244821,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.7094928026199341,
      "learning_rate": 8.587570621468926e-07,
      "loss": -0.0279,
      "num_tokens": 10234637.0,
      "reward": 0.9758157730102539,
      "reward_std": 0.009495548903942108,
      "rewards/acc_reward/mean": 0.973128616809845,
      "rewards/acc_reward/std": 0.04172620549798012,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 493.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 187.34375,
      "completions/mean_terminated_length": 187.34375,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.484375,
      "epoch": 0.14500941619585686,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6446596384048462,
      "learning_rate": 8.568738229755178e-07,
      "loss": -0.0238,
      "num_tokens": 10380091.0,
      "reward": 0.9779398441314697,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9772248268127441,
      "rewards/acc_reward/std": 0.026913031935691833,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 433.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 161.046875,
      "completions/mean_terminated_length": 161.046875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "entropy": 2.359375,
      "epoch": 0.14689265536723164,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9724075198173523,
      "learning_rate": 8.549905838041431e-07,
      "loss": 0.0269,
      "num_tokens": 10505822.0,
      "reward": 0.988434910774231,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9888859987258911,
      "rewards/acc_reward/std": 0.024498289451003075,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 446.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 159.9375,
      "completions/mean_terminated_length": 159.9375,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "entropy": 2.375,
      "epoch": 0.1487758945386064,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.531073446327683e-07,
      "loss": 0.0,
      "num_tokens": 10638682.0,
      "reward": 0.990105152130127,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9890057444572449,
      "rewards/acc_reward/std": 0.019625553861260414,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 165.125,
      "completions/mean_terminated_length": 159.6190643310547,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "entropy": 2.734375,
      "epoch": 0.15065913370998116,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6837416291236877,
      "learning_rate": 8.512241054613935e-07,
      "loss": -0.0191,
      "num_tokens": 10769954.0,
      "reward": 0.98872971534729,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9892135858535767,
      "rewards/acc_reward/std": 0.02876383066177368,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 184.4375,
      "completions/mean_terminated_length": 179.23809814453125,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "entropy": 2.625,
      "epoch": 0.15254237288135594,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.493408662900188e-07,
      "loss": 0.0,
      "num_tokens": 10908782.0,
      "reward": 0.9871925115585327,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.985769510269165,
      "rewards/acc_reward/std": 0.02491430565714836,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 155.265625,
      "completions/mean_terminated_length": 155.265625,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "entropy": 2.4375,
      "epoch": 0.1544256120527307,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.47457627118644e-07,
      "loss": 0.0,
      "num_tokens": 11042303.0,
      "reward": 0.9902667999267578,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9891853332519531,
      "rewards/acc_reward/std": 0.015684949234128,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 369.0,
      "completions/max_terminated_length": 369.0,
      "completions/mean_length": 153.015625,
      "completions/mean_terminated_length": 153.015625,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "entropy": 2.5,
      "epoch": 0.15630885122410546,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6075497269630432,
      "learning_rate": 8.455743879472693e-07,
      "loss": -0.027,
      "num_tokens": 11173760.0,
      "reward": 0.9887193441390991,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.989202082157135,
      "rewards/acc_reward/std": 0.012971931137144566,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 142.296875,
      "completions/mean_terminated_length": 142.296875,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 2.265625,
      "epoch": 0.15819209039548024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.436911487758945e-07,
      "loss": 0.0,
      "num_tokens": 11310011.0,
      "reward": 0.9917968511581421,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9908854365348816,
      "rewards/acc_reward/std": 0.01598946936428547,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 138.125,
      "completions/mean_terminated_length": 138.125,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.265625,
      "epoch": 0.160075329566855,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6712620854377747,
      "learning_rate": 8.418079096045197e-07,
      "loss": -0.0066,
      "num_tokens": 11440675.0,
      "reward": 0.9767186641693115,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9758679866790771,
      "rewards/acc_reward/std": 0.04487896338105202,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 451.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 172.578125,
      "completions/mean_terminated_length": 172.578125,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "entropy": 2.640625,
      "epoch": 0.16195856873822975,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.770696222782135,
      "learning_rate": 8.399246704331449e-07,
      "loss": -0.0185,
      "num_tokens": 11569064.0,
      "reward": 0.9869691133499146,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9872573614120483,
      "rewards/acc_reward/std": 0.022249845787882805,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 172.6875,
      "completions/mean_terminated_length": 172.6875,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "entropy": 2.3125,
      "epoch": 0.1638418079096045,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.380414312617701e-07,
      "loss": 0.0,
      "num_tokens": 11708700.0,
      "reward": 0.9803584814071655,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9781760573387146,
      "rewards/acc_reward/std": 0.02318427711725235,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 155.0625,
      "completions/mean_terminated_length": 143.5483856201172,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "entropy": 2.40625,
      "epoch": 0.1657250470809793,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9133210778236389,
      "learning_rate": 8.361581920903954e-07,
      "loss": 0.0155,
      "num_tokens": 11832608.0,
      "reward": 0.9537662267684937,
      "reward_std": 0.006725744344294071,
      "rewards/acc_reward/mean": 0.9486291408538818,
      "rewards/acc_reward/std": 0.06949032843112946,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 420.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 170.796875,
      "completions/mean_terminated_length": 170.796875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 2.359375,
      "epoch": 0.16760828625235405,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.9491801857948303,
      "learning_rate": 8.342749529190208e-07,
      "loss": -0.0359,
      "num_tokens": 11968843.0,
      "reward": 0.9640452265739441,
      "reward_std": 0.010205795988440514,
      "rewards/acc_reward/mean": 0.9652585983276367,
      "rewards/acc_reward/std": 0.046331144869327545,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 151.96875,
      "completions/mean_terminated_length": 151.96875,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "entropy": 2.3125,
      "epoch": 0.1694915254237288,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.32391713747646e-07,
      "loss": 0.0,
      "num_tokens": 12106473.0,
      "reward": 0.9978047609329224,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9975607991218567,
      "rewards/acc_reward/std": 0.006504515651613474,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 505.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 170.75,
      "completions/mean_terminated_length": 170.75,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "entropy": 2.484375,
      "epoch": 0.1713747645951036,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.305084745762712e-07,
      "loss": 0.0,
      "num_tokens": 12241945.0,
      "reward": 0.9895379543304443,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9883755445480347,
      "rewards/acc_reward/std": 0.018741585314273834,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 324.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 165.765625,
      "completions/mean_terminated_length": 165.765625,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "entropy": 2.34375,
      "epoch": 0.17325800376647835,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.286252354048964e-07,
      "loss": 0.0,
      "num_tokens": 12375658.0,
      "reward": 0.9758948087692261,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9732165336608887,
      "rewards/acc_reward/std": 0.04799450561404228,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 195.265625,
      "completions/mean_terminated_length": 190.23809814453125,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "entropy": 2.46875,
      "epoch": 0.1751412429378531,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.267419962335216e-07,
      "loss": 0.0,
      "num_tokens": 12513219.0,
      "reward": 0.9753564596176147,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9726182222366333,
      "rewards/acc_reward/std": 0.045861802995204926,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 155.5,
      "completions/mean_terminated_length": 155.5,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "entropy": 2.1875,
      "epoch": 0.17702448210922786,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.248587570621468e-07,
      "loss": 0.0,
      "num_tokens": 12648475.0,
      "reward": 0.9844207763671875,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9826897382736206,
      "rewards/acc_reward/std": 0.02190142311155796,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 187.34375,
      "completions/mean_terminated_length": 182.19049072265625,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 2.5625,
      "epoch": 0.17890772128060264,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.2475651502609253,
      "learning_rate": 8.229755178907722e-07,
      "loss": -0.0212,
      "num_tokens": 12785969.0,
      "reward": 0.9898383617401123,
      "reward_std": 0.008838837035000324,
      "rewards/acc_reward/mean": 0.9921815395355225,
      "rewards/acc_reward/std": 0.015049039386212826,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 153.0,
      "completions/mean_terminated_length": 153.0,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "entropy": 2.34375,
      "epoch": 0.1807909604519774,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.210922787193974e-07,
      "loss": 0.0,
      "num_tokens": 12907345.0,
      "reward": 0.9843592047691345,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9826213121414185,
      "rewards/acc_reward/std": 0.03606174886226654,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 474.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 168.28125,
      "completions/mean_terminated_length": 168.28125,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "entropy": 2.171875,
      "epoch": 0.18267419962335216,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.192090395480226e-07,
      "loss": 0.0,
      "num_tokens": 13033539.0,
      "reward": 0.9740588665008545,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9711765050888062,
      "rewards/acc_reward/std": 0.033579710870981216,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 164.03125,
      "completions/mean_terminated_length": 158.5079345703125,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "entropy": 2.546875,
      "epoch": 0.18455743879472694,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.173258003766478e-07,
      "loss": 0.0,
      "num_tokens": 13159461.0,
      "reward": 0.983502984046936,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9816700220108032,
      "rewards/acc_reward/std": 0.024904713034629822,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 174.546875,
      "completions/mean_terminated_length": 169.19049072265625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "entropy": 2.140625,
      "epoch": 0.1864406779661017,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.15442561205273e-07,
      "loss": 0.0,
      "num_tokens": 13287976.0,
      "reward": 0.9533977508544922,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9482197761535645,
      "rewards/acc_reward/std": 0.043142788112163544,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 161.15625,
      "completions/mean_terminated_length": 155.58731079101562,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "entropy": 2.25,
      "epoch": 0.18832391713747645,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.5508601665496826,
      "learning_rate": 8.135593220338983e-07,
      "loss": -0.0178,
      "num_tokens": 13424754.0,
      "reward": 0.9814343452453613,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9811076521873474,
      "rewards/acc_reward/std": 0.01872284896671772,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 152.96875,
      "completions/mean_terminated_length": 152.96875,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.40625,
      "epoch": 0.1902071563088512,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.733428418636322,
      "learning_rate": 8.116760828625235e-07,
      "loss": -0.0236,
      "num_tokens": 13554768.0,
      "reward": 0.9719411730766296,
      "reward_std": 0.039774756878614426,
      "rewards/acc_reward/mean": 0.9688235521316528,
      "rewards/acc_reward/std": 0.1259699910879135,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 331.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 158.1587371826172,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 2.28125,
      "epoch": 0.192090395480226,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.097928436911488e-07,
      "loss": 0.0,
      "num_tokens": 13684028.0,
      "reward": 0.9804370403289795,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9782633781433105,
      "rewards/acc_reward/std": 0.03396952524781227,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 442.0,
      "completions/mean_length": 189.28125,
      "completions/mean_terminated_length": 178.8709716796875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "entropy": 2.5625,
      "epoch": 0.19397363465160075,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.07909604519774e-07,
      "loss": 0.0,
      "num_tokens": 13813006.0,
      "reward": 0.9762270450592041,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9735856056213379,
      "rewards/acc_reward/std": 0.05650464445352554,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 495.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 176.3125,
      "completions/mean_terminated_length": 176.3125,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.515625,
      "epoch": 0.1958568738229755,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.060263653483992e-07,
      "loss": 0.0,
      "num_tokens": 13945954.0,
      "reward": 0.9901642799377441,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9890714883804321,
      "rewards/acc_reward/std": 0.014226200059056282,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 385.0,
      "completions/max_terminated_length": 385.0,
      "completions/mean_length": 175.34375,
      "completions/mean_terminated_length": 175.34375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "entropy": 2.515625,
      "epoch": 0.1977401129943503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.041431261770244e-07,
      "loss": 0.0,
      "num_tokens": 14080760.0,
      "reward": 0.9843112826347351,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.982568085193634,
      "rewards/acc_reward/std": 0.029767252504825592,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 427.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 162.34375,
      "completions/mean_terminated_length": 162.34375,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 2.25,
      "epoch": 0.19962335216572505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.022598870056497e-07,
      "loss": 0.0,
      "num_tokens": 14202574.0,
      "reward": 0.9882901310920715,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9869890213012695,
      "rewards/acc_reward/std": 0.017871392890810966,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 393.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 177.140625,
      "completions/mean_terminated_length": 177.140625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 2.515625,
      "epoch": 0.2015065913370998,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.003766478342749e-07,
      "loss": 0.0,
      "num_tokens": 14339895.0,
      "reward": 0.9751439094543457,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.972382128238678,
      "rewards/acc_reward/std": 0.04191429913043976,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 457.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 182.734375,
      "completions/mean_terminated_length": 182.734375,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "entropy": 2.46875,
      "epoch": 0.2033898305084746,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.984934086629001e-07,
      "loss": 0.0,
      "num_tokens": 14480414.0,
      "reward": 0.9581470489501953,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9534966945648193,
      "rewards/acc_reward/std": 0.06734198331832886,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 315.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 150.65625,
      "completions/mean_terminated_length": 150.65625,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "entropy": 2.203125,
      "epoch": 0.20527306967984935,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.966101694915253e-07,
      "loss": 0.0,
      "num_tokens": 14604040.0,
      "reward": 0.9401878118515015,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9335420727729797,
      "rewards/acc_reward/std": 0.09204845130443573,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 350.0,
      "completions/max_terminated_length": 350.0,
      "completions/mean_length": 153.203125,
      "completions/mean_terminated_length": 153.203125,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.28125,
      "epoch": 0.2071563088512241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.947269303201506e-07,
      "loss": 0.0,
      "num_tokens": 14742229.0,
      "reward": 0.9852752685546875,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9836391806602478,
      "rewards/acc_reward/std": 0.04362887144088745,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 170.71875,
      "completions/mean_terminated_length": 170.71875,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "entropy": 2.375,
      "epoch": 0.20903954802259886,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.928436911487758e-07,
      "loss": 0.0,
      "num_tokens": 14873859.0,
      "reward": 0.9933172464370728,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9925747513771057,
      "rewards/acc_reward/std": 0.010412875562906265,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 168.53125,
      "completions/mean_terminated_length": 168.53125,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "entropy": 2.5,
      "epoch": 0.21092278719397364,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.909604519774011e-07,
      "loss": 0.0,
      "num_tokens": 15006189.0,
      "reward": 0.9692245125770569,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9658049941062927,
      "rewards/acc_reward/std": 0.0648469477891922,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 169.125,
      "completions/mean_terminated_length": 169.125,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 2.3125,
      "epoch": 0.2128060263653484,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.890772128060263e-07,
      "loss": 0.0,
      "num_tokens": 15144437.0,
      "reward": 0.9728338718414307,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9698153734207153,
      "rewards/acc_reward/std": 0.040937572717666626,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 158.234375,
      "completions/mean_terminated_length": 158.234375,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "entropy": 2.421875,
      "epoch": 0.21468926553672316,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.871939736346515e-07,
      "loss": 0.0,
      "num_tokens": 15266348.0,
      "reward": 0.9900326728820801,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9889251589775085,
      "rewards/acc_reward/std": 0.011797359213232994,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 421.0,
      "completions/mean_length": 184.890625,
      "completions/mean_terminated_length": 179.69842529296875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 2.609375,
      "epoch": 0.21657250470809794,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.853107344632767e-07,
      "loss": 0.0,
      "num_tokens": 15406085.0,
      "reward": 0.9864563941955566,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9849514961242676,
      "rewards/acc_reward/std": 0.023492755368351936,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 179.328125,
      "completions/mean_terminated_length": 174.04762268066406,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.34375,
      "epoch": 0.2184557438794727,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.8667036294937134,
      "learning_rate": 7.83427495291902e-07,
      "loss": -0.0178,
      "num_tokens": 15537066.0,
      "reward": 0.9839749336242676,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9839304685592651,
      "rewards/acc_reward/std": 0.025710513815283775,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 151.453125,
      "completions/mean_terminated_length": 151.453125,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 2.34375,
      "epoch": 0.22033898305084745,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.815442561205274e-07,
      "loss": 0.0,
      "num_tokens": 15661063.0,
      "reward": 0.9836729168891907,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9818587899208069,
      "rewards/acc_reward/std": 0.03338773176074028,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 448.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 173.40625,
      "completions/mean_terminated_length": 173.40625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "entropy": 2.53125,
      "epoch": 0.2222222222222222,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.796610169491526e-07,
      "loss": 0.0,
      "num_tokens": 15794785.0,
      "reward": 0.970079779624939,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9667552709579468,
      "rewards/acc_reward/std": 0.03764721751213074,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 353.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 156.765625,
      "completions/mean_terminated_length": 156.765625,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "entropy": 2.171875,
      "epoch": 0.224105461393597,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.2777234315872192,
      "learning_rate": 7.777777777777778e-07,
      "loss": 0.0405,
      "num_tokens": 15925402.0,
      "reward": 0.9777387380599976,
      "reward_std": 0.013079374097287655,
      "rewards/acc_reward/mean": 0.9752652645111084,
      "rewards/acc_reward/std": 0.04798175394535065,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 203.3125,
      "completions/mean_terminated_length": 198.4127197265625,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.65625,
      "epoch": 0.22598870056497175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.75894538606403e-07,
      "loss": 0.0,
      "num_tokens": 16073038.0,
      "reward": 0.9984081983566284,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9982313513755798,
      "rewards/acc_reward/std": 0.0047163767740130424,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 204.984375,
      "completions/mean_terminated_length": 195.08062744140625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 2.59375,
      "epoch": 0.2278719397363465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.740112994350282e-07,
      "loss": 0.0,
      "num_tokens": 16218901.0,
      "reward": 0.9989771842956543,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9988635778427124,
      "rewards/acc_reward/std": 0.0020075358916074038,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 404.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 174.4375,
      "completions/mean_terminated_length": 174.4375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "entropy": 2.328125,
      "epoch": 0.2297551789077213,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.721280602636534e-07,
      "loss": 0.0,
      "num_tokens": 16352369.0,
      "reward": 0.988227367401123,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9869192838668823,
      "rewards/acc_reward/std": 0.026611221954226494,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 167.890625,
      "completions/mean_terminated_length": 167.890625,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "entropy": 2.390625,
      "epoch": 0.23163841807909605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.702448210922787e-07,
      "loss": 0.0,
      "num_tokens": 16482378.0,
      "reward": 0.9788169860839844,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9764633178710938,
      "rewards/acc_reward/std": 0.03820019215345383,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 178.171875,
      "completions/mean_terminated_length": 178.171875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "entropy": 2.125,
      "epoch": 0.2335216572504708,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.68361581920904e-07,
      "loss": 0.0,
      "num_tokens": 16619925.0,
      "reward": 0.9605213403701782,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9561347961425781,
      "rewards/acc_reward/std": 0.08166956156492233,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 173.6875,
      "completions/mean_terminated_length": 168.3174591064453,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "entropy": 2.390625,
      "epoch": 0.23540489642184556,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9748631119728088,
      "learning_rate": 7.664783427495292e-07,
      "loss": 0.0211,
      "num_tokens": 16749945.0,
      "reward": 0.9801042079925537,
      "reward_std": 0.039774756878614426,
      "rewards/acc_reward/mean": 0.9778935313224792,
      "rewards/acc_reward/std": 0.12537133693695068,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 406.0,
      "completions/mean_length": 178.25,
      "completions/mean_terminated_length": 167.48387145996094,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 2.5625,
      "epoch": 0.23728813559322035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.645951035781544e-07,
      "loss": 0.0,
      "num_tokens": 16879657.0,
      "reward": 0.9902485609054565,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9891650676727295,
      "rewards/acc_reward/std": 0.015085420571267605,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 162.15625,
      "completions/mean_terminated_length": 162.15625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 2.328125,
      "epoch": 0.2391713747645951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.627118644067796e-07,
      "loss": 0.0,
      "num_tokens": 17009299.0,
      "reward": 0.9965801239013672,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9962000846862793,
      "rewards/acc_reward/std": 0.010133087635040283,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 343.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 160.796875,
      "completions/mean_terminated_length": 160.796875,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "entropy": 2.28125,
      "epoch": 0.24105461393596986,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.608286252354048e-07,
      "loss": 0.0,
      "num_tokens": 17146406.0,
      "reward": 0.9833929538726807,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9815477728843689,
      "rewards/acc_reward/std": 0.024218887090682983,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 436.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 175.1875,
      "completions/mean_terminated_length": 175.1875,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "entropy": 2.484375,
      "epoch": 0.24293785310734464,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.589453860640301e-07,
      "loss": 0.0,
      "num_tokens": 17274242.0,
      "reward": 0.9791369438171387,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9768187999725342,
      "rewards/acc_reward/std": 0.03209559619426727,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 174.9375,
      "completions/mean_terminated_length": 169.58731079101562,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "entropy": 2.359375,
      "epoch": 0.2448210922787194,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.570621468926553e-07,
      "loss": 0.0,
      "num_tokens": 17409982.0,
      "reward": 0.9768315553665161,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9742573499679565,
      "rewards/acc_reward/std": 0.05567716062068939,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 186.6875,
      "completions/mean_terminated_length": 181.52381896972656,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "entropy": 2.421875,
      "epoch": 0.24670433145009416,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.551789077212806e-07,
      "loss": 0.0,
      "num_tokens": 17545354.0,
      "reward": 0.9810404777526855,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9789338707923889,
      "rewards/acc_reward/std": 0.03438215330243111,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 155.46875,
      "completions/mean_terminated_length": 155.46875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.234375,
      "epoch": 0.24858757062146894,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.42140984535217285,
      "learning_rate": 7.532956685499058e-07,
      "loss": 0.0881,
      "num_tokens": 17679648.0,
      "reward": 0.9783517122268677,
      "reward_std": 0.039774756878614426,
      "rewards/acc_reward/mean": 0.975946307182312,
      "rewards/acc_reward/std": 0.12442652136087418,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 180.71875,
      "completions/mean_terminated_length": 170.03225708007812,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "entropy": 2.5625,
      "epoch": 0.2504708097928437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.51412429378531e-07,
      "loss": 0.0,
      "num_tokens": 17811718.0,
      "reward": 0.9774539470672607,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9749488830566406,
      "rewards/acc_reward/std": 0.04454605653882027,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 177.40625,
      "completions/mean_terminated_length": 172.09524536132812,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 2.21875,
      "epoch": 0.2523540489642185,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.495291902071563e-07,
      "loss": 0.0,
      "num_tokens": 17944256.0,
      "reward": 0.9872071743011475,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.985785722732544,
      "rewards/acc_reward/std": 0.01633262448012829,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 388.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 167.875,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "entropy": 2.375,
      "epoch": 0.2542372881355932,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.476459510357815e-07,
      "loss": 0.0,
      "num_tokens": 18074344.0,
      "reward": 0.996717095375061,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9963523149490356,
      "rewards/acc_reward/std": 0.009727060794830322,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 168.921875,
      "completions/mean_terminated_length": 168.921875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "entropy": 2.3125,
      "epoch": 0.256120527306968,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.457627118644067e-07,
      "loss": 0.0,
      "num_tokens": 18202499.0,
      "reward": 0.9629114866256714,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9587906002998352,
      "rewards/acc_reward/std": 0.05813661590218544,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 152.171875,
      "completions/mean_terminated_length": 152.171875,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.234375,
      "epoch": 0.2580037664783427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.438794726930319e-07,
      "loss": 0.0,
      "num_tokens": 18325262.0,
      "reward": 0.9783220887184143,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.975913405418396,
      "rewards/acc_reward/std": 0.03493288531899452,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 182.3125,
      "completions/mean_terminated_length": 182.3125,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.421875,
      "epoch": 0.2598870056497175,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.419962335216571e-07,
      "loss": 0.0,
      "num_tokens": 18463394.0,
      "reward": 0.9732788801193237,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9703099131584167,
      "rewards/acc_reward/std": 0.02722685970366001,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 312.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 137.0,
      "completions/mean_terminated_length": 137.0,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "entropy": 2.171875,
      "epoch": 0.2617702448210923,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0246717929840088,
      "learning_rate": 7.401129943502824e-07,
      "loss": 0.0013,
      "num_tokens": 18588066.0,
      "reward": 0.9853101968765259,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9854141473770142,
      "rewards/acc_reward/std": 0.01550329476594925,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 180.15625,
      "completions/mean_terminated_length": 174.88890075683594,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "entropy": 2.4375,
      "epoch": 0.263653483992467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.382297551789078e-07,
      "loss": 0.0,
      "num_tokens": 18720780.0,
      "reward": 0.973831057548523,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9709233641624451,
      "rewards/acc_reward/std": 0.047117821872234344,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 190.109375,
      "completions/mean_terminated_length": 179.72579956054688,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.40625,
      "epoch": 0.2655367231638418,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.36346516007533e-07,
      "loss": 0.0,
      "num_tokens": 18857011.0,
      "reward": 0.9712393283843994,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9680436849594116,
      "rewards/acc_reward/std": 0.04809395968914032,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 410.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 160.671875,
      "completions/mean_terminated_length": 160.671875,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "entropy": 2.140625,
      "epoch": 0.2674199623352166,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.3321737051010132,
      "learning_rate": 7.344632768361582e-07,
      "loss": -0.0087,
      "num_tokens": 18989918.0,
      "reward": 0.9906257390975952,
      "reward_std": 0.008838837035000324,
      "rewards/acc_reward/mean": 0.9930564165115356,
      "rewards/acc_reward/std": 0.010120646096765995,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 147.78125,
      "completions/mean_terminated_length": 147.78125,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.25,
      "epoch": 0.2693032015065913,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.1365125179290771,
      "learning_rate": 7.325800376647834e-07,
      "loss": 0.0151,
      "num_tokens": 19115760.0,
      "reward": 0.9455662965774536,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9412541389465332,
      "rewards/acc_reward/std": 0.09196340292692184,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 335.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 161.796875,
      "completions/mean_terminated_length": 161.796875,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "entropy": 2.234375,
      "epoch": 0.2711864406779661,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.306967984934086e-07,
      "loss": 0.0,
      "num_tokens": 19244779.0,
      "reward": 0.9764645099639893,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9738494753837585,
      "rewards/acc_reward/std": 0.043730027973651886,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 354.0,
      "completions/mean_length": 164.109375,
      "completions/mean_terminated_length": 158.58731079101562,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.28125,
      "epoch": 0.2730696798493409,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.4404025077819824,
      "learning_rate": 7.288135593220338e-07,
      "loss": 0.0261,
      "num_tokens": 19387506.0,
      "reward": 0.9824453592300415,
      "reward_std": 0.017121607437729836,
      "rewards/acc_reward/mean": 0.9822309613227844,
      "rewards/acc_reward/std": 0.04615851864218712,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 322.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 161.578125,
      "completions/mean_terminated_length": 161.578125,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "entropy": 2.21875,
      "epoch": 0.2749529190207156,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.269303201506592e-07,
      "loss": 0.0,
      "num_tokens": 19526271.0,
      "reward": 0.9767446517944336,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9741606712341309,
      "rewards/acc_reward/std": 0.02946525067090988,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 162.734375,
      "completions/mean_terminated_length": 162.734375,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 2.296875,
      "epoch": 0.2768361581920904,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.250470809792844e-07,
      "loss": 0.0,
      "num_tokens": 19656430.0,
      "reward": 0.9855356216430664,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9839285016059875,
      "rewards/acc_reward/std": 0.03281255066394806,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 328.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 152.265625,
      "completions/mean_terminated_length": 152.265625,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.328125,
      "epoch": 0.2787193973634652,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.231638418079096e-07,
      "loss": 0.0,
      "num_tokens": 19791039.0,
      "reward": 0.9486349821090698,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9429277777671814,
      "rewards/acc_reward/std": 0.0684126690030098,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 335.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 138.96875,
      "completions/mean_terminated_length": 138.96875,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "entropy": 2.046875,
      "epoch": 0.2806026365348399,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.212806026365348e-07,
      "loss": 0.0,
      "num_tokens": 19920157.0,
      "reward": 0.9899982213973999,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9888868927955627,
      "rewards/acc_reward/std": 0.014463546685874462,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 171.4375,
      "completions/mean_terminated_length": 166.03173828125,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "entropy": 2.1875,
      "epoch": 0.2824858757062147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.1939736346516e-07,
      "loss": 0.0,
      "num_tokens": 20054233.0,
      "reward": 0.9820305109024048,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9800339341163635,
      "rewards/acc_reward/std": 0.03505489602684975,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 353.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 154.09375,
      "completions/mean_terminated_length": 154.09375,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "entropy": 2.265625,
      "epoch": 0.2843691148775895,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.175141242937853e-07,
      "loss": 0.0,
      "num_tokens": 20177839.0,
      "reward": 0.9695570468902588,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9661744832992554,
      "rewards/acc_reward/std": 0.051941804587841034,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 159.078125,
      "completions/mean_terminated_length": 159.078125,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.21875,
      "epoch": 0.2862523540489642,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.135265588760376,
      "learning_rate": 7.156308851224105e-07,
      "loss": 0.0085,
      "num_tokens": 20313964.0,
      "reward": 0.9910129308700562,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9917504787445068,
      "rewards/acc_reward/std": 0.015448656864464283,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 176.328125,
      "completions/mean_terminated_length": 176.328125,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "entropy": 2.546875,
      "epoch": 0.288135593220339,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.137476459510358e-07,
      "loss": 0.0,
      "num_tokens": 20454113.0,
      "reward": 0.9844459295272827,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9827176928520203,
      "rewards/acc_reward/std": 0.018290938809514046,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 154.140625,
      "completions/mean_terminated_length": 154.140625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 2.265625,
      "epoch": 0.2900188323917137,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.11864406779661e-07,
      "loss": 0.0,
      "num_tokens": 20591402.0,
      "reward": 0.9614624977111816,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9571805596351624,
      "rewards/acc_reward/std": 0.06432777643203735,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 323.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 151.9375,
      "completions/mean_terminated_length": 151.9375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 2.1875,
      "epoch": 0.2919020715630885,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.099811676082862e-07,
      "loss": 0.0,
      "num_tokens": 20714630.0,
      "reward": 0.9820876717567444,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9800974130630493,
      "rewards/acc_reward/std": 0.037367500364780426,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 173.15625,
      "completions/mean_terminated_length": 167.77780151367188,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "entropy": 2.296875,
      "epoch": 0.2937853107344633,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.080979284369114e-07,
      "loss": 0.0,
      "num_tokens": 20853136.0,
      "reward": 0.9967447519302368,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9963830709457397,
      "rewards/acc_reward/std": 0.009645064361393452,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 382.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 158.5,
      "completions/mean_terminated_length": 158.5,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "entropy": 2.21875,
      "epoch": 0.295668549905838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.062146892655367e-07,
      "loss": 0.0,
      "num_tokens": 20979664.0,
      "reward": 0.9870873689651489,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9856526851654053,
      "rewards/acc_reward/std": 0.020136423408985138,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 383.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 164.1875,
      "completions/mean_terminated_length": 164.1875,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.234375,
      "epoch": 0.2975517890772128,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.043314500941619e-07,
      "loss": 0.0,
      "num_tokens": 21116156.0,
      "reward": 0.9893269538879395,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9881410598754883,
      "rewards/acc_reward/std": 0.023672664538025856,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 177.203125,
      "completions/mean_terminated_length": 166.40322875976562,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.328125,
      "epoch": 0.2994350282485876,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.024482109227871e-07,
      "loss": 0.0,
      "num_tokens": 21245321.0,
      "reward": 0.9947940111160278,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9942155480384827,
      "rewards/acc_reward/std": 0.0101578738540411,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 427.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 167.0625,
      "completions/mean_terminated_length": 167.0625,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.1875,
      "epoch": 0.3013182674199623,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.005649717514124e-07,
      "loss": 0.0,
      "num_tokens": 21376237.0,
      "reward": 0.9860028624534607,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9844475984573364,
      "rewards/acc_reward/std": 0.019196392968297005,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 169.265625,
      "completions/mean_terminated_length": 163.82540893554688,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 2.359375,
      "epoch": 0.3032015065913371,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.986817325800376e-07,
      "loss": 0.0,
      "num_tokens": 21516414.0,
      "reward": 0.9679353833198547,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9643726348876953,
      "rewards/acc_reward/std": 0.05980806425213814,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 348.0,
      "completions/max_terminated_length": 348.0,
      "completions/mean_length": 153.296875,
      "completions/mean_terminated_length": 153.296875,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.09375,
      "epoch": 0.3050847457627119,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.967984934086629e-07,
      "loss": 0.0,
      "num_tokens": 21642289.0,
      "reward": 0.9967857003211975,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9964286088943481,
      "rewards/acc_reward/std": 0.009523809887468815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 327.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 164.515625,
      "completions/mean_terminated_length": 164.515625,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "entropy": 2.171875,
      "epoch": 0.3069679849340866,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.949152542372881e-07,
      "loss": 0.0,
      "num_tokens": 21782162.0,
      "reward": 0.9368254542350769,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.929806113243103,
      "rewards/acc_reward/std": 0.0714760348200798,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 168.53125,
      "completions/mean_terminated_length": 168.53125,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 2.234375,
      "epoch": 0.3088512241054614,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.930320150659133e-07,
      "loss": 0.0,
      "num_tokens": 21911732.0,
      "reward": 0.9614138603210449,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9571264982223511,
      "rewards/acc_reward/std": 0.08462820202112198,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 172.609375,
      "completions/mean_terminated_length": 172.609375,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 2.015625,
      "epoch": 0.3107344632768362,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.911487758945385e-07,
      "loss": 0.0,
      "num_tokens": 22043643.0,
      "reward": 0.9874755144119263,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9860839247703552,
      "rewards/acc_reward/std": 0.02431458979845047,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 156.015625,
      "completions/mean_terminated_length": 156.015625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 2.234375,
      "epoch": 0.3126177024482109,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.892655367231638e-07,
      "loss": 0.0,
      "num_tokens": 22167892.0,
      "reward": 0.9494754076004028,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9438616037368774,
      "rewards/acc_reward/std": 0.05841909348964691,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 144.0,
      "completions/mean_terminated_length": 144.0,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 1.8828125,
      "epoch": 0.3145009416195857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.87382297551789e-07,
      "loss": 0.0,
      "num_tokens": 22302132.0,
      "reward": 0.9812496900558472,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9791662693023682,
      "rewards/acc_reward/std": 0.023800579831004143,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 163.0,
      "completions/mean_terminated_length": 157.4603271484375,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 2.203125,
      "epoch": 0.3163841807909605,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.854990583804144e-07,
      "loss": 0.0,
      "num_tokens": 22428188.0,
      "reward": 0.9839420318603516,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9821577668190002,
      "rewards/acc_reward/std": 0.03933866694569588,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 388.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 160.859375,
      "completions/mean_terminated_length": 160.859375,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "entropy": 2.28125,
      "epoch": 0.3182674199623352,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.836158192090396e-07,
      "loss": 0.0,
      "num_tokens": 22559187.0,
      "reward": 0.9785705804824829,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9761895537376404,
      "rewards/acc_reward/std": 0.0258196871727705,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 296.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 162.125,
      "completions/mean_terminated_length": 162.125,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "entropy": 2.015625,
      "epoch": 0.32015065913371,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.817325800376648e-07,
      "loss": 0.0,
      "num_tokens": 22686427.0,
      "reward": 0.9450536966323853,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9389485120773315,
      "rewards/acc_reward/std": 0.07760415226221085,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 136.40625,
      "completions/mean_terminated_length": 136.40625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "entropy": 1.953125,
      "epoch": 0.3220338983050847,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.7984934086629e-07,
      "loss": 0.0,
      "num_tokens": 22812501.0,
      "reward": 0.9613984823226929,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9571093916893005,
      "rewards/acc_reward/std": 0.07542510330677032,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 316.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 157.84375,
      "completions/mean_terminated_length": 157.84375,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 2.046875,
      "epoch": 0.3239171374764595,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.779661016949152e-07,
      "loss": 0.0,
      "num_tokens": 22934187.0,
      "reward": 0.9912809729576111,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9903122186660767,
      "rewards/acc_reward/std": 0.020957766100764275,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 176.359375,
      "completions/mean_terminated_length": 176.359375,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.328125,
      "epoch": 0.3258003766478343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.760828625235404e-07,
      "loss": 0.0,
      "num_tokens": 23070498.0,
      "reward": 0.9652288556098938,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9613653421401978,
      "rewards/acc_reward/std": 0.06810762733221054,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 196.28125,
      "completions/mean_terminated_length": 191.2698516845703,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "entropy": 2.171875,
      "epoch": 0.327683615819209,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.741996233521658e-07,
      "loss": 0.0,
      "num_tokens": 23202324.0,
      "reward": 0.9608478546142578,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9564976692199707,
      "rewards/acc_reward/std": 0.06687356531620026,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 150.796875,
      "completions/mean_terminated_length": 145.06350708007812,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "entropy": 1.9609375,
      "epoch": 0.3295668549905838,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.72316384180791e-07,
      "loss": 0.0,
      "num_tokens": 23333319.0,
      "reward": 0.9884651899337769,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9871835112571716,
      "rewards/acc_reward/std": 0.026243234053254128,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 420.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 165.828125,
      "completions/mean_terminated_length": 165.828125,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "entropy": 2.28125,
      "epoch": 0.3314500941619586,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.704331450094162e-07,
      "loss": 0.0,
      "num_tokens": 23460796.0,
      "reward": 0.9813232421875,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.979248046875,
      "rewards/acc_reward/std": 0.04185057431459427,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 146.734375,
      "completions/mean_terminated_length": 146.734375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "entropy": 2.09375,
      "epoch": 0.3333333333333333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.685499058380414e-07,
      "loss": 0.0,
      "num_tokens": 23584811.0,
      "reward": 0.9937499761581421,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9930555820465088,
      "rewards/acc_reward/std": 0.01851852796971798,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 436.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 161.828125,
      "completions/mean_terminated_length": 161.828125,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 2.34375,
      "epoch": 0.3352165725047081,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0041779279708862,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.0029,
      "num_tokens": 23723072.0,
      "reward": 0.9907628893852234,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9914726614952087,
      "rewards/acc_reward/std": 0.014887169934809208,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 170.140625,
      "completions/mean_terminated_length": 170.140625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 2.296875,
      "epoch": 0.3370998116760829,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.647834274952919e-07,
      "loss": 0.0,
      "num_tokens": 23852745.0,
      "reward": 0.9886301755905151,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9873669147491455,
      "rewards/acc_reward/std": 0.02594558708369732,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 159.09375,
      "completions/mean_terminated_length": 159.09375,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "entropy": 2.203125,
      "epoch": 0.3389830508474576,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.629001883239171e-07,
      "loss": 0.0,
      "num_tokens": 23976431.0,
      "reward": 0.9778439402580261,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9753821492195129,
      "rewards/acc_reward/std": 0.02938609942793846,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 366.0,
      "completions/mean_length": 155.328125,
      "completions/mean_terminated_length": 149.66668701171875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "entropy": 2.265625,
      "epoch": 0.3408662900188324,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.610169491525423e-07,
      "loss": 0.0,
      "num_tokens": 24103236.0,
      "reward": 0.9857558012008667,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9841731190681458,
      "rewards/acc_reward/std": 0.02304108813405037,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 157.25,
      "completions/mean_terminated_length": 157.25,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "entropy": 1.921875,
      "epoch": 0.3427495291902072,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.591337099811676e-07,
      "loss": 0.0,
      "num_tokens": 24240244.0,
      "reward": 0.9847475290298462,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9830528497695923,
      "rewards/acc_reward/std": 0.018334772437810898,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 353.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 153.484375,
      "completions/mean_terminated_length": 153.484375,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "entropy": 2.09375,
      "epoch": 0.3446327683615819,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.572504708097928e-07,
      "loss": 0.0,
      "num_tokens": 24373251.0,
      "reward": 0.9889024496078491,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9876693487167358,
      "rewards/acc_reward/std": 0.014938557520508766,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 173.65625,
      "completions/mean_terminated_length": 168.2857208251953,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 2.34375,
      "epoch": 0.3465160075329567,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.55367231638418e-07,
      "loss": 0.0,
      "num_tokens": 24507229.0,
      "reward": 0.9813482165336609,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9792758226394653,
      "rewards/acc_reward/std": 0.03865106776356697,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 350.0,
      "completions/max_terminated_length": 350.0,
      "completions/mean_length": 165.078125,
      "completions/mean_terminated_length": 165.078125,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "entropy": 2.203125,
      "epoch": 0.3483992467043315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.534839924670433e-07,
      "loss": 0.0,
      "num_tokens": 24646554.0,
      "reward": 0.9556671380996704,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9507412910461426,
      "rewards/acc_reward/std": 0.07888054847717285,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 154.4375,
      "completions/mean_terminated_length": 154.4375,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 1.984375,
      "epoch": 0.3502824858757062,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.516007532956685e-07,
      "loss": 0.0,
      "num_tokens": 24785782.0,
      "reward": 0.9671032428741455,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9634479880332947,
      "rewards/acc_reward/std": 0.06446754932403564,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 309.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 140.21875,
      "completions/mean_terminated_length": 140.21875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "entropy": 2.09375,
      "epoch": 0.352165725047081,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.060726284980774,
      "learning_rate": 6.497175141242937e-07,
      "loss": 0.0088,
      "num_tokens": 24903940.0,
      "reward": 0.932281494140625,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9264933466911316,
      "rewards/acc_reward/std": 0.08563832193613052,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 150.15625,
      "completions/mean_terminated_length": 144.4127197265625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "entropy": 2.015625,
      "epoch": 0.3540489642184557,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.478342749529189e-07,
      "loss": 0.0,
      "num_tokens": 25035214.0,
      "reward": 0.9913280010223389,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9903644323348999,
      "rewards/acc_reward/std": 0.01791100949048996,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 493.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 156.28125,
      "completions/mean_terminated_length": 156.28125,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "entropy": 2.109375,
      "epoch": 0.3559322033898305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.459510357815442e-07,
      "loss": 0.0,
      "num_tokens": 25168800.0,
      "reward": 0.9506161212921143,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9451290369033813,
      "rewards/acc_reward/std": 0.07419686019420624,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 319.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 138.859375,
      "completions/mean_terminated_length": 138.859375,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "entropy": 1.8359375,
      "epoch": 0.3578154425612053,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.440677966101694e-07,
      "loss": 0.0,
      "num_tokens": 25302271.0,
      "reward": 0.9889830350875854,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9877589344978333,
      "rewards/acc_reward/std": 0.013018240220844746,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 250.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 143.65625,
      "completions/mean_terminated_length": 143.65625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "entropy": 2.203125,
      "epoch": 0.35969868173258,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.421845574387948e-07,
      "loss": 0.0,
      "num_tokens": 25422569.0,
      "reward": 0.9856148958206177,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9840165376663208,
      "rewards/acc_reward/std": 0.020176060497760773,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 172.125,
      "completions/mean_terminated_length": 166.73016357421875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "entropy": 2.015625,
      "epoch": 0.3615819209039548,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.4030131826742e-07,
      "loss": 0.0,
      "num_tokens": 25560809.0,
      "reward": 0.9829681515693665,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9810757637023926,
      "rewards/acc_reward/std": 0.0207473486661911,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 359.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 150.859375,
      "completions/mean_terminated_length": 150.859375,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 1.9453125,
      "epoch": 0.3634651600753296,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.384180790960452e-07,
      "loss": 0.0,
      "num_tokens": 25691888.0,
      "reward": 0.959905743598938,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9554507732391357,
      "rewards/acc_reward/std": 0.07276061922311783,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 164.3125,
      "completions/mean_terminated_length": 164.3125,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "entropy": 2.171875,
      "epoch": 0.3653483992467043,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.365348399246704e-07,
      "loss": 0.0,
      "num_tokens": 25819652.0,
      "reward": 0.9704650044441223,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9671833515167236,
      "rewards/acc_reward/std": 0.04358522593975067,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 288.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 145.09375,
      "completions/mean_terminated_length": 145.09375,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "entropy": 2.15625,
      "epoch": 0.3672316384180791,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.346516007532956e-07,
      "loss": 0.0,
      "num_tokens": 25950602.0,
      "reward": 0.9926788806915283,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9918653964996338,
      "rewards/acc_reward/std": 0.011418163776397705,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 405.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 138.3125,
      "completions/mean_terminated_length": 138.3125,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "entropy": 1.953125,
      "epoch": 0.3691148775894539,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.32768361581921e-07,
      "loss": 0.0,
      "num_tokens": 26070558.0,
      "reward": 0.9680017232894897,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9644463658332825,
      "rewards/acc_reward/std": 0.051819488406181335,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 164.3125,
      "completions/mean_terminated_length": 164.3125,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "entropy": 2.109375,
      "epoch": 0.3709981167608286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.308851224105462e-07,
      "loss": 0.0,
      "num_tokens": 26211378.0,
      "reward": 0.992339015007019,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9914877414703369,
      "rewards/acc_reward/std": 0.012202701531350613,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 150.03125,
      "completions/mean_terminated_length": 150.03125,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "entropy": 2.078125,
      "epoch": 0.3728813559322034,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.290018832391714e-07,
      "loss": 0.0,
      "num_tokens": 26341204.0,
      "reward": 0.9656549692153931,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9618387818336487,
      "rewards/acc_reward/std": 0.03533172979950905,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 144.90625,
      "completions/mean_terminated_length": 144.90625,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "entropy": 2.046875,
      "epoch": 0.3747645951035782,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.271186440677966e-07,
      "loss": 0.0,
      "num_tokens": 26460142.0,
      "reward": 0.9934231042861938,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9926923513412476,
      "rewards/acc_reward/std": 0.0151524618268013,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 340.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 156.90625,
      "completions/mean_terminated_length": 156.90625,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "entropy": 2.0625,
      "epoch": 0.3766478342749529,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.252354048964218e-07,
      "loss": 0.0,
      "num_tokens": 26586088.0,
      "reward": 0.9820280075073242,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9800311326980591,
      "rewards/acc_reward/std": 0.018364734947681427,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 143.34375,
      "completions/mean_terminated_length": 143.34375,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 2.125,
      "epoch": 0.3785310734463277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.23352165725047e-07,
      "loss": 0.0,
      "num_tokens": 26717886.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 1.0,
      "rewards/acc_reward/std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 409.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 175.90625,
      "completions/mean_terminated_length": 175.90625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 2.140625,
      "epoch": 0.3804143126177024,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.214689265536723e-07,
      "loss": 0.0,
      "num_tokens": 26854168.0,
      "reward": 0.9935948252677917,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9928831458091736,
      "rewards/acc_reward/std": 0.0077277072705328465,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 396.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 165.25,
      "completions/mean_terminated_length": 165.25,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.171875,
      "epoch": 0.3822975517890772,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.195856873822976e-07,
      "loss": 0.0,
      "num_tokens": 26989488.0,
      "reward": 0.9852421283721924,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9836024045944214,
      "rewards/acc_reward/std": 0.01928338035941124,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 261.0,
      "completions/mean_length": 154.296875,
      "completions/mean_terminated_length": 148.61904907226562,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 2.1875,
      "epoch": 0.384180790960452,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.177024482109228e-07,
      "loss": 0.0,
      "num_tokens": 27112987.0,
      "reward": 0.9929645657539368,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9921828508377075,
      "rewards/acc_reward/std": 0.02084571123123169,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 147.5,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "entropy": 2.125,
      "epoch": 0.3860640301318267,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.15819209039548e-07,
      "loss": 0.0,
      "num_tokens": 27235691.0,
      "reward": 0.9827622175216675,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9808468818664551,
      "rewards/acc_reward/std": 0.027835894376039505,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 156.53125,
      "completions/mean_terminated_length": 156.53125,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "entropy": 2.09375,
      "epoch": 0.3879472693032015,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.139359698681732e-07,
      "loss": 0.0,
      "num_tokens": 27372173.0,
      "reward": 0.9731494784355164,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9701660871505737,
      "rewards/acc_reward/std": 0.020826132968068123,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 409.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 164.34375,
      "completions/mean_terminated_length": 164.34375,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "entropy": 2.09375,
      "epoch": 0.3898305084745763,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.120527306967984e-07,
      "loss": 0.0,
      "num_tokens": 27499555.0,
      "reward": 0.9984294176101685,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9982548952102661,
      "rewards/acc_reward/std": 0.0046535334549844265,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 416.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 157.484375,
      "completions/mean_terminated_length": 157.484375,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 2.25,
      "epoch": 0.391713747645951,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.101694915254237e-07,
      "loss": 0.0,
      "num_tokens": 27636098.0,
      "reward": 0.9711763262748718,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9679737091064453,
      "rewards/acc_reward/std": 0.029890142381191254,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 408.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 174.828125,
      "completions/mean_terminated_length": 174.828125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 2.046875,
      "epoch": 0.3935969868173258,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.4382655620574951,
      "learning_rate": 6.082862523540489e-07,
      "loss": -0.0223,
      "num_tokens": 27777591.0,
      "reward": 0.9556211829185486,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9524263143539429,
      "rewards/acc_reward/std": 0.04751761257648468,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 139.09375,
      "completions/mean_terminated_length": 139.09375,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 2.15625,
      "epoch": 0.3954802259887006,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.064030131826741e-07,
      "loss": 0.0,
      "num_tokens": 27906397.0,
      "reward": 0.9859374761581421,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.984375,
      "rewards/acc_reward/std": 0.0416666679084301,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 363.0,
      "completions/max_terminated_length": 363.0,
      "completions/mean_length": 144.625,
      "completions/mean_terminated_length": 144.625,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "entropy": 1.8984375,
      "epoch": 0.3973634651600753,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.045197740112994e-07,
      "loss": 0.0,
      "num_tokens": 28038757.0,
      "reward": 0.9920339584350586,
      "reward_std": 0.0,
      "rewards/acc_reward/mean": 0.9911487698554993,
      "rewards/acc_reward/std": 0.013452098704874516,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 455.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 171.578125,
      "completions/mean_terminated_length": 171.578125,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "entropy": 2.1875,
      "epoch": 0.3992467043314501,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.4742763042449951,
      "learning_rate": 6.026365348399246e-07,
      "loss": 0.0537,
      "num_tokens": 28163002.0,
      "reward": 0.9887884259223938,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9892787933349609,
      "rewards/acc_reward/std": 0.023635946214199066,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 381.0,
      "completions/max_terminated_length": 381.0,
      "completions/mean_length": 135.84375,
      "completions/mean_terminated_length": 135.84375,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 1.921875,
      "epoch": 0.4011299435028249,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.6700330972671509,
      "learning_rate": 6.007532956685499e-07,
      "loss": -0.0233,
      "num_tokens": 28279440.0,
      "reward": 0.9760647416114807,
      "reward_std": 0.004419418517500162,
      "rewards/acc_reward/mean": 0.9751413464546204,
      "rewards/acc_reward/std": 0.03155434504151344,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 213
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 531,
  "num_input_tokens_seen": 28279440,
  "num_train_epochs": 1,
  "save_steps": 213,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}